├── LICENCE ├── README.md ├── crontab ├── doc ├── emissary2.png ├── emissary3.png ├── emissary4.png └── emissary5.png ├── emissary ├── __init__.py ├── client.py ├── config.py ├── controllers │ ├── __init__.py │ ├── cron.py │ ├── fetch.py │ ├── load.py │ ├── log.py │ ├── manager.py │ ├── parser.py │ ├── scripts.py │ ├── tui.py │ └── utils.py ├── models.py ├── repl.py ├── resources │ ├── __init__.py │ ├── api_key.py │ ├── articles.py │ ├── feedgroups.py │ └── feeds.py └── run.py ├── scripts └── hello.py └── setup.py /LICENCE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person 2 | obtaining a copy of this software and associated documentation 3 | files (the "Software"), to deal in the Software without 4 | restriction, including without limitation the rights to use, 5 | copy, modify, merge, publish, distribute, sublicense, and/or sell 6 | copies of the Software, and to permit persons to whom the 7 | Software is furnished to do so, subject to the following 8 | conditions: 9 | 10 | The above copyright notice and this permission notice shall be 11 | included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 14 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 15 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 16 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 17 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 | OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Emissary 2 | ======== 3 | 4 | An intelligence utility / test for researchers, programmers and generally carnivorous primates who want personally curated news archives. 5 | Emissary is a web content extractor that has a RESTful API and the ability to run pre-store scripts. 6 | Emissary stores the full text of linked articles from RSS feeds or URLs containing links. 7 | 8 | Documentation lives [here](http://docs.psybernetics.org/). 9 | 10 | -------- 11 | ![Alt text](doc/emissary4.png?raw=true "ncurses Client") 12 | ![Alt text](doc/emissary3.png?raw=true "Feed Groups") 13 | ![Alt text](doc/emissary2.png?raw=true "Articles") 14 |
15 | 
16 | Installation requires the Python interpreter headers and the development headers for libevent, libxml2 and libxslt.
17 | Optional article compression requires libsnappy. 
18 | All of these can be obtained on debian-based systems with:
19 | sudo apt-get install -y zlib1g-dev libxml2-dev libxslt1-dev python-dev libevent-dev libsnappy-dev
20 | 
21 | You're then ready to install the package for all users:
22 | sudo python setup.py install
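If you'd rather not install system-wide, a virtualenv works too (a sketch, assuming virtualenv is already installed):
virtualenv env && . env/bin/activate
python setup.py install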
23 | 
24 | 
25 |  Usage: python -m emissary.run 
26 | 
27 |   -h, --help            show this help message and exit
28 |   -c, --crontab         Crontab to parse
29 |   --config              (defaults to emissary.config)
30 |   -a, --address         (defaults to 0.0.0.0)
31 |   -p, --port            (defaults to 6362)
32 |   --export              Write the existing database as a crontab
33 |   --key                 SSL key file
34 |   --cert                SSL certificate
35 |   --pidfile             (defaults to ./emissary.pid)
36 |   --logfile             (defaults to ./emissary.log)
37 |   --stop                
38 |   --debug               Log to stdout
39 |   -d                    Run in the background
40 |   --run-as              (defaults to the invoking user)
41 |   --scripts-dir         (defaults to ./scripts/)
42 | 
43 | 
44 | Some initial setup has to be done before the system will start.
45 | Communication with Emissary is mainly done over HTTPS connections
46 | and for that you're going to need an SSL certificate and a key:
47 | 
48 | user@host $ openssl genrsa 4096 > key
49 | user@host $ openssl req -new -x509 -nodes -sha256 -days 365 -key key > cert
50 | 
51 | To keep your API keys from ever ending up in version control for all
52 | the world to see, you need to put a database URI into the environment:
53 | 
54 | export EMISSARY_DATABASE="sqlite://///home/you/.emissary.db"
55 | 
56 | Protip: Put that last line in your shell's rc file.
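For example, with bash (a sketch; adjust the path to taste):
echo 'export EMISSARY_DATABASE="sqlite://///home/you/.emissary.db"' >> ~/.bashrc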
57 | 
58 | Start an instance in the foreground to obtain your first API key:
59 | 
60 | user@host $ python -m emissary.run --cert cert --key key
61 | 14/06/2015 16:31:30 - Emissary - INFO - Starting Emissary 2.0.0.
62 | e5a59e0a-b457-45c6-9d30-d983419c43e1
63 | ^That UUID is your Primary API key. Add it to this example crontab:
64 | 
65 | user@host $ cat feeds.txt
66 | apikey: your-api-key-here
67 | 
68 | # url                                                 name            group            minute  hour    day     month   weekday
69 | http://news.ycombinator.com/rss                       "HN"            "HN"             */15    *       *       *       *
70 | http://phys.org/rss-feed/                             "Phys.org"      "Phys.org"       1       12      *       *       *
71 | http://feeds.nature.com/news/rss/most_recent          "Nature"        "Nature"         30      13      *       *       *
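The last five fields follow standard cron order (minute, hour, day, month, weekday).
Going by the parser in emissary/controllers/cron.py, `*/15` means every fifteen minutes
and a trailing `!` (as in the bundled crontab's `20!`) is shorthand for the same step
syntax; comma lists, `a-b` ranges and named ranges like `mon-fri` or `jan-jun` are also
accepted. A hypothetical line using these forms:

http://example.com/rss                                "Example"       "Examples"       */10    9-17    *       *       mon-fri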
72 | 
73 | user@host $ python -m emissary.run -c feeds.txt
74 | Using API key "Primary".
75 | Primary: Creating feed group HN.
76 | Primary: HN: Creating feed "HN"
77 | 
78 | Emissary supports multiple apikey directives in one crontab.
79 | Subsequent feed definitions are associated with the most recently declared key, as sketched below.
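A sketch with placeholder keys and feed URLs:

apikey: first-key-here
http://example.com/alpha.rss                          "Alpha"         "Alpha"          */30    *       *       *       *

apikey: second-key-here
http://example.com/beta.rss                           "Beta"          "Beta"           0       6       *       *       *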
80 | 
81 | Start an instance in the background and connect to it:
82 | user@host $ python -m emissary.run -d --cert cert --key key
83 | user@host $ python -m emissary.repl
84 | Emissary 2.0.0
85 | Psybernetics 2015
86 | 
87 | (3,204) > help
88 | 
89 | 
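The REST interface is also available programmatically through emissary.client.Client,
which the bundled ncurses client drives internally. A minimal sketch, assuming an
instance on the default port with the self-signed certificate generated above
(hence verify=False):

```python
from emissary.client import Client

# Requests return (json, status_code) tuples from .get()/.post()/.put()/.delete().
c = Client("your-api-key-here", "https://localhost:6362/v1/", verify=False)

articles, status = c.get("articles")
if status == 200:
    for article in articles["data"]:
        print article["title"]
```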
90 | 91 | If the prospect of creating an NSA profile of your reading habits is 92 | something that rightfully bothers you then my advice is to subscribe 93 | to many things and then use Emissary to read the things that really 94 | interest you. 95 | 96 | ![Alt text](doc/emissary5.png?raw=true "ncurses programmatic access") 97 | -------------------------------------------------------------------------------- /crontab: -------------------------------------------------------------------------------- 1 | apikey: your-api-key-here 2 | 3 | # url name group minute hour day month weekday 4 | http://news.ycombinator.com/rss "HN" "HN" 20! * * * * 5 | http://mf.feeds.reuters.com/reuters/UKdomesticNews "Reuters UK" "Reuters" 0 3! * * * 6 | -------------------------------------------------------------------------------- /doc/emissary2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary2.png -------------------------------------------------------------------------------- /doc/emissary3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary3.png -------------------------------------------------------------------------------- /doc/emissary4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary4.png -------------------------------------------------------------------------------- /doc/emissary5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary5.png -------------------------------------------------------------------------------- /emissary/__init__.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # 3 | # The structure of this package is essentially as follows 4 | # 5 | # models.py Our abstractions for the types of data we persist to a database, 6 | # including how to represent columns and joins on other tables as singular 7 | # JSON documents. Handy for building list comprehensions of models. 8 | # resources/ RESTful API endpoints for interacting with models over HTTP 9 | # controllers/ Miscellaneous utilities used throughout the whole project 10 | # run.py A runner program that inserts a database schema if none is present, 11 | # binds to a network interface and changes UID if asked. 12 | # repl.py An interactive read-eval-print loop for working with the REST interface. 13 | # config.py Defines how to obtain a database URI. 14 | """ 15 | A democracy thing for researchers, programmers and news junkies who want personally curated news archives. 16 | Emissary is a web content extractor that has a RESTful API and a scripting system. 17 | Emissary stores the full text of linked articles from RSS feeds or URLs containing links. 
18 | """ 19 | 20 | from pkgutil import extend_path 21 | __path__ = extend_path(__path__, __name__) 22 | __all__ = ["client", "controllers", "models", "resources", "run", "repl"] 23 | 24 | import time 25 | from flask import Flask 26 | from flask.ext import restful 27 | from flask.ext.sqlalchemy import SQLAlchemy 28 | from multiprocessing import Queue, cpu_count 29 | from sqlalchemy.engine.reflection import Inspector 30 | 31 | app = Flask("emissary") 32 | 33 | # This config is the default and can be overridden by 34 | # using options.config in run.py (python -m emissary.run -c somefile.py) 35 | app.config.from_object("emissary.config") 36 | 37 | app.version = "2.1.1" 38 | app.inbox = Queue() 39 | app.scripts = None 40 | app.feedmanager = None 41 | app.config["HTTP_BASIC_AUTH_REALM"] = "Emissary " + app.version 42 | 43 | 44 | # These are response queues that enable the main thread of execution to 45 | # share data with the REST interface. Mainly for reporting the status of crontabs. 46 | app.queues = [] 47 | for i in range(cpu_count() * 2): 48 | q = Queue() 49 | q.access = time.time() 50 | app.queues.append(q) 51 | 52 | db = SQLAlchemy(app) 53 | api = restful.Api(app, prefix='/v1') 54 | 55 | def init(): 56 | # Models are imported here to prevent a circular import where we would 57 | # import models and the models would import that db object just above us. 58 | 59 | # They're also imported here in this function because they implicitly 60 | # monkey-patch the threading module, and we might not need that if all we want 61 | # from the namespace is something like app.version, like in repl.py for example. 62 | from models import APIKey 63 | from models import FeedGroup 64 | from models import Feed 65 | from models import Article 66 | from models import Event 67 | 68 | from resources import api_key 69 | from resources import feeds 70 | from resources import feedgroups 71 | from resources import articles 72 | 73 | api.add_resource(api_key.KeyCollection, "/keys") 74 | api.add_resource(api_key.KeyResource, "/keys/") 75 | 76 | api.add_resource(feedgroups.FeedGroupCollection, "/feeds") 77 | api.add_resource(feedgroups.FeedGroupResource, "/feeds/") 78 | api.add_resource(feedgroups.FeedGroupStop, "/feeds//stop") 79 | api.add_resource(feedgroups.FeedGroupStart, "/feeds//start") 80 | api.add_resource(feedgroups.FeedGroupArticles, "/feeds//articles") 81 | api.add_resource(feedgroups.FeedGroupSearch, "/feeds//search/") 82 | api.add_resource(feedgroups.FeedGroupCount, "/feeds//count") 83 | 84 | api.add_resource(feeds.FeedResource, "/feeds//") 85 | api.add_resource(feeds.FeedArticleCollection, "/feeds///articles") 86 | api.add_resource(feeds.FeedSearch, "/feeds///search/") 87 | api.add_resource(feeds.FeedStartResource, "/feeds///start") 88 | api.add_resource(feeds.FeedStopResource, "/feeds///stop") 89 | 90 | api.add_resource(articles.ArticleCollection, "/articles") 91 | api.add_resource(articles.ArticleResource, "/articles/") 92 | api.add_resource(articles.ArticleSearch, "/articles/search/") 93 | api.add_resource(articles.ArticleCount, "/articles/count") 94 | 95 | # Create the database schema if it's not already laid out. 
96 | inspector = Inspector.from_engine(db.engine) 97 | tables = [table_name for table_name in inspector.get_table_names()] 98 | 99 | if 'api_keys' not in tables: 100 | db.create_all() 101 | master = models.APIKey(name = app.config['MASTER_KEY_NAME']) 102 | if app.config['MASTER_KEY']: master.key = app.config['MASTER_KEY'] 103 | else: master.key = master.generate_key_str() 104 | print master.key 105 | master.active = True 106 | db.session.add(master) 107 | db.session.commit() 108 | -------------------------------------------------------------------------------- /emissary/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pprint 3 | import json 4 | import cmd 5 | import os 6 | os.environ['no_proxy'] = '127.0.0.1,localhost' 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | class Client(object): 10 | def __init__(self, key, base_url, verify=True, timeout=2.500): 11 | self.key = key 12 | self.base = base_url 13 | pp = pprint.PrettyPrinter(indent=4) 14 | self.p = pp.pprint 15 | self.verify_https = verify 16 | self.timeout = timeout 17 | 18 | # Defining a username manually on your client objects will 19 | # permit you to use the .can() shortcut for determining 20 | # the username's access rights. 21 | self.username = None 22 | 23 | if not self.base.endswith('/'): 24 | self.base += '/' 25 | 26 | def _send_request(self, url, type='GET', body={}, headers={}): 27 | headers['Authorization'] = "Basic %s" % self.key 28 | url = self.base+url 29 | resp = None 30 | if type=='GET': 31 | resp = requests.get(url, verify=self.verify_https, 32 | headers=headers, timeout=self.timeout) 33 | elif type=='DELETE': 34 | resp = requests.delete(url, verify=self.verify_https, 35 | data=body, headers=headers, timeout=self.timeout) 36 | elif type=='PUT': 37 | resp = requests.put(url, verify=self.verify_https, 38 | data=body, headers=headers, timeout=self.timeout) 39 | elif type=='POST': 40 | resp = requests.post(url, verify=self.verify_https, 41 | data=body, headers=headers, timeout=self.timeout) 42 | try: return resp.json(), resp.status_code 43 | except: return {}, resp.status_code 44 | 45 | def get(self, url, body={}, headers={}): 46 | return self._send_request(url, body=body, headers=headers) 47 | 48 | def put(self, url, body={}, headers={}): 49 | return self._send_request(url, type='PUT', body=body, headers=headers) 50 | 51 | def post(self, url, body={}, headers={}): 52 | return self._send_request(url, type='POST', body=body, headers=headers) 53 | 54 | def delete(self, url, body={}, headers={}): 55 | return self._send_request(url, type='DELETE', body=body, headers=headers) 56 | 57 | def pp(self, url, type='GET', body={}, headers={}): 58 | self.p(self._send_request(url, type, body, headers)) 59 | 60 | def keys(self, type='GET', body={}, headers={}): 61 | return self._send_request("keys", type, body, headers) 62 | 63 | def __repr__(self): 64 | return "" % self.base 65 | -------------------------------------------------------------------------------- /emissary/config.py: -------------------------------------------------------------------------------- 1 | import os, getpass 2 | if not 'EMISSARY_DATABASE' in os.environ: 3 | print 'You need to export a URI for EMISSARY_DATABASE' 4 | print 'Eg: export EMISSARY_DATABASE="sqlite://///home/%s/.emissary.db"' % getpass.getuser() 5 | raise SystemExit 6 | else: 7 | SQLALCHEMY_DATABASE_URI = ( 8 | os.environ['EMISSARY_DATABASE'] 9 | ) 10 | 11 | MASTER_KEY = None 12 | MASTER_KEY_NAME = "Primary" 13 | 
PERMIT_NEW = False 14 | GZIP_HERE = True 15 | COMPRESS_ARTICLES = True 16 | ENABLE_CORS = False 17 | if "NO_DUPLICATE_TITLES" in os.environ: 18 | NO_DUPLICATE_TITLES = os.environ['DUPLICATE_TITLES'] 19 | else: 20 | NO_DUPLICATE_TITLES = True 21 | -------------------------------------------------------------------------------- /emissary/controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/emissary/controllers/__init__.py -------------------------------------------------------------------------------- /emissary/controllers/cron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # From http://stackoverflow.com/questions/373335/suggestions-for-a-cron-like-scheduler-in-python 3 | import gevent 4 | import time, sys 5 | from datetime import datetime, timedelta 6 | 7 | class CronError(Exception): 8 | def __init__(self, message): 9 | self.message = message 10 | def __str__(self): 11 | return repr(self.message) 12 | 13 | class days: 14 | mon = 0 15 | tue = 1 16 | wed = 2 17 | thu = 3 18 | fri = 4 19 | sat = 5 20 | sun = 6 21 | 22 | class months: 23 | jan = 1 24 | feb = 2 25 | mar = 3 26 | apr = 4 27 | may = 5 28 | jun = 6 29 | jul = 7 30 | aug = 8 31 | sep = 9 32 | oct = 10 33 | nov = 11 34 | dec = 12 35 | 36 | # Turn a list of timing data into raw numeric values 37 | def parse_timings(timings): 38 | # minute hour day month weekday 39 | # 0 6,12 * 0-11 mon-sun 40 | # Currently contains off by one errors. 41 | if type(timings) == str: 42 | timings = timings.split() 43 | if len(timings) != 5: 44 | print len(timings), timings 45 | raise CronError('Timings require five fields.') 46 | minute = hour = day = month = weekday = [] 47 | if timings[0] == '*': minute = allMatch # range(0,60) 48 | if timings[1] == '*': hour = allMatch # range(0,24) 49 | if timings[2] == '*': day = allMatch # range(0,32) 50 | if timings[3] == '*': month = allMatch # range(0,12) 51 | if timings[4] == '*': weekday = allMatch # range(0,7) 52 | for i, v in enumerate(timings): 53 | if len(v) < 3: 54 | try: 55 | r = int(v) 56 | if i == 0: minute = [r] 57 | if i == 1: hour = [r] 58 | if i == 2: day = [r] 59 | if i == 3: month = [r] 60 | if i == 4: weekday = [r] 61 | except: 62 | pass 63 | if ',' in v: # TODO: Incorporate lists of days and months. 
64 | t = v.split(',') 65 | x=[] 66 | for f in t: 67 | x.append(int(f)) 68 | if i == 0: minute = x 69 | if i == 1: hour = x 70 | if i == 2: day = x 71 | if i == 3: month = x 72 | if i == 4: weekday = x 73 | del t,f,x 74 | if v.endswith("!") or v.startswith("*/"): 75 | s = "" 76 | for j in v: 77 | if j.isdigit(): 78 | s += j 79 | s = int(s) 80 | if i == 0: minute = range(0,60,s) 81 | if i == 1: hour = range(0,24,s) 82 | if i == 2: day = range(0,32,s) 83 | if i == 3: month = range(0,12,s) 84 | if i == 4: weekday = range(0,7,s) 85 | if '-' in v and len(v) > 2: 86 | r = v.split('-') 87 | for n,m in enumerate(r): 88 | try: 89 | r[n] = int(m) 90 | except: 91 | pass 92 | if type(r[n]) == int: 93 | if i == 0: minute = range(r[0],int(r[1])+1) 94 | if i == 1: hour = range(r[0],int(r[1])+1) 95 | if i == 2: day = range(r[0],int(r[1])+1) 96 | if i == 3: month = range(r[0],int(r[1])+1) 97 | if i == 4: weekday = range(r[0],int(r[1])+1) 98 | continue 99 | else: 100 | start = stop = None 101 | if i == 3: # Months 102 | if hasattr(months,r[0]): 103 | start = getattr(months,r[0]) 104 | if hasattr(months,r[1]): 105 | stop = getattr(months,r[1]) 106 | if (start and stop) != None: 107 | month = range(start,stop+1) 108 | del start, stop 109 | else: 110 | raise CronError('Malformed month data.') 111 | if i == 4: # Weekdays 112 | if hasattr(days,r[0]): 113 | start = getattr(days,r[0]) 114 | if hasattr(days,r[1]): 115 | stop = getattr(days,r[1]) 116 | if (start and stop) != None: 117 | weekday = range(start,stop+1) 118 | del start, stop 119 | else: 120 | raise CronError('Malformed day-of-the-week data.') 121 | del v,i,r,n,m, 122 | return minute, hour, day, month, weekday 123 | 124 | def parse_crontab_line(line,lineno=None,tcpd=False): 125 | url=line.split()[0] 126 | f=line.split()[1:] 127 | for i,w in enumerate(f): 128 | if w.endswith("'"): break 129 | name = ' '.join(f[:i+1]).strip("'") 130 | timings = ' '.join(f[i+1:]) # Minutes Hour Day Month Weekday 131 | parse_timings(timings) 132 | if not tcpd: 133 | if lineno: 134 | print "Line %s. 
%s: %s %s" % (lineno,name,url,timings) 135 | else: 136 | print "%s: %s %s" % (name,url,timings) 137 | return (url,name,timings) 138 | 139 | # Some utility classes / functions first 140 | class AllMatch(set): 141 | """Universal set - match everything""" 142 | def __contains__(self, item): return True 143 | 144 | allMatch = AllMatch() 145 | 146 | def conv_to_set(obj): # Allow single integer to be provided 147 | if isinstance(obj, (int,long)): 148 | return set([obj]) # Single item 149 | if not isinstance(obj, set): 150 | obj = set(obj) 151 | return obj 152 | 153 | class Event(object): 154 | def __init__(self, action, min=allMatch, hour=allMatch, 155 | day=allMatch, month=allMatch, dow=allMatch, 156 | args=(), kwargs={}): 157 | self.mins = conv_to_set(min) 158 | self.hours= conv_to_set(hour) 159 | self.days = conv_to_set(day) 160 | self.months = conv_to_set(month) 161 | self.dow = conv_to_set(dow) 162 | self.action = action 163 | self.args = args 164 | self.kwargs = kwargs 165 | self.running = False 166 | self.name = None 167 | 168 | def matchtime(self, t): 169 | """Return True if this event should trigger at the specified datetime""" 170 | return ((t.minute in self.mins) and 171 | (t.hour in self.hours) and 172 | (t.day in self.days) and 173 | (t.month in self.months) and 174 | (t.weekday() in self.dow)) 175 | 176 | def check(self, t): 177 | if self.matchtime(t): 178 | self.running = True 179 | self.action(*self.args, **self.kwargs) 180 | self.running = False 181 | 182 | class CronTab(gevent.Greenlet): 183 | def __init__(self, *events): 184 | self.events = events 185 | self.name = None 186 | gevent.Greenlet.__init__(self) 187 | 188 | def _run(self): 189 | t=datetime(*datetime.now().timetuple()[:5]) 190 | while 1: 191 | for e in self.events: 192 | # print zip([i for i in dir(self)], [getattr(self,i) for i in dir(self)]) 193 | if self.inbox: # This .get() blocks, preventing duplicate greenlets running 194 | msg = self.inbox.get() # in the same addr due to our use of multiprocessing.Process 195 | e.check(t) 196 | t += timedelta(minutes=1) 197 | n = datetime.now() 198 | while n < t: 199 | s = (t - n).seconds + 1 200 | time.sleep(s) 201 | n = datetime.now() 202 | 203 | def __repr__(self): 204 | if self.name: 205 | return "" % (self.name, hex(id(self))) 206 | else: 207 | return "" % hex(id(self)) 208 | 209 | def parse_crontab(db,log): 210 | table = db['feeds'] 211 | 212 | crontab = sys.stdin.read() 213 | feedlines={} 214 | 215 | for index, line in enumerate(crontab.split('\n')): 216 | if line.startswith('http'): 217 | index+=1 218 | feedlines['%s' % index] = line 219 | elif (line.startswith('#')) or (line == ''): continue 220 | else: print Utils.parse_option(line,config) 221 | 222 | for lineno, feedline in feedlines.items(): 223 | url=name=timings=None 224 | try: 225 | (url,name,timings) = Cron.parse_crontab_line(feedline,lineno) 226 | except EmissaryError, e: 227 | print e 228 | 229 | if url and name and timings: 230 | # Check URL isn't already loaded 231 | feed = Feed.Feed(db,log,url=url) 232 | if 'name' in feed.feed.keys(): 233 | if name != feed['name'] or timings != feed['timings']: 234 | feed.adjust(name,timings) 235 | sys.stdout.write("Adjusted %s: %s\n" % (name,feed.feed)) 236 | else: 237 | sys.stdout.write('Adding %s\n' % name) 238 | feed = Feed.Feed(db,log).create(name,url,timings) 239 | 240 | raise SystemExit 241 | 242 | #if __name__ == '__main__': 243 | # c = CronTab(Event(lambda x: print "Hello", range(0,59), range(0,23), dow=range(0,5))) 244 | # c.run() 245 | 246 | 
-------------------------------------------------------------------------------- /emissary/controllers/fetch.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urlparse 3 | import requests 4 | import feedparser 5 | from emissary import app, db 6 | from sqlalchemy import and_, or_ 7 | from emissary.models import Article 8 | from emissary.controllers import parser 9 | from emissary.controllers.utils import uid, tconv 10 | requests.packages.urllib3.disable_warnings() 11 | 12 | snappy = None 13 | if app.config['COMPRESS_ARTICLES']: 14 | try: 15 | import snappy 16 | except ImportError: 17 | pass 18 | 19 | 20 | # This is a little globally-available (as far as coroutines calling this are concerned) 21 | # dictionary of urls we've already visited. It permits us to only try a url 22 | # four times every half an hour. If we see it again after half an hour we'll 23 | # try it again, otherwise it stays in the seen dictionary. It also needs periodically 24 | # emptying, lest it grow infinitely. 25 | seen = {} 26 | 27 | def get(url): 28 | headers = {"User-Agent": "Emissary "+ app.version} 29 | return requests.get(url, headers=headers, verify=False) 30 | 31 | # Fetch a feed.url, parse the links, visit the links and store articles. 32 | def fetch_feed(feed, log): 33 | 34 | if feed.group: 35 | log("%s: %s: Fetching %s." % \ 36 | (feed.key.name, feed.group.name, feed.name)) 37 | else: 38 | log("%s: Fetching %s." % (feed.key.name, feed.name)) 39 | try: 40 | r = get(feed.url) 41 | except Exception, e: 42 | log("%s: %s: Error fetching %s: %s" % \ 43 | (feed.key.name, feed.group.name, feed.name, e.message[0])) 44 | return 45 | 46 | # Fetch the links and create articles 47 | links = parser.extract_links(r) 48 | title = None 49 | for link in links: 50 | # try: 51 | fetch_and_store(link, feed, log) 52 | # except Exception, e: 53 | # log("%s: %s: Error with %s: %s" % \ 54 | # (feed.key.name, feed.name, link, e.message), "error") 55 | 56 | def fetch_and_store(link, feed, log, key=None, overwrite=False): 57 | """ 58 | Fetches, extracts and stores a URL. 59 | link can be a list of urls or a dictionary of url/title pairs. 60 | """ 61 | then = int(time.time()) 62 | # If the feed was XML data then we probably have a dictionary of 63 | # url:title pairs, otherwise we have a list of urls. 64 | if type(link) == dict: 65 | for url, title in link.items(): continue 66 | else: 67 | url = link 68 | title = None 69 | 70 | # Skip this url if we've already extracted and stored it for this feed, unless we're overwriting. 71 | if Article.query.filter(and_(Article.url == url, Article.feed == feed)).first(): 72 | if overwrite: 73 | log("%s: %s/%s: Preparing to overwrite existing copy of %s" % \ 74 | (feed.key.name, feed.group.name, feed.name, url), "debug") 75 | else: 76 | log("%s: %s/%s: Already storing %s" % (feed.key.name, feed.group.name, feed.name, url), "debug") 77 | return 78 | 79 | # Fix links with no schema 80 | if not "://" in url: 81 | url = "http://" + url 82 | 83 | # Store our awareness of this url during this run in a globally available dictionary, 84 | # in the form [counter, timestamp]. 85 | if url not in seen: 86 | seen[url] = [1, int(time.time())] 87 | else: 88 | # If we haven't modified the counter for half an hour, reset it. 89 | now = int(time.time()) 90 | if (now - seen[url][1]) > 60*30: 91 | seen[url] = [1, int(time.time())] 92 | # If we have tried this URL four times, disregard it. 93 | # We might reset its counter in half an hour anyway. 
94 | if seen[url][0] >= 4: 95 | return 96 | # Otherwise increment and continue with storing. 97 | seen[url][0] += 1 98 | seen[url][1] = int(time.time()) 99 | 100 | # Prune seen URLs older than a day. 101 | for _ in seen.copy(): 102 | if int(time.time()) - seen[_][1] > 86400: 103 | del seen[_] 104 | 105 | try: 106 | document = get(url) 107 | except Exception, e: 108 | log("%s: %s/%s: Error fetching %s: %s" % \ 109 | (feed.key.name, feed.group.name, feed.name, url, e.message[0])) 110 | return 111 | 112 | # Mimetype detection. 113 | if 'content-type' in document.headers: 114 | if 'application' in document.headers['content-type']: 115 | if not title: 116 | title = url 117 | article = Article( 118 | url=url, 119 | title=title, 120 | ) 121 | if not "://" in article.url: 122 | article.url = "http://" + article.url 123 | commit_to_feed(feed, article) 124 | log("%s: %s/%s: Stored %s, reference to %s (%s)" % \ 125 | (feed.key.name, feed.group.name, feed.name, article.uid, url, document.headers['content-type'])) 126 | return 127 | 128 | # Document parsing. 129 | try: 130 | article_content = parser.extract_body(document.text) 131 | summary = parser.summarise(article_content) 132 | except Exception, e: 133 | log("%s: %s: Error parsing %s: %s" % (feed.key.name, feed.group.name, url, e.message)) 134 | return 135 | 136 | # Ensure a title and disregard dupes 137 | if not title: 138 | title = parser.extract_title(document.text) 139 | 140 | if app.config['NO_DUPLICATE_TITLES']: 141 | if Article.query.filter( 142 | and_(Article.title == title, Article.key == feed.key) 143 | ).first(): 144 | return 145 | 146 | # Initial article object 147 | article = Article( 148 | url=url, 149 | title=title, 150 | summary=summary 151 | ) 152 | 153 | # Determine whether to store the full content or a compressed copy 154 | if not app.config['COMPRESS_ARTICLES']: 155 | article.content=article_content 156 | else: 157 | article.ccontent = snappy.compress(article_content.encode("utf-8", "ignore")) 158 | article.compressed = True 159 | 160 | # 161 | # We execute scripts before committing articles to the database 162 | # it runs the risk of a singular script halting the entire thing 163 | # in return we get to modify articles (ie machine translation) before storing. 164 | 165 | # Non-blocking IO will result in the most reliable performance within your scripts. 166 | # 167 | for s in app.scripts.scripts.values(): 168 | try: 169 | s.execute(env={'article':article, 'feed':feed}) 170 | article = s['article'] 171 | except Exception, e: 172 | log("Error executing %s: %s" % (s.file, e.message), "error") 173 | 174 | commit_to_feed(feed, article) 175 | 176 | now = int(time.time()) 177 | duration = tconv(now-then) 178 | log('%s: %s/%s: Stored %s "%s" (%s)' % \ 179 | (feed.key.name, feed.group.name, feed.name, article.uid, article.title, duration)) 180 | del then, now, duration, feed, article, url, title 181 | return 182 | 183 | def fetch_feedless_article(key, url, overwrite=False): 184 | """ 185 | Given a URL, create an Article and attach it to a Key. 186 | """ 187 | then = int(time.time()) 188 | log = app.log 189 | 190 | if Article.query.filter(Article.url == url).first(): 191 | if overwrite: 192 | log("%s: Preparing to overwrite existing copy of %s" % (key.name,url), "debug") 193 | else: 194 | log("%s: Already storing %s" % (key.name, url), "debug") 195 | return 196 | 197 | try: 198 | response = get(url) 199 | except Exception, e: 200 | log("%s: Error fetching %s: %s." 
% (key.name, url, e.message)) 201 | return 202 | 203 | article_content = parser.extract_body(response.text) 204 | title = parser.extract_title(response.text) 205 | summary = parser.summarise(article_content) 206 | article = Article( 207 | url=url, 208 | title=title, 209 | summary=summary 210 | ) 211 | 212 | if not app.config['COMPRESS_ARTICLES']: 213 | article.content = article_content 214 | else: 215 | article.ccontent = snappy.compress(article_content.encode("utf-8", "ignore")) 216 | article.compress = True 217 | 218 | for s in app.scripts.scripts.values(): 219 | try: 220 | s.execute(env={'article':article, 'feed':None}) 221 | article = s['article'] 222 | except Exception, e: 223 | log("Error executing %s: %s" % (s.file, e.message), "error") 224 | 225 | key.articles.append(article) 226 | 227 | article.uid = uid() 228 | 229 | db.session.add(article) 230 | db.session.add(key) 231 | db.session.commit() 232 | 233 | now = int(time.time()) 234 | duration = tconv(now-then) 235 | log('%s: Stored %s "%s" (%s)' % (key.name, article.uid, article.title, duration)) 236 | return article 237 | 238 | def commit_to_feed(feed, article): 239 | """ 240 | Place a new article on the api key of a feed, the feed itself, 241 | and commit changes. 242 | """ 243 | 244 | # We give articles UIDs manually to ensure unique time data is used. 245 | article.uid = uid() 246 | 247 | session = feed._sa_instance_state.session 248 | feed.articles.append(article) 249 | feed.key.articles.append(article) 250 | 251 | session.add(article) 252 | session.add(feed) 253 | session.commit() 254 | del article, feed, session 255 | -------------------------------------------------------------------------------- /emissary/controllers/load.py: -------------------------------------------------------------------------------- 1 | # This file contains functions designed for 2 | # loading cron tables and storing new feeds. 3 | 4 | from emissary import db 5 | from sqlalchemy import and_ 6 | from emissary.controllers.utils import spaceparse 7 | from emissary.controllers.cron import parse_timings 8 | from emissary.models import APIKey, Feed, FeedGroup 9 | 10 | def create_feed(log, db, key, group, feed): 11 | """ 12 | Takes a key object, a group name and a dictionary 13 | describing a feed ({name:,url:,schedule:,active:}) 14 | and reliably attaches a newly created feed to the key 15 | and group. 16 | """ 17 | if not type(feed) == dict: 18 | log('Unexpected type when creating feed for API key "%s"' % key.name) 19 | return 20 | 21 | for i in ['name', 'schedule', 'active', 'url']: 22 | if not i in feed.keys(): 23 | log('%s: Error creating feed. Missing "%s" field from feed definition.' % (key.name, i)) 24 | return 25 | 26 | f = Feed.query.filter(and_(Feed.key == key, Feed.name == feed['name'])).first() 27 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == group)).first() 28 | 29 | if f: 30 | if f.group: 31 | log('%s: Error creating feed "%s" in group "%s", feed already exists in group "%s".' % \ 32 | (key.name, feed['name'], group, f.group.name)) 33 | return 34 | elif fg: 35 | log('%s: %s: Adding feed "%s"' % (key.name, fg.name, f.name)) 36 | fg.append(f) 37 | db.session.add(fg) 38 | db.session.add(f) 39 | db.session.commit() 40 | return 41 | 42 | if not fg: 43 | log('%s: Creating feed group %s.' 
% (key.name, group)) 44 | fg = FeedGroup(name=group) 45 | key.feedgroups.append(fg) 46 | 47 | try: 48 | parse_timings(feed['schedule']) 49 | except Exception, e: 50 | log('%s: %s: Error creating "%s": %s' % \ 51 | (key.name, fg.name, feed['name'], e.message)) 52 | 53 | log('%s: %s: Creating feed "%s"' % (key.name, fg.name, feed['name'])) 54 | f = Feed( 55 | name=feed['name'], 56 | url=feed['url'], 57 | active=feed['active'], 58 | schedule=feed['schedule'] 59 | ) 60 | fg.feeds.append(f) 61 | key.feeds.append(f) 62 | db.session.add(key) 63 | db.session.add(fg) 64 | db.session.add(f) 65 | db.session.commit() 66 | 67 | def parse_crontab(filename): 68 | """ 69 | Get a file descriptor on filename and 70 | create feeds and groups for API keys therein. 71 | """ 72 | def log(message): 73 | print message 74 | # read filename into a string named crontab 75 | try: 76 | fd = open(filename, "r") 77 | except OSError: 78 | print "Error opening %s" % filename 79 | raise SystemExit 80 | crontab = fd.read() 81 | fd.close() 82 | 83 | # keep a resident api key on hand 84 | key = None 85 | 86 | for i, line in enumerate(crontab.split('\n')): 87 | 88 | # Set the APIKey we're working with when we find a line starting 89 | # with apikey: 90 | if line.startswith("apikey:"): 91 | if ' ' in line: 92 | key_str = line.split()[1] 93 | key = APIKey.query.filter(APIKey.key == key_str).first() 94 | if not key: 95 | print 'Malformed or unknown API key at line %i in %s: %s' % (i+1, filename, line) 96 | raise SystemExit 97 | else: 98 | print 'Using API key "%s".' % key.name 99 | 100 | if line.startswith("http"): 101 | feed = {'active': True} 102 | 103 | # Grab the URL and set the string to the remainder 104 | feed['url'] = line.split().pop(0) 105 | line = ' '.join(line.split()[1:]) 106 | 107 | # Grab names and groups 108 | names = spaceparse(line) 109 | if not names: 110 | print "Error parsing feed or group name at line %i in %s: %s" % (i+1, filename, line) 111 | continue 112 | feed['name'], group = names[:2] 113 | 114 | # The schedule should be the last five items 115 | schedule = line.split()[-5:] 116 | try: 117 | parse_timings(schedule) 118 | except Exception, e: 119 | print "Error parsing schedule at line %i in %s: %s" % (i+1, filename, e.message) 120 | continue 121 | 122 | feed['schedule'] = ' '.join(schedule) 123 | 124 | create_feed(log, db, key, group, feed) 125 | -------------------------------------------------------------------------------- /emissary/controllers/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file provides a generic logging class. 3 | It could do with automatic file rotation and syslog support. 4 | 5 | Luke Brooks 2015 6 | MIT License. 
7 | """ 8 | import logging, time 9 | 10 | class Log(object): 11 | def __init__(self, program, log_file=None, log_stdout=False): 12 | self.program = program 13 | self.log = None 14 | self.debug = False 15 | 16 | if log_file or log_stdout: 17 | formatter = logging.Formatter( 18 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%d/%m/%Y %H:%M:%S' 19 | ) 20 | self.log = logging.getLogger(program) 21 | self.log.setLevel(logging.DEBUG) 22 | 23 | if log_stdout: 24 | ch = logging.StreamHandler() 25 | ch.setLevel(logging.DEBUG) 26 | ch.setFormatter(formatter) 27 | self.log.addHandler(ch) 28 | 29 | if log_file: 30 | ch = logging.FileHandler(log_file, 'a') 31 | ch.setLevel(logging.DEBUG) 32 | ch.setFormatter(formatter) 33 | self.log.addHandler(ch) 34 | 35 | def __call__(self, data, level='info'): 36 | if self.log: 37 | if level == 'debug': level = 10 38 | if level == 'info': level = 20 39 | if level == 'warning': level = 30 40 | if level == 'error': level = 40 41 | if level == 'critical': level = 50 42 | 43 | if (level > 15) or (self.debug): 44 | self.log.log(level,data) 45 | -------------------------------------------------------------------------------- /emissary/controllers/manager.py: -------------------------------------------------------------------------------- 1 | from gevent.queue import Queue 2 | import sys, os, time, pwd, optparse, gevent, hashlib 3 | 4 | from sqlalchemy import and_ 5 | from emissary.models import Feed, FeedGroup, APIKey 6 | from emissary.controllers import cron 7 | from emissary.controllers import fetch 8 | 9 | class EmissaryError(Exception): 10 | def __init__(self, message): 11 | self.message = message 12 | def __str__(self): 13 | return repr(self.message) 14 | 15 | class FeedManager(object): 16 | """Keeps CronTab objects in rotation""" 17 | def __init__(self, log): 18 | self.log = log 19 | self.app = None 20 | self.running = False 21 | self.crontabs = {} 22 | self.threads = [] 23 | self.revived = {} # {name: [amt, time]} 24 | 25 | def load_feeds(self): 26 | """ 27 | Currently just starts all feeds flat, by checking if they and their 28 | FeedGroup are active. 29 | 30 | 31 | TODO: Start feeds by API key. Where each CronTab corresponds to a FeedGroup. 32 | """ 33 | for key in APIKey.query.all(): 34 | 35 | if key.reader: 36 | continue 37 | 38 | if not key.active: 39 | self.log('API key "%s" marked inactive. Skipped.' % (key.name)) 40 | continue 41 | 42 | self.log("%s: Processing feed groups." % key.name) 43 | for fg in key.feedgroups: 44 | 45 | if not fg.active: 46 | self.log('%s: Feed group "%s" marked inactive. Skipped.' % \ 47 | (key.name, fg.name)) 48 | continue 49 | 50 | for feed in fg.feeds: 51 | if not feed.active: 52 | self.log('%s:%s: Feed "%s" marked inactive. Skipped.' % \ 53 | (key.name, fg.name, feed.name)) 54 | continue 55 | 56 | self.log('%s: %s: Scheduling "%s" (%s)' % \ 57 | (key.name, fg.name, feed.name, feed.schedule)) 58 | 59 | ct = self.create_crontab(feed) 60 | g = gevent.spawn(ct.run) 61 | g.name = ct.name 62 | self.threads.append(g) 63 | name = self.generate_ct_name(feed) 64 | self.crontabs[name] = ct 65 | 66 | def run(self): 67 | """ 68 | Receive inbox messages and revive feeds. 69 | Also block duplicate crontab execution..... 70 | 71 | The reason we do this is due to a quirk of 72 | using Gevent with multiprocessing.Process. 73 | 74 | It's why obtaining the article count in the REPL prompt 75 | takes a second, but the tradeoff is that Emissary won't 76 | overutilise your CPU in this loop. 
77 | 78 | If you run a greenlet in a subprocess we end up with 79 | CronTab greenlets executing twice but in the same address space... 80 | So I've settled on this solution for now after investigating GIPC, 81 | which works with Flask's built in httpd, but that's not as nimble 82 | as gevent.WSGIServer. 83 | """ 84 | self.running = True 85 | while self.running: 86 | while not self.app.inbox.empty(): 87 | self.receive(self.app.inbox.get(block=False)) 88 | # Run feeds 89 | gevent.sleep() 90 | for ct in self.crontabs.values(): 91 | if ct.inbox.empty(): 92 | ct.inbox.put("ping") 93 | # Check if revive needed 94 | self.revive(ct) 95 | for i in self.threads: 96 | if i.started == False: 97 | self.threads.remove(i) 98 | # the sleep for 50ms keeps cpu utilisation low 99 | gevent.sleep() 100 | time.sleep(0.05) 101 | self.log("Cleaning up..") 102 | 103 | def create_crontab(self, feed): 104 | t = cron.parse_timings(feed.schedule.split()) 105 | evt = cron.Event( # One possible design for these crontabs 106 | fetch.fetch_feed, # is to have them correspond to a FeedGroup 107 | t[0], t[1], t[2], t[3], t[4],# where each event is a member feed 108 | [feed, self.log]) # and stopping the crontab stops the group. 109 | evt.feed = feed 110 | ct = cron.CronTab(evt) 111 | ct.name = self.generate_ct_name(feed) 112 | ct.inbox = Queue() 113 | return ct 114 | 115 | def generate_ct_name(self, feed): 116 | """ 117 | Generate a crontab name from a feed object that's 118 | hopefully unique between multiple feeds in multiple groups 119 | on multiple API keys. 120 | 121 | Determining the feed.key.key string here proved to be too expensive, 122 | so instead it's trusted that the name and creation time are unique enough. 123 | 124 | Improvements to this implementation are most welcome. 125 | """ 126 | return hashlib.sha1("%s %s" % (feed.name, feed.created)).hexdigest() 127 | 128 | def revive(self, ct): 129 | """ 130 | Restart a dead crontab. 131 | Permit a ceiling amount of restarts. 132 | Only restart a feed once per minute. 133 | """ 134 | if ct.name in self.revived: 135 | now = time.time() 136 | then = self.revived[ct.name][1] 137 | if (now - then) < 60: 138 | return 139 | self.revived[ct.name][0] += 1 140 | self.revived[ct.name][1] = now 141 | else: 142 | self.revived[ct.name] = [1, time.time()] 143 | 144 | if ct.started == False: 145 | feed = ct.events[0].feed 146 | ct = self.create_crontab(feed) 147 | self[ct.name] = ct 148 | gevent.spawn(ct.run) 149 | # if feed.name in self.crontabs.keys(): 150 | # self.log("Restarting %s" % ct.name, "warning") 151 | 152 | # name = self.generate_ct_name(feed) 153 | # self.crontabs[name] = ct 154 | # self.log(self.crontabs) 155 | 156 | def receive(self, payload): 157 | """ 158 | The Feed manager is an actor with an inbox that responds to commands 159 | issued by the HTTPD process. We accept a list containing a queue ID 160 | a command name that corresponds to FeedManager.handle_ and 161 | arguments, even if it's just a None. 
162 | """ 163 | if len(payload) < 3 or type(payload) != list: return 164 | qid, command, args = payload 165 | func = getattr(self, "handle_" + command, None) 166 | # Execute on messages with a Queue ID of zero without emitting a response 167 | if func and not qid: return(func(args)) 168 | # Otherwise, use response queues based on access times 169 | elif func: 170 | # We do a double comparison here in order to sort the queue out of the loop 171 | q = [q for q in self.app.queues if hex(id(q)) == qid] 172 | if not q: 173 | self.log("Couldn't find response queue at %s." % id) 174 | return 175 | q=q[0] 176 | # Put our response on the queue and rotate its priority. 177 | try: 178 | q.put(func(args)) 179 | except Exception,e: 180 | self.app.log(e.message,'warning') 181 | q.access = time.time() 182 | self.app.queues.sort(key=lambda q: q.access, reverse=True) 183 | return 184 | return 185 | 186 | def handle_check(self, feed): 187 | """ 188 | Return whether we have a feed running or not. 189 | """ 190 | name = self.generate_ct_name(feed) 191 | if name in self.crontabs and self.crontabs[name].started: 192 | return True 193 | return False 194 | 195 | def handle_start(self, args): 196 | """ 197 | Schedule a feed. 198 | 199 | We look the feed up here because for some reason freshly 200 | created ones aren't great at journeying over IPC queues. 201 | """ 202 | key, name = args 203 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first() 204 | if not feed: return 205 | 206 | self.app.log('%s: %s: Scheduling "%s" (%s)' % \ 207 | (key.name, feed.group.name, feed.name, feed.schedule)) 208 | ct = self.create_crontab(feed) 209 | self.crontabs[ct.name] = ct 210 | g = gevent.spawn(ct.run) 211 | g.name = ct.name 212 | self.threads.append(g) 213 | return True 214 | 215 | def handle_stop(self, args): 216 | """ 217 | Halt a feed. 218 | 219 | We can't look the feed up from the database here because we may have 220 | already deleted it from our records, so instead we iterate through 221 | all of our green threads until something sticks. 222 | """ 223 | key, name = args 224 | 225 | for id, ct in self.crontabs.items(): 226 | feed = ct.events[0].feed 227 | if feed.name == name and feed.key.key == key.key: 228 | if self.app.debug: 229 | self.app.log('%s: %s: Unscheduling "%s". [thread %s]' % \ 230 | (key.name, feed.group.name, feed.name, id)) 231 | else: 232 | self.app.log('%s: %s: Unscheduling "%s".' 
% \ 233 | (key.name, feed.group.name, feed.name)) 234 | for t in self.threads: 235 | if t.name == id: 236 | gevent.kill(t) 237 | break 238 | self.threads.remove(t) 239 | del ct 240 | del self.crontabs[id] 241 | return True 242 | return False 243 | 244 | def __setitem__(self, name, crontab): 245 | if name in self.crontabs.keys(): 246 | if crontab.name: 247 | self.log("Restarting %s" % crontab.name, "warning") 248 | else: 249 | self.log("Restarting %s" % name, "warning") 250 | crontab.name = name 251 | self.crontabs[name] = crontab 252 | gevent.spawn(crontab) 253 | 254 | def __getitem__(self, name): 255 | if name in self.crontabs.keys(): 256 | return self.crontabs[name] 257 | else: 258 | raise KeyError('Invalid CronTab') 259 | 260 | def __delitem__(self, name): 261 | """Halt crontab, delete""" 262 | if name in self.crontabs.keys(): 263 | self.crontabs[name].kill() 264 | del self.crontabs[name] 265 | 266 | def keys(self): 267 | return self.crontabs.keys() 268 | -------------------------------------------------------------------------------- /emissary/controllers/parser.py: -------------------------------------------------------------------------------- 1 | # This file implements routines for extracting links from response objects. 2 | import re 3 | import lxml 4 | import urlparse 5 | import feedparser 6 | # We have sought to disperse power, to set men and women free. 7 | # That really means: to help them to discover that they are free. 8 | # Everybody's free. The slave is free. 9 | # The ultimate weapon isn't this plague out in Vegas, or any new super H-bomb. 10 | # The ultimate weapon has always existed. Every man, every woman, and every child owns it. 11 | # It's the ability to say No and take the consequences. 12 | # 'Fear is failure.' 'The fear of death is the beginning of slavery.' 13 | # "Thou hast no right but to do thy will.' 14 | # The goose can break the bottle at any second. 15 | # Socrates took the hemlock to prove it. 16 | # Jesus went to the cross to prove it. 17 | # It's in all history, all myth, all poetry. 18 | # It's right out in the open all the time." 19 | from goose import Goose 20 | 21 | def extract_links(response): 22 | urls = [] 23 | if ('content-type' in response.headers.keys()) and ('xml' in response.headers['content-type']): 24 | f = feedparser.parse(response.text) 25 | for entry in f.entries: 26 | urls.append({entry.link: entry.title}) 27 | del f 28 | else: # The following is a highly experimental feature. 29 | url = urlparse.urlparse(response.url) 30 | url = url.scheme + "://" + url.netloc 31 | p = Parser(response.text, url=url) 32 | urls = p.parse() 33 | del url, p 34 | return urls 35 | 36 | class Parser(object): 37 | """ 38 | Build a list of relevant links from an HTML string and the root URL. 39 | 40 | p = Parser(html_text, root_url) 41 | urls = p.parse() 42 | """ 43 | def __init__(self,html=None,doc=None,url=None): 44 | self.html=html 45 | self.doc=doc 46 | try: self.url = urlparse.urlparse(url).netloc 47 | except: self.url = url 48 | self.links=[] 49 | 50 | def root_to_urls(self, doc, titles): 51 | """ 52 | Return a list of urls from an lxml root. 
53 | """ 54 | if doc is None: 55 | return [] 56 | 57 | a_tags = doc.xpath('//a') 58 | # tries to find titles of link elements via tag text 59 | if titles: 60 | return [ (a.get('href'), a.text) for a in a_tags if a.get('href') ] 61 | return [ a.get('href') for a in a_tags if a.get('href') ] 62 | 63 | def get_urls(self,_input=None,titles=False,regex=False): 64 | if (not _input) and (not self.html): return [] 65 | if not _input: _input = self.html 66 | if regex: 67 | text = re.sub('<[^<]+?>', ' ', _input) 68 | text = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', _input) 69 | text = [i.strip() for i in _input] 70 | return _input or [] 71 | if isinstance(_input, str) or isinstance(_input, unicode): 72 | doc = self.fromstring(_input) 73 | else: 74 | doc = text 75 | return self.root_to_urls(doc, titles) 76 | 77 | def fromstring(self, html): 78 | try: 79 | self.doc = lxml.html.fromstring(html) 80 | except Exception, e: 81 | return None 82 | return self.doc 83 | 84 | def parse(self,html=None,url=None): 85 | """ 86 | Whittle a list of urls into things we're interested in. 87 | """ 88 | if self.links: self.links=[] 89 | urls = self.get_urls(html) 90 | if not urls: return urls 91 | else: urls = set(urls) 92 | if url: url = "http://%s/" % urlparse.urlparse(url).netloc 93 | for u in urls: 94 | if url: 95 | if u == url: continue 96 | if self.url: 97 | if u == self.url: continue 98 | if u.startswith('#'): continue 99 | if not u.startswith('http'): 100 | if url: 101 | if (url[-1] == '/') and (u[0] == '/'): u = url + u[1:] 102 | else: u = url+u 103 | elif self.url: 104 | if (self.url[-1] == '/') and (u[0] == '/'): u = self.url + u[1:] 105 | else: u = self.url+u 106 | else: continue 107 | self.links.append(u) 108 | return self.links 109 | 110 | def extract_body(html): 111 | """ 112 | Extract the body text of a web page 113 | """ 114 | g = Goose({'enable_image_fetching':False}) 115 | article = g.extract(raw_html=html) 116 | del g 117 | return article.cleaned_text 118 | 119 | def extract_title(html): 120 | """ 121 | Extract the body title of a web page 122 | """ 123 | g = Goose({'enable_image_fetching':False}) 124 | article = g.extract(raw_html=html) 125 | del g 126 | return article.title 127 | 128 | 129 | def summarise(article): 130 | stopnum = c = 0 131 | for i,v in enumerate(article.split()): 132 | if v.endswith('.'): 133 | if c >= 2: 134 | stopnum = i+1 135 | break 136 | else: 137 | c += 1 138 | return ' '.join(article.split()[:stopnum]) 139 | 140 | -------------------------------------------------------------------------------- /emissary/controllers/scripts.py: -------------------------------------------------------------------------------- 1 | #! _*_ coding: utf-8 _*_ 2 | # This file provides scripting capabilities 3 | import os 4 | from emissary import app 5 | from emissary.controllers.utils import sha1sum 6 | 7 | class Scripts(object): 8 | 9 | def __init__(self, dir): 10 | self.dir = None 11 | self.scripts = {} 12 | 13 | dir = os.path.abspath(dir) 14 | if not os.path.isdir(dir): 15 | app.log("%s isn't a valid system path." 
% dir, "error") 16 | return 17 | 18 | self.dir = dir 19 | 20 | def reload(self, *args): # args caught for SIGHUP handler 21 | 22 | if self.dir: 23 | if self.scripts: 24 | app.log("Reloading scripts.") 25 | for file in os.listdir(self.dir): 26 | self.unload(file) 27 | self.load(file) 28 | 29 | def load(self, file): 30 | 31 | file = os.path.abspath(os.path.join(self.dir, file)) 32 | 33 | for script in self.scripts.values(): 34 | if script.file == file: return 35 | 36 | if os.path.isfile(file): 37 | self.scripts[file] = Script(file) 38 | app.log("Loaded %s" % file) 39 | 40 | def unload(self, file): 41 | file = os.path.abspath(os.path.join(self.dir, file)) 42 | 43 | if file in self.scripts: 44 | del self.scripts[file] 45 | 46 | class Script(object): 47 | """ 48 | Represents the execution environment for a third-party script. 49 | We send custom values into the environment and work with whatever's left. 50 | Scripts can also call any methods on objects put in their environment. 51 | """ 52 | def __init__(self, file=None, env={}): 53 | self.read_on_exec = app.debug 54 | self.file = file 55 | self.env = env 56 | self.script = '' 57 | self.code = None 58 | self.hash = None 59 | self.cache = { 60 | 'app': app 61 | } 62 | 63 | def execute(self, env={}): 64 | if not self.code or self.read_on_exec: self.compile() 65 | if env: self.env = env 66 | self.env['cache'] = self.cache 67 | exec self.code in self.env 68 | del self.env['__builtins__'] 69 | if 'cache' in self.env.keys(): 70 | self.cache = self.env['cache'] 71 | return (self.env) 72 | 73 | def compile(self, script=''): 74 | if self.file: 75 | f = file(self.file, 'r') 76 | self.script = f.read() 77 | f.close() 78 | elif script: 79 | self.script = script 80 | if self.script: 81 | hash = sha1sum(self.script) 82 | if self.hash != hash: 83 | self.hash = hash 84 | self.code = compile(self.script, '', 'exec') 85 | self.script = '' 86 | 87 | def __getitem__(self, key): 88 | if key in self.env.keys(): 89 | return (self.env[key]) 90 | else: 91 | raise (KeyError(key)) 92 | 93 | def keys(self): 94 | return self.env.keys() 95 | -------------------------------------------------------------------------------- /emissary/controllers/tui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from emissary.controllers.utils import tconv 4 | from window import Window, Pane, ALIGN_LEFT, EXPAND, palette 5 | 6 | class EmissaryMenu(Pane): 7 | """ 8 | Defines a menu where items call local methods. 9 | """ 10 | geometry = [EXPAND, EXPAND] 11 | # Default and selection colours. 
12 | col = [-1, -1] # fg, bg 13 | sel = [-1, "blue"] 14 | items = [] 15 | 16 | def update(self): 17 | for i, item in enumerate(self.items): 18 | if item[0]: 19 | colours = palette(self.sel[0], self.sel[1]) 20 | else: 21 | colours = palette(self.col[0], self.col[1]) 22 | text = ' ' + item[1] 23 | spaces = ' ' * (self.width - len(text)) 24 | text += spaces 25 | self.change_content(i, text + '\n', ALIGN_LEFT, colours) 26 | 27 | def process_input(self, character): 28 | # Handle the return key and the right arrow key 29 | if character == 10 or character == 13 or character == 261: 30 | for i, item in enumerate(self.items): 31 | if item[0]: 32 | func = getattr(self, item[2].lower(), None) 33 | if func: 34 | func() 35 | 36 | # Handle navigating the menu 37 | elif character in [259, 258, 339, 338]: 38 | for i, item in enumerate(self.items): 39 | if item[0]: 40 | if character == 259: # up arrow 41 | if i == 0: break 42 | item[0] = 0 43 | self.items[i-1][0] = 1 44 | break 45 | if character == 258: # down arrow 46 | if i+1 >= len(self.items): break 47 | item[0] = 0 48 | self.items[i+1][0] = 1 49 | break 50 | if character == 339: # page up 51 | item[0] = 0 52 | self.items[0][0] = 1 53 | break 54 | if character == 338: # page down 55 | item[0] = 0 56 | self.items[-1][0] = 1 57 | break 58 | 59 | class FeedGroups(EmissaryMenu): 60 | geometry = [EXPAND, EXPAND] 61 | def update(self): 62 | if not self.items: 63 | (res, status) = self.window.c.get("feeds") 64 | 65 | 66 | class Feeds(EmissaryMenu): 67 | geometry = [EXPAND, EXPAND] 68 | items = [] 69 | 70 | 71 | class Articles(Pane): 72 | """ 73 | items for Articles are [1, "text", "uid"] 74 | """ 75 | geometry = [EXPAND, EXPAND] 76 | items = [] 77 | col = [-1, -1] # fg, bg 78 | sel = ["black", "white"] 79 | avail = ["black", "green"] 80 | 81 | def update(self): 82 | if not self.items: 83 | self.fetch_items() 84 | 85 | for i, item in enumerate(self.items): 86 | if item[0]: 87 | if item[3]: 88 | colours = palette(self.avail[0], self.avail[1]) 89 | else: 90 | colours = palette(self.sel[0], self.sel[1]) 91 | else: 92 | colours = palette(self.col[0], self.col[1]) 93 | text = ' ' + item[1] 94 | spaces = ' ' * (self.width - len(text)) 95 | text += spaces 96 | self.change_content(i, text + '\n', ALIGN_LEFT, colours) 97 | 98 | def process_input(self, character): 99 | # Handle the return key and the right arrow key 100 | if character in [10, 13, 261]: 101 | for i, item in enumerate(self.items): 102 | if item[0]: 103 | uid = item[2] 104 | (article, status) = self.window.c.get('articles/' + uid) 105 | statuspane = self.window.get("status") 106 | 107 | if status != 200: 108 | statuspane.status = str(status) 109 | else: 110 | self.reader.article = article 111 | if article['content'] == None: 112 | self.reader.data = "" 113 | else: 114 | self.reader.data = article['content'].encode("ascii", "ignore") 115 | self.reader.active = True 116 | self.active = False 117 | 118 | elif character == 114: # r to refresh 119 | self.fetch_items() 120 | 121 | elif character == 9: # tab to reader 122 | reader = self.window.get("reader") 123 | reader.active = True 124 | self.active = False 125 | 126 | # Handle navigating the menu 127 | elif character in [259, 258, 339, 338]: 128 | for i, item in enumerate(self.items): 129 | if item[0]: 130 | if character == 259: # up arrow 131 | if i == 0: break 132 | item[0] = 0 133 | self.items[i-1][0] = 1 134 | break 135 | if character == 258: # down arrow 136 | if i+1 >= len(self.items): break 137 | item[0] = 0 138 | self.items[i+1][0] = 1 139 | break 140 
| if character == 339: # page up 141 | item[0] = 0 142 | self.items[0][0] = 1 143 | break 144 | if character == 338: # page down 145 | item[0] = 0 146 | self.items[-1][0] = 1 147 | break 148 | 149 | def fetch_items(self): 150 | (res, status) = self.window.c.get("articles?per_page=%i" % self.height) 151 | if status == 200: 152 | self.fill_menu(res) 153 | else: 154 | status = self.window.get("status") 155 | status.status = str(res) 156 | 157 | def fill_menu(self, res): 158 | self.items = [] 159 | self.content = [] 160 | for r in res["data"]: 161 | self.items.append([0, r['title'].encode("ascii", "ignore"), r['uid'], r['content_available']]) 162 | if self.items: 163 | self.items[0][0] = 1 164 | 165 | class Reader(Pane): 166 | """ 167 | Defines a scrolling pager for long multi-line strings. 168 | """ 169 | geometry = [EXPAND, EXPAND] 170 | data = "" 171 | outbuffer = "" 172 | position = 0 173 | article = None 174 | 175 | def update(self): 176 | if self.article: 177 | feed = self.article.get('feed', None) 178 | heading = "%s\n%s (%s %s ago)\n%s\n\n" % \ 179 | (self.article['title'].encode("ascii","ignore"), feed if feed else "", 180 | self.article['uid'], tconv(int(time.time()) - int(self.article['created'])), 181 | self.article['url']) 182 | self.change_content(0, heading) 183 | self.outbuffer = self.data.split('\n')[self.position:] 184 | self.change_content(1, '\n'.join(self.outbuffer)) 185 | 186 | def process_input(self, character): 187 | self.window.window.clear() 188 | if character == 259: # Up arrow 189 | if self.position != 0: 190 | self.position -= 1 191 | elif character == 258: # Down arrow 192 | self.position += 1 193 | elif character == 339: # Page up 194 | if self.position - self.height < 0: 195 | self.position = 0 196 | else: 197 | self.position -= self.height 198 | elif character == 338: # Page down 199 | if not self.position + self.height > len(self.data.split('\n')): 200 | self.position += self.height 201 | 202 | elif character in [260, 9]: # Left arrow or tab 203 | articles = self.window.get("articles") 204 | articles.active = True 205 | self.active = False 206 | 207 | elif character in [70, 102]: # f/F to fullscreen the pager 208 | articles = self.window.get("articles") 209 | if articles.hidden: 210 | articles.hidden = False 211 | else: 212 | articles.hidden = True 213 | 214 | class StatusLine(Pane): 215 | geometry = [EXPAND, 1] 216 | content = [] 217 | buffer = "" 218 | status = "" 219 | searching = False 220 | tagline = "Thanks God." 221 | 222 | def update(self): 223 | if self.searching: 224 | self.change_content(0, "/"+self.buffer, palette("black", "white")) 225 | else: 226 | state = self.tagline 227 | state += ' ' * ((self.width /2) - len(self.tagline) - (len(str(self.status))/2)) 228 | state += str(self.status) 229 | self.change_content(0, state) 230 | 231 | def process_input(self, character): 232 | self.window.window.clear() 233 | if not self.searching and character in [80, 112]: # p/P to enter a python REPL 234 | try: # You might need to 235 | import pprint # "sudo pip install ptpython" 236 | from ptpython.repl import embed # to enable this feature. 237 | 238 | def configure(repl): 239 | repl.prompt_style = "ipython" 240 | repl.vi_mode = True 241 | repl.confirm_exit = False 242 | repl.show_status_bar = False 243 | repl.show_line_numbers = True 244 | repl.show_sidebar_help = False 245 | repl.highlight_matching_parenthesis = True 246 | repl.use_code_colorscheme("native") 247 | 248 | def a(uid): 249 | """ 250 | Return raw article text given an article uid. 
251 | """ 252 | response = self.window.c.get("articles/%s" % uid) 253 | if response[1] == 200: 254 | return response[0]['content'] 255 | return "" 256 | 257 | p = pprint.PrettyPrinter() 258 | p = p.pprint 259 | l = {"a": a, "c": self.window.c, "p": p, "window": self.window} 260 | reader = self.window.get("reader") 261 | article = getattr(reader, "article", None) 262 | if article: 263 | l['article'] = article 264 | 265 | self.window.stop() 266 | print("\nStarting REPL. ^D to exit.") 267 | embed(locals=l, configure=configure) 268 | self.window.start() 269 | except ImportError: 270 | pass 271 | 272 | if not self.searching and character == 47: # / to search 273 | articles = self.window.get("articles") 274 | articles.active = False 275 | self.searching = True 276 | return 277 | 278 | if self.searching: 279 | self.window.window.clear() 280 | if character == 23 and self.buffer: # Clear buffer on ^W 281 | self.buffer = '' 282 | elif character == 263: # Handle backspace 283 | if self.buffer: 284 | self.buffer = self.buffer[:-1] 285 | if not self.buffer: 286 | self.searching = False 287 | articles = self.window.get("articles") 288 | articles.active = True 289 | 290 | elif character == 10 or character == 13: # Handle the return key 291 | # Pass control back to the articles view 292 | self.searching = False 293 | articles = self.window.get("articles") 294 | articles.active = True 295 | reader = self.window.get("reader") 296 | reader.active = False 297 | self.buffer = "" 298 | else: 299 | try: self.buffer += chr(character) # Append input to buffer 300 | except: pass 301 | # Perform a search for what's in the current buffer. 302 | articles = self.window.get("articles") 303 | url = "articles/search/"+self.buffer+"?per_page=" + str(articles.height) 304 | (res, status) = self.window.c.get(url) 305 | if status == 200: 306 | articles.fill_menu(res) 307 | 308 | 309 | window = Window(blocking=True) 310 | 311 | feedgroups = FeedGroups("feedgroups") 312 | feedgroups.active = False 313 | feedgroups.hidden = True 314 | feeds = Feeds("feeds") 315 | feeds.active = False 316 | feeds.hidden = True 317 | articles = Articles("articles") 318 | reader = Reader("reader") 319 | reader.wrap = True 320 | reader.active = False 321 | articles.reader = reader 322 | status = StatusLine("status") 323 | 324 | panes = [feedgroups, feeds, articles, reader] 325 | window.add(panes) 326 | window.add(status) 327 | 328 | window.exit_keys.append(4) # ^D to exit 329 | -------------------------------------------------------------------------------- /emissary/controllers/utils.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # This file defines a nifty utility for querying the database, 3 | # gzipping requests thanks to a snippet on pocoo.org and unique ID generation. 
4 | import gzip 5 | import uuid 6 | import urllib 7 | import hashlib 8 | import urlparse 9 | import functools 10 | from emissary import app, db 11 | from sqlalchemy import or_, and_ 12 | from cStringIO import StringIO as IO 13 | from flask import after_this_request, request 14 | from emissary.controllers.cron import parse_timings 15 | 16 | def sha1sum(text): 17 | return(hashlib.sha1(text).hexdigest()) 18 | 19 | def cors(f): 20 | if not 'ENABLE_CORS' in app.config or not app.config['ENABLE_CORS']: 21 | return f 22 | 23 | @functools.wraps(f) 24 | def view_func(*args, **kwargs): 25 | @after_this_request 26 | def enable_cors(response): 27 | response.headers['Access-Control-Allow-Headers'] = "Cache-Control, Pragma, Origin, Authorization, Content-Type, X-Requested-With, Accept" 28 | response.headers['Access-Control-Allow-Methods'] = "OPTIONS, GET, POST, PUT, DELETE" 29 | response.headers['Access-Control-Allow-Origin'] = "*" 30 | 31 | return response 32 | 33 | return f(*args, **kwargs) 34 | 35 | return view_func 36 | 37 | def gzipped(f): 38 | if not 'GZIP_HERE' in app.config or not app.config['GZIP_HERE']: 39 | return f 40 | 41 | @functools.wraps(f) 42 | def view_func(*args, **kwargs): 43 | 44 | @after_this_request 45 | def zipper(response): 46 | accept_encoding = request.headers.get('Accept-Encoding', '') 47 | 48 | if 'gzip' not in accept_encoding.lower(): 49 | return response 50 | 51 | response.direct_passthrough = False 52 | 53 | if (response.status_code < 200 or 54 | response.status_code >= 300 or 55 | 'Content-Encoding' in response.headers): 56 | return response 57 | gzip_buffer = IO() 58 | gzip_file = gzip.GzipFile(mode='wb', 59 | fileobj=gzip_buffer) 60 | gzip_file.write(response.data) 61 | gzip_file.close() 62 | 63 | response.data = gzip_buffer.getvalue() 64 | response.headers['Content-Encoding'] = 'gzip' 65 | response.headers['Vary'] = 'Accept-Encoding' 66 | response.headers['Content-Length'] = len(response.data) 67 | 68 | return response 69 | 70 | return f(*args, **kwargs) 71 | 72 | return view_func 73 | 74 | def uid(): return str(uuid.uuid4()) 75 | 76 | def tconv(seconds): 77 | minutes, seconds = divmod(seconds, 60) 78 | hours, minutes = divmod(minutes, 60) 79 | days, hours = divmod(hours, 24) 80 | weeks, days = divmod(days, 7) 81 | s="" 82 | if weeks: 83 | if weeks == 1: 84 | s+= "1 week, " 85 | else: 86 | s+= "%i weeks, " % (weeks) 87 | if days: 88 | if days == 1: 89 | s+= "1 day, " 90 | else: 91 | s+= "%i days, " % (days) 92 | if hours: 93 | if hours == 1: 94 | s+= "1 hour, " 95 | else: 96 | s+= "%i hours, " % (hours) 97 | if minutes: 98 | if minutes == 1: 99 | s+= "1 minute" 100 | else: 101 | s+= "%i minutes" % (minutes) 102 | if seconds: 103 | if len(s) > 0: 104 | if seconds == 1: 105 | s+= " and %i second" % (seconds) 106 | else: 107 | s+= " and %i seconds" % (seconds) 108 | else: 109 | if seconds == 1: 110 | s+= "1 second" 111 | else: 112 | s+= "%i seconds" % (seconds) 113 | return s 114 | 115 | def spaceparse(string): 116 | """ 117 | Return strings surrounded in quotes as a list, or dict if they're key="value".
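For example (illustrative values):

    spaceparse('name="HN" url="http://news.ycombinator.com/rss"')
    => {'name': 'HN', 'url': 'http://news.ycombinator.com/rss'}

    spaceparse('"free speech" "free beer"')
    => ['free speech', 'free beer']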
118 | """ 119 | results = [] 120 | quotes = string.count('"') 121 | quoted = quotes / 2 122 | keyvalue = False 123 | 124 | # Return an empty resultset if there are an uneven number of quotation marks 125 | if quotes % 2 != 0: 126 | return results 127 | 128 | # for every quoted phrase determine if it's an assignment and include the variable name 129 | # disregard it from the string we're working with and continue onto the next quoted part 130 | for phrase in range(0,quoted+1): 131 | if not string: break 132 | start = string.find('"') 133 | end = string.find('"', start+1) 134 | 135 | if start > 0 and string[start-1] == '=': 136 | keyvalue = True 137 | for i in range(start,-1,-1): 138 | if string[i] == ' ' or i == 0: 139 | results.append(string[i:end]) 140 | break 141 | else: 142 | results.append(string[start+1:end]) 143 | string = string[end+1:] 144 | if keyvalue: 145 | res = {} 146 | for item in results: 147 | k,v = item.split('=') 148 | if k.startswith(' '): 149 | k=k[1:] 150 | if v.startswith('"'): 151 | v=v[1:] 152 | res[k]=v 153 | return res 154 | return results 155 | 156 | def update_url(url, params): 157 | url_parts = list(urlparse.urlparse(request.url)) 158 | query = dict(urlparse.parse_qsl(url_parts[4])) 159 | query.update(params) 160 | url_parts[4] = urllib.urlencode(query) 161 | return urlparse.urlunparse(url_parts) 162 | 163 | def make_response(url, query, jsonify=True): 164 | """ 165 | Take a paginated SQLAlchemy query and return 166 | a response that's more easily reasoned about 167 | by other programs. 168 | """ 169 | response = {} 170 | if jsonify: 171 | response['data'] = [i.jsonify() for i in query.items] 172 | 173 | response['links'] = {} 174 | response['links']['self'] = url 175 | if query.has_next: 176 | response['links']['next'] = update_url(url, {"page": str(query.next_num)}) 177 | return response 178 | -------------------------------------------------------------------------------- /emissary/models.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | """ 3 | MIT License. 4 | Luke Brooks 2015 5 | Database layout for Emissary. 6 | """ 7 | import time 8 | import snappy 9 | from hashlib import sha256 10 | from emissary import db, app 11 | from multiprocessing import Queue 12 | from emissary.controllers.utils import uid 13 | 14 | class APIKey(db.Model): 15 | """ 16 | An Emissary API Key. 17 | Reader keys cannot PUT, POST or DELETE. 18 | """ 19 | __tablename__ = 'api_keys' 20 | id = db.Column(db.Integer, primary_key=True) 21 | parent_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id")) 22 | name = db.Column(db.String(80)) 23 | key = db.Column(db.String(120)) 24 | active = db.Column(db.Boolean()) 25 | reader = db.Column(db.Boolean(), default=False) 26 | created = db.Column(db.DateTime(timezone=True), default=db.func.now()) 27 | parent = db.relationship("APIKey", backref="readers", remote_side=[id]) 28 | feedgroups = db.relationship("FeedGroup", backref="key") 29 | feeds = db.relationship("Feed", backref="key") 30 | articles = db.relationship("Article", backref="key") 31 | events = db.relationship("Event", backref="key") 32 | 33 | def generate_key_str(self): 34 | """ 35 | Returns a SHA256 of the time as an API Key. 
36 | """ 37 | return sha256(time.asctime() + str(time.time())).hexdigest() 38 | 39 | def __repr__(self): 40 | if not self.name: 41 | return "" 42 | return '' % self.name 43 | 44 | def jsonify(self, feedgroups=False, with_key_str=False): 45 | response = {} 46 | response['name'] = self.name 47 | if with_key_str: 48 | response['apikey'] = self.key 49 | if feedgroups: 50 | response['feedgroups'] = [group.jsonify() for group in self.feedgroups] 51 | response['active'] = self.active 52 | response['reader'] = self.reader 53 | if self.reader: 54 | response['parent'] = self.parent.name 55 | return response 56 | 57 | class FeedGroup(db.Model): 58 | __tablename__ = "feed_groups" 59 | id = db.Column(db.Integer(), primary_key=True) 60 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id")) 61 | uid = db.Column(db.String(36), default=uid()) 62 | name = db.Column(db.String(80)) 63 | feeds = db.relationship('Feed', backref="group") 64 | created = db.Column(db.DateTime(timezone=True), default=db.func.now()) 65 | active = db.Column(db.Boolean(), default=True) 66 | 67 | def __repr__(self): 68 | if self.name: 69 | return '' % (self.name, len(self.feeds)) 70 | return "" 71 | 72 | def jsonify(self): 73 | response = {} 74 | if self.created: 75 | response['name'] = self.name 76 | response['uid'] = self.uid 77 | response['created'] = time.mktime(self.created.timetuple()) 78 | response['active'] = self.active 79 | response['feeds'] = [feed.jsonify() for feed in self.feeds] 80 | return response 81 | 82 | class Feed(db.Model): 83 | __tablename__ = "feeds" 84 | id = db.Column(db.Integer(), primary_key=True) 85 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id")) 86 | group_id = db.Column(db.Integer(), db.ForeignKey("feed_groups.id")) 87 | uid = db.Column(db.String(36), default=uid()) 88 | name = db.Column(db.String(100)) 89 | url = db.Column(db.String(150)) 90 | schedule = db.Column(db.String(80)) 91 | active = db.Column(db.Boolean(), default=True) 92 | created = db.Column(db.DateTime(timezone=True), default=db.func.now()) 93 | articles = db.relationship('Article', backref="feed") 94 | 95 | def __repr__(self): 96 | if self.name: 97 | return '' % (self.name, len(self.articles)) 98 | return "" 99 | 100 | def is_running(self): 101 | """ 102 | Ask the feedmanager what's happening. 
103 | """ 104 | if not app.inbox: 105 | return None 106 | 107 | response_queue = app.queues[-1] 108 | qid = hex(id(response_queue)) 109 | app.inbox.put([qid, "check", self]) 110 | 111 | # Wait somewhere around 500ms max for a response 112 | then = time.time() 113 | while response_queue.empty(): 114 | now = time.time() 115 | if (now - then) >= 0.5: 116 | return None 117 | 118 | return response_queue.get() 119 | 120 | def jsonify(self, articles=False): 121 | response = {} 122 | if self.created: 123 | response['name'] = self.name 124 | response['uid'] = self.uid 125 | response['url'] = self.url 126 | response['created'] = time.mktime(self.created.timetuple()) 127 | response['schedule'] = self.schedule 128 | response['active'] = self.active 129 | response['article_count'] = len(self.articles) 130 | response['running'] = self.is_running() 131 | if self.group: 132 | response['group'] = self.group.name 133 | else: 134 | response['group'] = None 135 | return response 136 | 137 | 138 | class Article(db.Model): 139 | __tablename__ = "articles" 140 | id = db.Column(db.Integer(), primary_key=True) 141 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id")) 142 | uid = db.Column(db.String(36)) 143 | feed_id = db.Column(db.Integer(), db.ForeignKey("feeds.id")) 144 | title = db.Column(db.String(80)) 145 | url = db.Column(db.String(400)) 146 | content = db.Column(db.String(2000)) 147 | ccontent = db.Column(db.LargeBinary()) 148 | summary = db.Column(db.String(800)) 149 | created = db.Column(db.DateTime(timezone=True), default=db.func.now()) 150 | compressed = db.Column(db.Boolean(), default=False) 151 | 152 | def text(self): 153 | if self.content: 154 | return self.content.decode("utf-8", "ignore") 155 | if self.ccontent: 156 | return snappy.decompress(self.ccontent).decode("utf-8", "ignore") 157 | return "" 158 | 159 | def __repr__(self): 160 | if self.content or self.ccontent: 161 | return '
<Article: %s>' % self.title.encode("utf-8", "ignore") 162 | if self.url and self.title: 163 | return '<Article: %s (no content)>' % self.title.encode("utf-8", "ignore") 164 | return "<Article>
" 165 | 166 | def jsonify(self, summary=False, content=False): 167 | response = {} 168 | if self.title: 169 | response['title'] = self.title.encode("utf-8", "ignore") 170 | response['url'] = self.url.encode("utf-8", "ignore") 171 | response['uid'] = self.uid 172 | response['created'] = time.mktime(self.created.timetuple()) 173 | if self.feed: 174 | response['feed'] = self.feed.name 175 | if content: 176 | response['compressed'] = self.compressed 177 | if self.ccontent: 178 | response['content'] = snappy.decompress(self.ccontent) 179 | else: 180 | response['content'] = self.content 181 | if not content: 182 | if self.content or self.ccontent: 183 | response['content_available'] = True 184 | else: 185 | response['content_available'] = False 186 | if summary and self.summary: 187 | response['summary'] = self.summary 188 | return response 189 | 190 | class Event(db.Model): 191 | __tablename__ = "events" 192 | id = db.Column(db.Integer(), primary_key=True) 193 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id")) 194 | created = db.Column(db.DateTime(timezone=True), default=db.func.now()) 195 | feed_id = db.Column(db.Integer(), db.ForeignKey("feeds.id")) 196 | success = db.Column(db.Boolean()) 197 | message = db.Column(db.String(200)) 198 | 199 | def __repr__(self): 200 | return "" 201 | 202 | def jsonify(self): 203 | return {} 204 | -------------------------------------------------------------------------------- /emissary/repl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import cmd 4 | import json 5 | import time 6 | import errno 7 | import _curses 8 | import optparse 9 | import textwrap 10 | from emissary import app 11 | from emissary.client import Client 12 | from emissary.models import APIKey 13 | from subprocess import Popen, PIPE 14 | from emissary.controllers.utils import tconv, spaceparse 15 | from emissary.controllers.tui import window 16 | 17 | try: 18 | from pygments import highlight 19 | from pygments.lexers import JsonLexer 20 | from pygments.styles import get_style_by_name, STYLE_MAP 21 | from pygments.formatters.terminal256 import Terminal256Formatter 22 | except ImportError: highlight = False 23 | 24 | class repl(cmd.Cmd): 25 | 26 | prompt = "> " 27 | intro = "Emissary %s\nPsybernetics %i\n" % (app.version, time.gmtime()[0]) 28 | ruler = '-' 29 | width = 80 30 | 31 | 32 | def parse_args(self, args): 33 | body = {} 34 | parsed = spaceparse(args) 35 | args = args.split() 36 | for i in args: 37 | try: 38 | x=i.split('=') 39 | if type(parsed) == dict and not x[0] in parsed: 40 | parsed[x[0]] = x[1] 41 | else: 42 | body[x[0]] = x[1] 43 | except: continue 44 | if type(parsed) == dict: body = parsed 45 | return body 46 | 47 | def formatted_prompt(self): 48 | """ 49 | Here we format the first return value of /v1/articles/count 50 | into something that adds commas to triple digit (etc) values. 51 | """ 52 | try: 53 | return "({:,}) > ".format( 54 | self.c.get("articles/count")[0] 55 | ) 56 | except: 57 | return "no connection> " 58 | 59 | def do_setkey(self,key): 60 | "Sets the API key to transmit requests with." 61 | if key: 62 | self.c.key = key 63 | print 'Changed active API key to "%s"' % key 64 | else: 65 | print "Usage: setkey " 66 | 67 | def do_use(self,key): 68 | "Alias of setkey." 69 | self.do_setkey(key) 70 | 71 | def do_getkey(self,line): 72 | "Displays the current API key." 
73 | print self.c.key 74 | 75 | def do_get(self,line): 76 | """ 77 | Sends GET requests 78 | EG: get articles 79 | get feeds 80 | get feedgroups 81 | """ 82 | response = self.c._send_request(line) 83 | self.display(response) 84 | 85 | def do_put(self,line): 86 | """ 87 | Creates a new feed or feed group. 88 | EG: put feedgroups name=HN 89 | """ 90 | if not ' ' in line: 91 | print "Need data to transmit." 92 | else: 93 | line, body = line.split(' ',1) 94 | body = self.parse_args(body) 95 | response = self.c._send_request(line, 'PUT', body) 96 | self.display(response) 97 | 98 | 99 | def do_post(self,line): 100 | """ 101 | Modifies an existing feed or feed group. 102 | EG: post feeds/SomeFeed schedule="20 3 2! * *" 103 | """ 104 | 105 | if not ' ' in line: 106 | print "Need data to transmit." 107 | else: 108 | line, body = line.split(' ',1) 109 | body = self.parse_args(body) 110 | response = self.c._send_request(line, 'POST', body) 111 | self.display(response) 112 | 113 | def do_exit(self,line): 114 | try: 115 | _curses.endwin() 116 | except _curses.error: 117 | pass 118 | finally: 119 | raise SystemExit 120 | 121 | def do_read(self,line): 122 | """ 123 | Usage: read 124 | Pipes article content into the system pager. 125 | 126 | Text column width can be configured with the width command. 127 | """ 128 | then = time.time() 129 | response = self.c._send_request("articles/" + line) 130 | if response[1] != 200: 131 | print response[1] 132 | return 133 | 134 | data = response[0] 135 | 136 | if not 'content' in data: 137 | print None 138 | else: 139 | 140 | p = Popen(['less', '-P', data['title']], stdin=PIPE) 141 | 142 | try: 143 | duration = tconv(int(then) - int(data['created'])) 144 | p.stdin.write('%s\n(%i paragraphs, fetched %s ago)\n%s\n\n' % \ 145 | (data['title'].encode("utf-8", "ignore"), 146 | len(data['content'].encode("utf-8","ignore").split("\n"))/2+1, 147 | duration, 148 | data['url'].encode("utf-8","ignore"))) 149 | 150 | content = data['content'].encode("utf-8", "ignore") 151 | # Get TTY width and wrap the text 152 | if self.width == "auto": 153 | s = _curses.initscr() 154 | width = s.getmaxyx()[1] 155 | _curses.endwin() 156 | 157 | else: 158 | width = self.width 159 | 160 | content = '\n'.join( 161 | textwrap.wrap(content, width, break_long_words=False, replace_whitespace=False) 162 | ) 163 | p.stdin.write(content) 164 | 165 | except IOError as e: 166 | if e.errno == errno.EPIPE or e.errno == errno.EINVAL: 167 | sys.stderr.write("Error writing to pipe.\n") 168 | else: 169 | raise 170 | 171 | p.stdin.close() 172 | p.wait() 173 | now = time.time() 174 | duration = tconv(now-then) 175 | # print "\n%s" % duration 176 | 177 | def do_delete(self,line): 178 | """ 179 | Sends a DELETE request. 180 | EG: delete feeds/somefeed 181 | """ 182 | if ' ' in line: 183 | line, body = line.split(' ',1) 184 | body = self.parse_args(body) 185 | else: body = '' 186 | response = self.c._send_request(line, 'DELETE', body) 187 | self.display(response) 188 | 189 | def do_EOF(self,line): 190 | print "^D", 191 | return True 192 | 193 | def postcmd(self, stop, line): 194 | self.prompt = self.formatted_prompt() 195 | return stop 196 | 197 | def emptyline(self): 198 | pass 199 | 200 | def postloop(self): 201 | print 202 | 203 | def do_width(self, line): 204 | """ 205 | Set the text width for the read command. 206 | Acceptable values are an integer amount of characters or "auto". 
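EG: width 80
    width auto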
207 | """ 208 | if line == "auto": 209 | self.width = "auto" 210 | elif line == "": 211 | print "The current width is set to %s" % str(self.width) 212 | else: 213 | try: 214 | self.width = int(line) 215 | except: 216 | print "width must be an integer." 217 | 218 | def do_search(self, line): 219 | self.do_get("articles/search/" + line) 220 | 221 | def do_style(self, style): 222 | """ 223 | Usage: style 224 | Lists the available themes if no 225 | name is supplied, or sets the theme to use. 226 | """ 227 | if not self.highlight: 228 | print "For syntax highlighting you will need to install the Pygments package." 229 | print "sudo pip install pygments" 230 | return 231 | if style: 232 | self.style = style 233 | print 'Changed style to "%s"' % style 234 | else: 235 | print ', '.join(self.AVAILABLE_STYLES) 236 | print 'Currently using "%s"' % self.style 237 | 238 | def display(self, response): 239 | if self.highlight: 240 | print response[1] 241 | print highlight(json.dumps(response[0],indent=4), JsonLexer(), Terminal256Formatter(style=self.style)) 242 | else: self.c.p(response) 243 | 244 | def reqwrap(func): 245 | def wrapper(*args, **kwargs): 246 | try: return func(*args, **kwargs) 247 | except: return ({'error':'Connection refused.'}, 000) 248 | return wrapper 249 | 250 | 251 | if __name__ == "__main__": 252 | parser = optparse.OptionParser(prog="python -m emissary.repl") 253 | parser.add_option("--host", dest="host", action="store", default='localhost:6362/v1/') 254 | parser.add_option("--ncurses", dest="ncurses", action="store_true", default=False) 255 | (options,args) = parser.parse_args() 256 | 257 | if options.ncurses: 258 | r = window 259 | else: 260 | r = repl() 261 | 262 | r.c = Client('','https://%s' % options.host, verify=False) 263 | 264 | r.c.key = "" 265 | 266 | try: 267 | k = APIKey.query.first() 268 | except Exception, e: 269 | print "Encountered an error: " + e.message 270 | print "This either means there's no URI exported as EMISSARY_DATABASE or you've exported a URI" 271 | print "but haven't given Emissary a first run in order to write the schema and a primary API key." 272 | raise SystemExit 273 | 274 | if k: r.c.key = k.key 275 | r.c.verify_https = False 276 | 277 | if not options.ncurses: 278 | r.highlight = highlight 279 | r.prompt = r.formatted_prompt() 280 | if highlight: 281 | r.AVAILABLE_STYLES = set(STYLE_MAP.keys()) 282 | if 'tango' in r.AVAILABLE_STYLES: r.style = 'tango' 283 | else: 284 | for s in r.AVAILABLE_STYLES: break 285 | r.style = s 286 | r.c._send_request = reqwrap(r.c._send_request) 287 | 288 | try: 289 | if options.ncurses: 290 | window.start() 291 | else: 292 | r.cmdloop() 293 | except KeyboardInterrupt: 294 | print "^C" 295 | raise SystemExit 296 | -------------------------------------------------------------------------------- /emissary/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/emissary/resources/__init__.py -------------------------------------------------------------------------------- /emissary/resources/api_key.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # This module determines the behavior of API Keys within the system. 3 | # You may also want to check the definition of API keys in models.py. 
4 | import re 5 | from flask import request 6 | from sqlalchemy import and_ 7 | from emissary import app, db 8 | from emissary.models import * 9 | from flask.ext import restful 10 | from flask.ext.restful import reqparse, abort 11 | from emissary.controllers.utils import cors, gzipped 12 | 13 | def auth(forbid_reader_keys=False): 14 | """ 15 | Here we determine that inactive keys are invalid 16 | and that reader keys are their parent unless forbidden. 17 | """ 18 | if 'Authorization' in request.headers: 19 | key_str = request.headers['Authorization'].replace('Basic ', '') 20 | key = APIKey.query.filter(APIKey.key == key_str).first() 21 | if key and key.active: 22 | if key.reader: 23 | if not forbid_reader_keys: 24 | return key.parent 25 | abort(401, message="Forbidden to reader keys.") 26 | return key 27 | abort(401, message="Invalid API Key.") 28 | 29 | class KeyCollection(restful.Resource): 30 | 31 | @cors 32 | @gzipped 33 | def get(self): 34 | key = auth() 35 | response = key.jsonify(feedgroups=False) 36 | 37 | if key.name == app.config['MASTER_KEY_NAME'] or key.systemwide: 38 | response['system'] = {} 39 | 40 | if key.name == app.config['MASTER_KEY_NAME']: 41 | keys = [] 42 | for i in APIKey.query.all(): keys.append(i.name) 43 | response['system']['keys'] = keys 44 | response['system']['permit_new'] = app.config['PERMIT_NEW'] 45 | 46 | return [response] 47 | 48 | @cors 49 | @gzipped 50 | def put(self): 51 | """ 52 | This method creates keys under the specified name, 53 | presuming config['PERMIT_NEW'] is enabled or the master key is in use. 54 | 55 | Reader keys (keys that can only perform GET requests) are created by setting 56 | the "reader" parameter to a value in the body of the request. 57 | They are automatically associated with the requesting key. 58 | """ 59 | key = None 60 | parser = reqparse.RequestParser() 61 | parser.add_argument("name",type=str, help="Name associated with the key", required=True) 62 | parser.add_argument("reader",type=bool, help="Creates a reader key", default=False) 63 | args = parser.parse_args() 64 | 65 | if 'Authorization' in request.headers: 66 | key_str = request.headers['Authorization'].replace('Basic ', '') 67 | key = APIKey.query.filter(APIKey.key == key_str).first() 68 | if key.reader: 69 | abort(401, message="Reader keys cannot create API keys.") 70 | 71 | # Create a reader key if this request has been made with an existing key 72 | if key and args.name and args.reader: 73 | new_key = APIKey(name=args.name, active=True, reader=True) 74 | new_key.key = new_key.generate_key_str() 75 | key.readers.append(new_key) 76 | db.session.add(key) 77 | db.session.add(new_key) 78 | db.session.commit() 79 | 80 | return new_key.jsonify(with_key_str=True), 201 81 | 82 | if (key and key.name == app.config['MASTER_KEY_NAME']) or app.config['PERMIT_NEW']: 83 | # Permit only simple names (character limit, alphanumeric) 84 | if re.match("^$|\s+[a-zA-Z0-9_]+$",args.name) or len(args.name) > 60: 85 | abort(422, message="Invalid key name. 
Must contain alphanumeric characters.") 86 | # Determine if already exists 87 | key = APIKey.query.filter(APIKey.name == args.name).first() 88 | 89 | if key: abort(403, message="A key already exists with this name.") 90 | 91 | key = APIKey(name=args.name) 92 | key.key = key.generate_key_str() 93 | key.active = True 94 | db.session.add(key) 95 | db.session.commit() 96 | 97 | return key.jsonify(with_key_str=True), 201 98 | 99 | abort(403, message="This server isn't currently generating new keys.") 100 | 101 | @cors 102 | @gzipped 103 | def post(self): 104 | "This method is for updating existing API keys via the master key." 105 | 106 | key = auth(forbid_reader_keys=True) 107 | 108 | parser = reqparse.RequestParser() 109 | parser.add_argument("key",type=str, help="API Key") 110 | parser.add_argument("name",type=str, help="Name associated with the key") 111 | parser.add_argument("permit_new", type=bool, help="Determines whether new API keys can be created.") 112 | parser.add_argument("active", type=bool, help="Determines whether a key is active or not.", default=None) 113 | args = parser.parse_args() 114 | 115 | if key.name != app.config['MASTER_KEY_NAME']: abort(403) 116 | 117 | response={} 118 | subject = None 119 | 120 | if args.key and args.name: 121 | subject = APIKey.query.filter(APIKey.key == args.key).first() 122 | if APIKey.query.filter(APIKey.name == args.name).first(): 123 | return {'message':"A key already exists with this name."}, 304 124 | subject.name = args.name 125 | elif args.name and not args.key: 126 | subject = APIKey.query.filter(APIKey.name == args.name).first() 127 | elif args.key and not args.name: 128 | subject = APIKey.query.filter(APIKey.key == args.key).first() 129 | 130 | if not subject: abort(404) 131 | 132 | if subject.name == app.config['MASTER_KEY_NAME']: abort(403) 133 | if args.active or args.active == False: 134 | subject.active = args.active 135 | 136 | response['key'] = subject.jsonify(with_key_str=True) 137 | db.session.add(subject) 138 | 139 | if (args.permit_new or args.permit_new == False) and key.name == app.config['MASTER_KEY_NAME']: 140 | app.config['PERMIT_NEW'] = args.permit_new 141 | response['system'] = {} 142 | response['system']['permit_new'] = app.config['PERMIT_NEW'] 143 | 144 | db.session.commit() 145 | return response 146 | 147 | @cors 148 | @gzipped 149 | def delete(self): 150 | # http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html#configuring-delete-delete-orphan-cascade 151 | key = auth(forbid_reader_keys=True) 152 | 153 | parser = reqparse.RequestParser() 154 | parser.add_argument("key",type=str, help="API Key") 155 | args = parser.parse_args() 156 | 157 | target = APIKey.query.filter(APIKey.key == args.key).first() 158 | if not target: abort(404, message="Unrecognized key.") 159 | 160 | if args.key != key.key and key.name != app.config['MASTER_KEY_NAME']: 161 | abort(403, message="You do not have permission to remove this key.") 162 | if key.name == app.config['MASTER_KEY_NAME'] and args.key == key.key: 163 | abort(403, message="You are attempting to delete the master key.") 164 | 165 | for fg in target.feedgroups: db.session.delete(fg) 166 | for f in target.feeds: db.session.delete(f) 167 | for a in target.articles: db.session.delete(a) 168 | 169 | db.session.delete(target) 170 | db.session.commit() 171 | return {}, 204 172 | 173 | class KeyResource(restful.Resource): 174 | 175 | @cors 176 | @gzipped 177 | def get(self, name): 178 | """ 179 | Permit the administrative key to review another key by name. 
180 | """ 181 | key = auth(forbid_reader_keys=True) 182 | if key.name != app.config['MASTER_KEY_NAME'] and name != key.name: 183 | abort(403) 184 | 185 | target = APIKey.query.filter_by(name=name).first() 186 | if target: 187 | return target.jsonify(feedgroups=True, with_key_str=True) 188 | 189 | abort(404, message="Unrecognised key.") 190 | -------------------------------------------------------------------------------- /emissary/resources/articles.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # This file determines how articles are accessed. 3 | # You may also want to examine the Article class in emissary/models.py 4 | from emissary import db 5 | from flask import request 6 | from flask.ext import restful 7 | from sqlalchemy import desc, and_ 8 | from emissary.models import Article 9 | from emissary.resources.api_key import auth 10 | from emissary.controllers.fetch import fetch_feedless_article 11 | from emissary.controllers.utils import make_response, gzipped, cors 12 | 13 | class ArticleCollection(restful.Resource): 14 | 15 | @cors 16 | def get(self): 17 | """ 18 | Review all articles associated with this key. 19 | """ 20 | key = auth() 21 | 22 | parser = restful.reqparse.RequestParser() 23 | parser.add_argument("page", type=int, default=1) 24 | parser.add_argument("per_page", type=int, default=10) 25 | parser.add_argument("content", type=bool, default=None) 26 | args = parser.parse_args() 27 | 28 | # Construct a query for Articles ordered by descending creation date and paginated. 29 | if args.content == True: 30 | query = Article.query.filter(and_(Article.key == key, Article.content != None))\ 31 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 32 | elif args.content == False: 33 | query = Article.query.filter(and_(Article.key == key, Article.content == None))\ 34 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 35 | else: 36 | query = Article.query.filter(Article.key == key)\ 37 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 38 | 39 | # Attach links to help consuming applications 40 | response = make_response(request.url, query) 41 | return response 42 | 43 | @cors 44 | def put(self): 45 | """ 46 | Fetch an article without an associated feed. 47 | """ 48 | key = auth() 49 | 50 | parser = restful.reqparse.RequestParser() 51 | parser.add_argument("url", type=str, required=True) 52 | args = parser.parse_args() 53 | 54 | try: 55 | article = fetch_feedless_article(key, args.url) 56 | except Exception, e: 57 | return {"Error": e.message} 58 | 59 | if not article: 60 | return {"Error": "This URL has already been stored."}, 304 61 | 62 | return article.jsonify(), 201 63 | 64 | class ArticleSearch(restful.Resource): 65 | 66 | @cors 67 | def get(self, terms): 68 | """ 69 | The /v1/articles/search/ endpoint. 
70 | """ 71 | key = auth() 72 | 73 | parser = restful.reqparse.RequestParser() 74 | parser.add_argument("page", type=int, help="", default=1) 75 | parser.add_argument("per_page", type=int, help="", default=10) 76 | parser.add_argument("content", type=bool, help="", default=None) 77 | args = parser.parse_args() 78 | 79 | if args.content == True: 80 | query = Article.query.filter( 81 | and_( 82 | Article.key == key, 83 | Article.content != None, 84 | Article.title.like("%" + terms + "%") 85 | ))\ 86 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 87 | 88 | response = make_response(request.url, query) 89 | 90 | # This method of manually pruning JSON documents because they 91 | # don't relate to items that have content can omit them from search 92 | # completely. They don't have content but they're showing up here in 93 | # content != None rather than content == None.. You could always just 94 | # comment out this next for loop 95 | for doc in response['data']: 96 | if not doc['content_available']: 97 | response['data'].remove(doc) 98 | return response 99 | 100 | elif args.content == False: 101 | query = Article.query.filter( 102 | and_( 103 | Article.key == key, 104 | Article.content == None, 105 | Article.title.like("%" + terms + "%") 106 | ))\ 107 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 108 | return make_response(request.url, query) 109 | 110 | query = Article.query.filter( 111 | and_(Article.key == key, Article.title.like("%" + terms + "%")))\ 112 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 113 | return make_response(request.url, query) 114 | 115 | class ArticleResource(restful.Resource): 116 | 117 | @cors 118 | def get(self, uid): 119 | """ 120 | Read an article. 121 | """ 122 | key = auth() 123 | 124 | article = Article.query.filter(and_(Article.key == key, Article.uid == uid)).first() 125 | if article: 126 | return article.jsonify(summary=True, content=True) 127 | 128 | restful.abort(404) 129 | 130 | @cors 131 | @gzipped 132 | def delete(self, uid): 133 | """ 134 | Delete an article. 135 | """ 136 | key = auth(forbid_reader_keys=True) 137 | 138 | article = Article.query.filter(and_(Article.key == key, Article.uid == uid)).first() 139 | if article: 140 | db.session.delete(article) 141 | db.session.commit() 142 | return {} 143 | 144 | restful.abort(404) 145 | 146 | class ArticleCount(restful.Resource): 147 | 148 | @cors 149 | def get(self): 150 | """ 151 | Return the amount of articles belonging to an API key. 152 | """ 153 | key = auth() 154 | return len(key.articles) 155 | 156 | -------------------------------------------------------------------------------- /emissary/resources/feedgroups.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # This file provides the HTTP endpoints for operating on groups of feeds. 3 | from emissary import app, db 4 | from flask import request 5 | from flask.ext import restful 6 | from sqlalchemy import and_, desc 7 | from emissary.resources.api_key import auth 8 | from emissary.models import FeedGroup, Feed, Article 9 | from emissary.controllers.cron import CronError, parse_timings 10 | from emissary.controllers.utils import cors, gzipped, make_response 11 | 12 | class FeedGroupCollection(restful.Resource): 13 | 14 | @cors 15 | @gzipped 16 | def get(self): 17 | """ 18 | Paginate an array of feed groups 19 | associated with the requesting key. 
20 | """ 21 | key = auth() 22 | 23 | parser = restful.reqparse.RequestParser() 24 | parser.add_argument("page", type=int, default=1) 25 | parser.add_argument("per_page", type=int, default=10) 26 | parser.add_argument("content", type=bool, default=None) 27 | args = parser.parse_args() 28 | 29 | query = FeedGroup.query.filter(FeedGroup.key == key)\ 30 | .order_by(desc(FeedGroup.created)).paginate(args.page, args.per_page) 31 | 32 | return make_response(request.url, query) 33 | 34 | @cors 35 | @gzipped 36 | def put(self): 37 | """ 38 | Create a new feed group, providing the name isn't already in use. 39 | """ 40 | key = auth(forbid_reader_keys=True) 41 | 42 | parser = restful.reqparse.RequestParser() 43 | parser.add_argument("name", type=str, required=True) 44 | parser.add_argument("active", type=bool, default=True, help="Feed is active", required=False) 45 | args = parser.parse_args() 46 | 47 | # Check for this name already existing in the groups on this key 48 | if [fg for fg in key.feedgroups if fg.name == args.name]: 49 | return {"message":"Feed group %s already exists." % args.name}, 304 50 | 51 | fg = FeedGroup(name=args.name, active=args.active) 52 | key.feedgroups.append(fg) 53 | db.session.add(fg) 54 | db.session.add(key) 55 | db.session.commit() 56 | 57 | return fg.jsonify(), 201 58 | 59 | class FeedGroupResource(restful.Resource): 60 | 61 | @cors 62 | @gzipped 63 | def get(self, groupname): 64 | """ 65 | Review a specific feed group. 66 | """ 67 | key = auth() 68 | 69 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 70 | if not fg: 71 | restful.abort(404) 72 | return fg.jsonify() 73 | 74 | @cors 75 | @gzipped 76 | def put(self, groupname): 77 | """ 78 | Create a new feed providing the name and url are unique. 79 | Feeds must be associated with a group. 80 | """ 81 | key = auth(forbid_reader_keys=True) 82 | 83 | parser = restful.reqparse.RequestParser() 84 | parser.add_argument("name", type=str, required=True) 85 | parser.add_argument("url", type=str, required=True) 86 | parser.add_argument("schedule", type=str, required=True) 87 | parser.add_argument("active", type=bool, default=True, help="Feed is active", required=False) 88 | args = parser.parse_args() 89 | 90 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 91 | if not fg: 92 | return {"message":"Unknown Feed Group %s" % groupname}, 304 93 | 94 | # Verify the schedule 95 | try: 96 | parse_timings(args.schedule) 97 | except CronError, err: 98 | return {"message": err.message}, 500 99 | 100 | # Check the URL isn't already scheduled on this key 101 | if [feed for feed in key.feeds if feed.url == args.url]: 102 | return {"message": "A feed on this key already exists with this url."}, 500 103 | 104 | # Check the name is unique to this feedgroup 105 | if [feed for feed in fg.feeds if feed.name == args.name]: 106 | return {"message": "A feed in this group already exists with this name."}, 500 107 | 108 | feed = Feed(name=args.name, url=args.url, schedule=args.schedule, active=args.active) 109 | 110 | # We generally don't want to have objects in this system that don't belong to API keys. 111 | fg.feeds.append(feed) 112 | key.feeds.append(feed) 113 | 114 | db.session.add(feed) 115 | db.session.add(fg) 116 | db.session.add(key) 117 | db.session.commit() 118 | 119 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == args.name)).first() 120 | if not feed: 121 | return {"message":"Error saving feed."}, 304 122 | 123 | # Schedule this feed. 
0 here is a response 124 | # queue ID (we're not waiting for a reply) 125 | app.inbox.put([0, "start", [key,feed.name]]) 126 | return feed.jsonify(), 201 127 | 128 | @cors 129 | @gzipped 130 | def post(self, groupname): 131 | "Rename a feedgroup or toggle active status" 132 | 133 | key = auth(forbid_reader_keys=True) 134 | 135 | parser = restful.reqparse.RequestParser() 136 | parser.add_argument("name", type=str, help="Rename a feed group",) 137 | parser.add_argument("active", type=bool, default=None) 138 | args = parser.parse_args() 139 | 140 | fg = FeedGroup.query.filter( 141 | and_(FeedGroup.key == key, FeedGroup.name == groupname) 142 | ).first() 143 | if not fg: 144 | restful.abort(404) 145 | 146 | if args.name: 147 | if FeedGroup.query.filter( 148 | and_(FeedGroup.key == key, FeedGroup.name == args.name) 149 | ).first(): 150 | return {"message":"A feed already exists with this name."}, 304 151 | fg.name = args.name 152 | 153 | if args.active or args.active == False: 154 | fg.active = args.active 155 | 156 | db.session.add(fg) 157 | db.session.commit() 158 | return fg.jsonify() 159 | 160 | @cors 161 | @gzipped 162 | def delete(self, groupname): 163 | key = auth(forbid_reader_keys=True) 164 | 165 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 166 | if not fg: 167 | restful.abort(404) 168 | count=0 169 | for feed in fg.feeds: 170 | for article in feed.articles: 171 | count += 1 172 | db.session.delete(article) 173 | db.session.delete(feed) 174 | db.session.delete(fg) 175 | db.session.commit() 176 | count = "{:,}".format(count) 177 | app.log('%s: Deleted feed group "%s". (%s articles)' % (key.name, fg.name, count)) 178 | 179 | return {} 180 | 181 | class FeedGroupArticles(restful.Resource): 182 | 183 | @cors 184 | def get(self, groupname): 185 | """ 186 | Retrieve articles by feedgroup. 187 | """ 188 | key = auth() 189 | 190 | # Summon the group or 404. 191 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 192 | if not fg: restful.abort(404) 193 | 194 | parser = restful.reqparse.RequestParser() 195 | parser.add_argument("page", type=int, default=1) 196 | parser.add_argument("per_page", type=int, default=10) 197 | parser.add_argument("content", type=bool, default=None) 198 | args = parser.parse_args() 199 | 200 | if args.content == True: 201 | 202 | query = Article.query.filter( 203 | and_(Article.feed.has(group=fg), Article.content != None))\ 204 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 205 | 206 | response = make_response(request.url, query) 207 | 208 | # for doc in response['data']: 209 | # if not doc['content_available']: 210 | # response['data'].remove(doc) 211 | # return response 212 | 213 | if args.content == False: 214 | query = Article.query.filter( 215 | and_(Article.feed.has(group=fg), Article.content == None))\ 216 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 217 | 218 | return make_response(request.url, query) 219 | 220 | query = Article.query.filter( 221 | Article.feed.has(group=fg))\ 222 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 223 | 224 | return make_response(request.url, query) 225 | 226 | class FeedGroupStart(restful.Resource): 227 | 228 | @cors 229 | def post(self, groupname): 230 | """ 231 | Start all feeds within a group. 
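Each feed is started by putting a [queue_id, "start", [key, feed_name]]
message in the feed manager's inbox; a queue_id of 0 means no reply is expected.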
232 | """ 233 | key = auth(forbid_reader_keys=True) 234 | 235 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 236 | if not fg: 237 | restful.abort(404) 238 | 239 | for feed in fg.feeds: 240 | app.inbox.put([0, "start", [key,feed.name]]) 241 | return {} 242 | 243 | class FeedGroupStop(restful.Resource): 244 | 245 | def post(self, groupname): 246 | key = auth(forbid_reader_keys=True) 247 | 248 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 249 | if not fg: 250 | restful.abort(404) 251 | 252 | for feed in fg.feeds: 253 | app.inbox.put([0, "stop", [key,feed.name]]) 254 | return {} 255 | 256 | class FeedGroupSearch(restful.Resource): 257 | 258 | def get(self, groupname, terms): 259 | """ 260 | Return articles on feeds in this group with our search terms in the title. 261 | """ 262 | key = auth() 263 | 264 | parser = restful.reqparse.RequestParser() 265 | parser.add_argument("page", type=int, default=1) 266 | parser.add_argument("per_page", type=int, default=10) 267 | # parser.add_argument("content", type=bool, default=None) 268 | args = parser.parse_args() 269 | 270 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 271 | if not fg: 272 | restful.abort(404) 273 | 274 | query = Article.query.filter( 275 | and_(Article.feed.has(group=fg), Article.title.like("%" + terms + "%")))\ 276 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 277 | return make_response(request.url, query) 278 | 279 | class FeedGroupCount(restful.Resource): 280 | 281 | def get(self, groupname): 282 | key = auth() 283 | 284 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 285 | if not fg: 286 | restful.abort(404) 287 | 288 | return sum(len(f.articles) for f in fg.feeds) 289 | -------------------------------------------------------------------------------- /emissary/resources/feeds.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | # This file provides the HTTP endpoints for operating on individual feeds 3 | from emissary import app, db 4 | from flask import request 5 | from flask.ext import restful 6 | from sqlalchemy import desc, and_ 7 | from emissary.models import Feed, FeedGroup, Article 8 | from emissary.resources.api_key import auth 9 | from emissary.controllers.cron import CronError, parse_timings 10 | from emissary.controllers.utils import make_response, gzipped, cors 11 | 12 | class FeedResource(restful.Resource): 13 | 14 | @cors 15 | @gzipped 16 | def get(self, groupname, name): 17 | """ 18 | Review a feed. 19 | """ 20 | key = auth() 21 | 22 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first() 23 | if feed: 24 | return feed.jsonify() 25 | restful.abort(404) 26 | 27 | @cors 28 | @gzipped 29 | def post(self, groupname, name): 30 | """ 31 | Modify an existing feed. 
32 | """ 33 | key = auth(forbid_reader_keys=True) 34 | 35 | parser = restful.reqparse.RequestParser() 36 | parser.add_argument("name", type=str) 37 | parser.add_argument("group", type=str) 38 | parser.add_argument("url", type=str) 39 | parser.add_argument("schedule", type=str) 40 | parser.add_argument("active", type=bool, default=None, help="Feed is active") 41 | args = parser.parse_args() 42 | 43 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first() 44 | if not feed: 45 | restful.abort(404) 46 | 47 | if args.name: 48 | if Feed.query.filter(and_(Feed.key == key, Feed.name == args.name)).first(): 49 | return {"message":"A feed already exists with this name."}, 304 50 | feed.name = args.name 51 | 52 | if args.group: 53 | pass 54 | 55 | if args.active != None: 56 | feed.active = args.active 57 | 58 | if args.url: 59 | feed.url = args.url 60 | 61 | if args.schedule: 62 | try: 63 | parse_timings(args.schedule) 64 | except CronError, err: 65 | return {"message": err.message}, 500 66 | feed.schedule = args.schedule 67 | 68 | db.session.add(feed) 69 | db.session.commit() 70 | 71 | if args.url or args.schedule: 72 | app.inbox.put([0, "stop", [feed.key, feed.name]]) 73 | app.inbox.put([0, "start", [feed.key, feed.name]]) 74 | 75 | return feed.jsonify() 76 | 77 | @cors 78 | @gzipped 79 | def delete(self, groupname, name): 80 | """ 81 | Halt and delete a feed. 82 | Default to deleting its articles. 83 | """ 84 | key = auth(forbid_reader_keys=True) 85 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first() 86 | if not feed: 87 | restful.abort(404) 88 | app.inbox.put([0, "stop", [key, feed.name]]) 89 | app.log('%s: %s: Deleting feed "%s".' % (feed.key.name, feed.group.name, feed.name)) 90 | for a in feed.articles: 91 | db.session.delete(a) 92 | 93 | db.session.delete(feed) 94 | db.session.commit() 95 | 96 | return {} 97 | 98 | class FeedArticleCollection(restful.Resource): 99 | 100 | @cors 101 | def get(self, groupname, name): 102 | """ 103 | Review the articles for a specific feed on this key. 104 | """ 105 | key = auth() 106 | 107 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first() 108 | if not feed: abort(404) 109 | 110 | parser = restful.reqparse.RequestParser() 111 | parser.add_argument("page", type=int, default=1) 112 | parser.add_argument("per_page", type=int, default=10) 113 | parser.add_argument("content", type=bool, default=None) 114 | args = parser.parse_args() 115 | 116 | # Return a list of the JSONified Articles ordered by descending creation date and paginated. 117 | if args.content == True: 118 | query = Article.query.filter(and_(Article.key == key, Article.content != None, Article.feed == feed))\ 119 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 120 | 121 | return make_response(request.url, query) 122 | 123 | elif args.content == False: 124 | query = Article.query.filter(and_(Article.key == key, Article.content == None, Article.feed == feed))\ 125 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 126 | 127 | return make_response(request.url, query) 128 | 129 | query = Article.query.filter(and_(Article.key == key, Article.feed == feed))\ 130 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 131 | 132 | return make_response(request.url, query) 133 | 134 | class FeedSearch(restful.Resource): 135 | 136 | @cors 137 | def get(self, groupname, name, terms): 138 | """ 139 | Search for articles within a feed. 
140 | """ 141 | key = auth() 142 | 143 | parser = restful.reqparse.RequestParser() 144 | parser.add_argument("page", type=int, default=1) 145 | parser.add_argument("per_page", type=int, default=10) 146 | # parser.add_argument("content", type=bool, default=None) 147 | args = parser.parse_args() 148 | 149 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first() 150 | if not fg: 151 | restful.abort(404) 152 | 153 | f = [f for f in fg.feeds if f.name == name] 154 | if not f: abort(404) 155 | 156 | f = f[0] 157 | 158 | query = Article.query.filter( 159 | and_(Article.feed == f, Article.title.like("%" + terms + "%")))\ 160 | .order_by(desc(Article.created)).paginate(args.page, args.per_page) 161 | 162 | return make_response(request.url, query) 163 | 164 | class FeedStartResource(restful.Resource): 165 | 166 | @cors 167 | def post(self, groupname, name): 168 | key = auth(forbid_reader_keys=True) 169 | 170 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first() 171 | if feed: 172 | app.inbox.put([0, "start", [key, feed.name]]) 173 | return feed.jsonify() 174 | restful.abort(404) 175 | 176 | class FeedStopResource(restful.Resource): 177 | 178 | @cors 179 | def post(self, groupname, name): 180 | key = auth(forbid_reader_keys=True) 181 | 182 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first() 183 | if feed: 184 | app.inbox.put([0, "stop", [key, feed.name]]) 185 | return feed.jsonify() 186 | restful.abort(404) 187 | 188 | -------------------------------------------------------------------------------- /emissary/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # _*_ coding: utf-8 _*_ 3 | 4 | # The reason we don't patch threading is because 5 | # our IPC queues rely on it for locking. We can't have them 6 | # be greenlets otherwise they will need the HTTPD to yeild 7 | # before data from the fetch process can be transmitted. 8 | from gevent import monkey; monkey.patch_all(thread=False) 9 | import gevent 10 | from gevent.queue import Queue 11 | from gevent.socket import socket 12 | from gevent.wsgi import WSGIServer 13 | 14 | import os 15 | import sys 16 | import pwd 17 | import time 18 | import signal 19 | import _socket 20 | import optparse 21 | from multiprocessing import Process 22 | 23 | from emissary import app, init, db 24 | from emissary.models import APIKey 25 | from emissary.controllers.log import Log 26 | from emissary.controllers.scripts import Scripts 27 | from emissary.controllers.load import parse_crontab 28 | from emissary.controllers.manager import FeedManager 29 | 30 | try: 31 | import setproctitle 32 | setproctitle.setproctitle("emissary") 33 | except ImportError: 34 | pass 35 | 36 | def Daemonise(pidfile): 37 | try: 38 | pid = os.fork() 39 | if pid > 0: 40 | sys.exit(0) # End parent 41 | except OSError, e: 42 | sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror)) 43 | sys.exit(-2) 44 | os.setsid() 45 | os.umask(0) 46 | try: 47 | pid = os.fork() 48 | if pid > 0: 49 | try: 50 | # TODO: Read the file first and determine if already running. 
 51 |                 f = file(pidfile, 'w')
 52 |                 f.write(str(pid))
 53 |                 f.close()
 54 |             except IOError, e:
 55 |                 # No logging module is imported here, so report on stderr.
 56 |                 sys.stderr.write(repr(e))
 57 |             sys.exit(0) # End parent
 58 |     except OSError, e:
 59 |         sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
 60 |         sys.exit(-2)
 61 |     for fd in (0, 1, 2): # Close stdio.
 62 |         try:
 63 |             os.close(fd)
 64 |         except OSError:
 65 |             pass
 66 | 
 67 | def export_crontab(filename):
 68 |     """
 69 |     Defined here to prevent circular imports.
 70 |     """
 71 |     crontab = ""
 72 |     fd = open(filename, "w")
 73 |     keys = [k for k in APIKey.query.all() if not k.reader]
 74 |     for key in keys:
 75 |         crontab += "apikey: %s\n\n" % key.key
 76 |         for feed in key.feeds:
 77 |             crontab += '%s "%s" "%s" %s\n' % (feed.url, feed.name, feed.group.name, feed.schedule)
 78 |         crontab += '\n\n'
 79 |     fd.write(crontab)
 80 |     fd.close()
 81 | 
 82 | if __name__ == "__main__":
 83 |     prog = "Emissary"
 84 |     description = "A microservice for archiving the news."
 85 |     epilog = "Psybernetics %s." % time.asctime().split()[-1]
 86 |     parser = optparse.OptionParser(prog=prog, version=app.version, description=description, epilog=epilog)
 87 | 
 88 |     parser.set_usage('python -m emissary.run [options]')
 89 |     parser.add_option("-c", "--crontab", dest="crontab", action="store", default=None, help="Crontab to parse")
 90 |     parser.add_option("--config", dest="config", action="store", default=None, help="(defaults to emissary.config)")
 91 |     parser.add_option("-a", "--address", dest="address", action="store", default='0.0.0.0', help="(defaults to 0.0.0.0)")
 92 |     parser.add_option("-p", "--port", dest="port", action="store", default='6362', help="(defaults to 6362)")
 93 |     parser.add_option("--key", dest="key", action="store", default=None, help="SSL key file")
 94 |     parser.add_option("--cert", dest="cert", action="store", default=None, help="SSL certificate")
 95 |     parser.add_option("--export", dest="export", action="store", default=False, help="Write the database out as a crontab to the given file")
 96 |     parser.add_option("--pidfile", dest="pidfile", action="store", default="emissary.pid", help="(defaults to ./emissary.pid)")
 97 |     parser.add_option("--logfile", dest="logfile", action="store", default="emissary.log", help="(defaults to ./emissary.log)")
 98 |     parser.add_option("--stop", dest="stop", action="store_true", default=False)
 99 |     parser.add_option("--debug", dest="debug", action="store_true", default=False, help="Log to stdout")
100 |     parser.add_option("-d", dest="daemonise", action="store_true", default=False, help="Run in the background")
101 |     parser.add_option("--run-as", dest="run_as", action="store", default=None, help="(defaults to the invoking user)")
102 |     parser.add_option("--scripts-dir", dest="scripts_dir", action="store", default="scripts", help="(defaults to ./scripts/)")
103 |     (options, args) = parser.parse_args()
104 | 
105 |     if options.config:
106 |         app.config.from_object(options.config)
107 | 
108 |     if options.crontab:
109 |         parse_crontab(options.crontab)
110 |         raise SystemExit
111 | 
112 |     app.debug = options.debug
113 | 
114 |     # Build the logger from command-line options.
115 |     log = Log("Emissary", log_file=options.logfile, log_stdout=not options.daemonise)
116 |     log.debug = options.debug
117 |     app.log = log
118 | 
119 |     log("Starting Emissary %s." % app.version)
120 | 
121 |     if options.stop:
122 |         pids = []
123 |         try:
124 |             f = file(options.pidfile, 'r')
125 |             pids = f.readline().split()
126 |             f.close()
127 |             os.unlink(options.pidfile)
128 |         except ValueError, e:
129 |             sys.stderr.write('Error in pid file "%s". Aborting\n' % options.pidfile)
130 |             sys.exit(-1)
131 |         except IOError, e:
132 |             pass
133 |         if pids:
134 |             for pid in pids:
135 |                 os.kill(int(pid), signal.SIGTERM)
136 |                 print "Killed process with ID %s." % pid
137 |         else:
138 |             sys.stderr.write('Emissary not running or no PID file found\n')
139 |         sys.exit(0)
140 | 
141 |     if options.export:
142 |         try:
143 |             export_crontab(options.export)
144 |             log('Crontab written to "%s".' % options.export)
145 |         except Exception, e:
146 |             log('Error writing crontab: %s' % e.message)
147 |         raise SystemExit
148 | 
149 | 
150 |     if not options.key and not options.cert:
151 |         print "SSL cert and key required. (--key and --cert)"
152 |         print "Keys and certs can be generated with:"
153 |         print "$ openssl genrsa 1024 > key"
154 |         print "$ openssl req -new -x509 -nodes -sha1 -days 365 -key key > cert"
155 |         raise SystemExit
156 | 
157 |     if '~' in options.cert: options.cert = os.path.expanduser(options.cert)
158 |     if '~' in options.key: options.key = os.path.expanduser(options.key)
159 | 
160 |     if not os.path.isfile(options.cert):
161 |         sys.exit("Certificate not found at %s" % options.cert)
162 | 
163 |     if not os.path.isfile(options.key):
164 |         sys.exit("Key not found at %s" % options.key)
165 | 
166 |     if os.getuid() == 0 and not options.run_as:
167 |         print "Running as root is not permitted.\nExecute this as a different user."
168 |         raise SystemExit
169 | 
170 |     sock = (options.address, int(options.port))
171 | 
172 |     if options.run_as: # Bind while still privileged, then drop to the target user.
173 |         sock = socket(family=_socket.AF_INET)
174 |         try:
175 |             sock.bind((options.address, int(options.port)))
176 |         except _socket.error:
177 |             ex = sys.exc_info()[1]
178 |             strerror = getattr(ex, 'strerror', None)
179 |             if strerror is not None:
180 |                 ex.strerror = strerror + ': ' + repr(options.address + ':' + options.port)
181 |             raise
182 |         sock.listen(50)
183 |         sock.setblocking(0)
184 |         uid = pwd.getpwnam(options.run_as)[2]
185 |         # setuid raises OSError if the invoking user lacks the privilege,
186 |         # which is fatal here, so let it propagate.
187 |         os.setuid(uid)
188 |         log("Now running as %s." % options.run_as)
189 | 
190 |     # Create the database schema and insert an administrative key
191 |     init()
192 | 
193 |     if options.daemonise: Daemonise(options.pidfile)
194 | 
195 |     # Load scripts
196 |     app.scripts = Scripts(options.scripts_dir)
197 |     app.scripts.reload()
198 | 
199 |     # Trap SIGHUP to reload scripts
200 |     signal.signal(signal.SIGHUP, app.scripts.reload)
201 | 
202 | 
203 |     # Initialise the feed manager with the logger, provide IPC access and load feeds.
204 |     fm = FeedManager(log)
205 |     fm.db = db
206 |     fm.app = app # Queue access
207 |     fm.load_feeds()
208 | 
209 |     # Start the REST interface
210 |     httpd = WSGIServer(sock, app, certfile=options.cert, keyfile=options.key)
211 |     httpd.loop.reinit()
212 |     httpd_process = Process(target=httpd.serve_forever)
213 |     log("Binding to %s:%s" % (options.address, options.port))
214 |     httpd_process.start()
215 | 
216 |     if options.daemonise:
217 |         f = file(options.pidfile, 'a')
218 |         f.write(' %i' % httpd_process.pid)
219 |         f.close()
220 | 
221 |     try:
222 |         fm.run()
223 |     except KeyboardInterrupt:
224 |         log("Stopping...")
225 |         httpd_process.terminate()
--------------------------------------------------------------------------------
/scripts/hello.py:
--------------------------------------------------------------------------------
 1 | # _*_ coding: utf-8 _*_
 2 | #
 3 | # This script creates a named pipe (if it doesn't exist)
 4 | # and writes the feed name, article title and url to it
 5 | # whenever an article is saved to the database.
 6 | #
 7 | # This is useful for composing systems that constantly read
 8 | # the FIFO and do things like emit the data to IRC channels.
 9 | #
10 | # You could, for instance, perform fuzzy pattern matching and be
11 | # notified when certain keywords are in the news.
12 | #
13 | # Transmission to a natural language processing/translation service
14 | # can also be done in a script or by reading a FIFO like the one here.
15 | #
16 | # Whether you use this system to profit, perform intelligence analysis
17 | # or inform your next vote is hopefully up to you!
18 | #
19 | # Luke Brooks, 2015
20 | # MIT License
21 | # Many big thanks to God, lord of universes.
22 | fifo = "/tmp/emissary.pipe"
23 | 
24 | import os, stat
25 | if not os.path.exists(fifo):
26 |     try:
27 |         os.mkfifo(fifo)
28 |     except Exception, e:
29 |         cache['app'].log("Error creating %s: %s" % (fifo, e.message))
30 | 
31 | # Emissary always executes scripts with an article and its feed in the namespace.
32 | 
33 | # There is also a dictionary named cache, containing the app object.
34 | # Through the app object you can access the logging interface and the feed manager.
35 | try:
36 |     # READER BEWARE: use non-blocking IO, or a write with no reader on the pipe will block the fetch and nothing will be stored.
37 |     fd = os.open(fifo, os.O_CREAT | os.O_WRONLY | os.O_NONBLOCK)
38 |     os.write(fd, "%s: %s\n%s\n" % (feed.name, article.title, article.url))
39 |     os.close(fd)
40 |     del fd
41 | except Exception, e: # Usually because the kernel knows of no reader fd on the pipe.
42 |     pass
43 | 
44 | del os, stat, fifo
45 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # _*_ coding: utf-8 _*_
 3 | from setuptools import setup, find_packages
 4 | import os
 5 | import imp
 6 | 
 7 | def non_python_files(path):
 8 |     """ Return all non-python-file filenames in path """
 9 |     result = []
10 |     all_results = []
11 |     module_suffixes = [info[0] for info in imp.get_suffixes()]
12 |     ignore_dirs = ['cvs']
13 |     for item in os.listdir(path):
14 |         name = os.path.join(path, item)
15 |         if (
16 |             os.path.isfile(name) and
17 |             os.path.splitext(item)[1] not in module_suffixes
18 |         ):
19 |             result.append(name)
20 |         elif os.path.isdir(name) and item.lower() not in ignore_dirs:
21 |             all_results.extend(non_python_files(name))
22 |     if result:
23 |         all_results.append((path, result))
24 |     return all_results
25 | 
26 | data_files = (
27 |     # non_python_files('emissary') +
28 |     # non_python_files(os.path.join('Emissary', 'doc'))
29 | )
30 | 
31 | setup(name='Emissary',
32 |       version="2.1.1",
33 |       description='A microservice for indexing the plain text of articles and essays',
34 |       author='Luke Brooks',
35 |       author_email='luke@psybernetics.org.uk',
36 |       url='http://psybernetics.org.uk/emissary',
37 |       download_url='https://github.com/LukeB42/Emissary/tarball/2.0.0',
38 |       data_files=data_files,
39 |       packages=['emissary', 'emissary.resources', 'emissary.controllers'],
40 |       include_package_data=True,
41 |       install_requires=[
42 |           "setproctitle",
43 |           "goose-extractor",
44 |           "lxml",
45 |           "gevent",
46 |           "Flask-RESTful",
47 |           "Flask-SQLAlchemy",
48 |           "cssselect",
49 |           "BeautifulSoup",
50 |           "feedparser",
51 |           "python-snappy",
52 |           "requests",
53 |           "pygments",
54 |           "window",
55 |       ],
56 |       keywords=["text extraction", "document archival", "document retrieval"]
57 | )
58 | 
--------------------------------------------------------------------------------
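
The FIFO convention documented in scripts/hello.py invites small standalone consumers. Below is a minimal sketch of one, not part of the repository: the pipe path matches the one hello.py creates, while the filename and keyword list are purely illustrative.

# fifo_consumer.py: a hypothetical companion to scripts/hello.py.
# Reads /tmp/emissary.pipe line by line and flags watched keywords.
import os

fifo = "/tmp/emissary.pipe"
keywords = ["election", "security"] # Illustrative watchlist.

fd = os.open(fifo, os.O_RDONLY) # Blocks until hello.py opens the write end.
buf = ""
while True:
    data = os.read(fd, 4096)
    if not data: # Writer closed the pipe; reopen and wait for the next article.
        os.close(fd)
        fd = os.open(fifo, os.O_RDONLY)
        continue
    buf += data
    while "\n" in buf:
        line, buf = buf.split("\n", 1)
        if any(k in line.lower() for k in keywords):
            print "Matched: %s" % line

A reader like this also needs to be running for hello.py to deliver anything at all: the script's non-blocking write fails silently whenever the kernel knows of no reader on the pipe.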
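
Since, per the comments in hello.py, scripts execute with feed, article, and a cache dictionary in their namespace, the keyword-notification idea mentioned there could equally live in a second script along these lines. The filename, watchlist, and log message are hypothetical, relying only on the namespace contract hello.py documents.

# scripts/watchlist.py: a hypothetical script using only the namespace
# contract documented in scripts/hello.py (feed, article, cache).
keywords = ["encryption", "privacy"] # Illustrative watchlist.

title = article.title.lower()
if any(keyword in title for keyword in keywords):
    # Log a notice through the app object held in the cache dictionary.
    cache['app'].log('Watchlist hit in "%s": %s (%s)' % (feed.name, article.title, article.url))

# Tidy the shared namespace, as hello.py does.
del keywords, title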