├── LICENCE
├── README.md
├── crontab
├── doc
│   ├── emissary2.png
│   ├── emissary3.png
│   ├── emissary4.png
│   └── emissary5.png
├── emissary
│   ├── __init__.py
│   ├── client.py
│   ├── config.py
│   ├── controllers
│   │   ├── __init__.py
│   │   ├── cron.py
│   │   ├── fetch.py
│   │   ├── load.py
│   │   ├── log.py
│   │   ├── manager.py
│   │   ├── parser.py
│   │   ├── scripts.py
│   │   ├── tui.py
│   │   └── utils.py
│   ├── models.py
│   ├── repl.py
│   ├── resources
│   │   ├── __init__.py
│   │   ├── api_key.py
│   │   ├── articles.py
│   │   ├── feedgroups.py
│   │   └── feeds.py
│   └── run.py
├── scripts
│   └── hello.py
└── setup.py
/LICENCE:
--------------------------------------------------------------------------------
1 | Permission is hereby granted, free of charge, to any person
2 | obtaining a copy of this software and associated documentation
3 | files (the "Software"), to deal in the Software without
4 | restriction, including without limitation the rights to use,
5 | copy, modify, merge, publish, distribute, sublicense, and/or sell
6 | copies of the Software, and to permit persons to whom the
7 | Software is furnished to do so, subject to the following
8 | conditions:
9 |
10 | The above copyright notice and this permission notice shall be
11 | included in all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
15 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
16 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
17 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 | OTHER DEALINGS IN THE SOFTWARE.
21 |
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Emissary
2 | ========
3 |
4 | An intelligence utility for researchers, programmers and generally carnivorous primates who want personally curated news archives.
5 | Emissary is a web content extractor that has a RESTful API and the ability to run pre-store scripts.
6 | Emissary stores the full text of linked articles from RSS feeds or URLs containing links.
7 |
8 | Documentation lives [here](http://docs.psybernetics.org/).
9 |
10 | --------
11 | 
12 | 
13 | 
14 |
15 |
16 | Installation requires the Python interpreter headers and the libevent, libxml2 and libxslt headers.
17 | Optional article compression requires libsnappy.
18 | All of these can be obtained on debian-based systems with:
19 | sudo apt-get install -y zlib1g-dev libxml2-dev libxslt1-dev python-dev libevent-dev libsnappy-dev
20 |
21 | You're then ready to install the package for all users:
22 | sudo python setup.py install
23 |
24 |
25 | Usage: python -m emissary.run
26 |
27 | -h, --help show this help message and exit
28 | -c, --crontab Crontab to parse
29 | --config (defaults to emissary.config)
30 | -a, --address (defaults to 0.0.0.0)
31 | -p, --port (defaults to 6362)
32 | --export Write the existing database as a crontab
33 | --key SSL key file
34 | --cert SSL certificate
35 | --pidfile (defaults to ./emissary.pid)
36 | --logfile (defaults to ./emissary.log)
37 | --stop
38 | --debug Log to stdout
39 | -d Run in the background
40 | --run-as (defaults to the invoking user)
41 | --scripts-dir (defaults to ./scripts/)
42 |
43 |
44 | Some initial setup has to be done before the system will start.
45 | Communication with Emissary is mainly done over HTTPS connections
46 | and for that you're going to need an SSL certificate and a key:
47 |
48 | user@host $ openssl genrsa 4096 > key
49 | user@host $ openssl req -new -x509 -nodes -sha256 -days 365 -key key > cert
50 |
51 | To prevent your API keys ever getting put into version control for all
52 | the world to see, you need to put a database URI into the environment:
53 |
54 | export EMISSARY_DATABASE="sqlite://///home/you/.emissary.db"
55 |
56 | Protip: Put that last line in your shell's rc file.
57 |
58 | Start an instance in the foreground to obtain your first API key:
59 |
60 | user@host $ python -m emissary.run --cert cert --key key
61 | 14/06/2015 16:31:30 - Emissary - INFO - Starting Emissary 2.0.0.
62 | e5a59e0a-b457-45c6-9d30-d983419c43e1
63 | ^That UUID is your Primary API key. Add it to this example crontab:
64 |
65 | user@host $ cat feeds.txt
66 | apikey: your-api-key-here
67 |
68 | # url name group minute hour day month weekday
69 | http://news.ycombinator.com/rss "HN" "HN" */15 * * * *
70 | http://phys.org/rss-feed/ "Phys.org" "Phys.org" 1 12 * * *
71 | http://feeds.nature.com/news/rss/most_recent "Nature" "Nature" 30 13 * * *
72 |
73 | user@host $ python -m emissary.run -c feeds.txt
74 | Using API key "Primary".
75 | Primary: Creating feed group HN.
76 | Primary: HN: Creating feed "HN"
77 |
78 | Emissary supports multiple apikey directives in one crontab.
79 | Subsequent feed definitions are associated with the previous key.
80 |
81 | Start an instance in the background and connect to it:
82 | user@host $ python -m emissary.run -d --cert cert --key key
83 | user@host $ python -m emissary.repl
84 | Emissary 2.0.0
85 | Psybernetics 2015
86 |
87 | (3,204) > help
88 |
89 |
90 |
91 | If the prospect of creating an NSA profile of your reading habits is
92 | something that rightfully bothers you, then my advice is to subscribe
93 | to many things and use Emissary to read only the things that really
94 | interest you.
95 |
96 | 
97 |
--------------------------------------------------------------------------------
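The multiple-apikey behaviour described above looks like this in practice; a
sketch of a two-key crontab (both keys are placeholders), where each feed line
is attached to the most recently declared key:

    apikey: first-key-here
    http://news.ycombinator.com/rss "HN" "HN" */15 * * * *

    apikey: second-key-here
    http://phys.org/rss-feed/ "Phys.org" "Phys.org" 1 12 * * *

--------------------------------------------------------------------------------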
/crontab:
--------------------------------------------------------------------------------
1 | apikey: your-api-key-here
2 |
3 | # url name group minute hour day month weekday
4 | http://news.ycombinator.com/rss "HN" "HN" 20! * * * *
5 | http://mf.feeds.reuters.com/reuters/UKdomesticNews "Reuters UK" "Reuters" 0 3! * * *
6 |
--------------------------------------------------------------------------------
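The "20!" and "3!" fields above are Emissary's step shorthand: per
parse_timings in emissary/controllers/cron.py, a field ending in "!" expands
to a range with that step, exactly like the "*/n" form. A minimal sketch:

    from emissary.controllers.cron import parse_timings

    minute, hour, day, month, weekday = parse_timings("20! * * * *")
    print minute  # [0, 20, 40] -- i.e. equivalent to "*/20"
    print hour    # the AllMatch universal set: matches every hour

--------------------------------------------------------------------------------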
/doc/emissary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary2.png
--------------------------------------------------------------------------------
/doc/emissary3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary3.png
--------------------------------------------------------------------------------
/doc/emissary4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary4.png
--------------------------------------------------------------------------------
/doc/emissary5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/doc/emissary5.png
--------------------------------------------------------------------------------
/emissary/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # The structure of this package is essentially as follows
4 | #
5 | # models.py Our abstractions for the types of data we persist to a database,
6 | # including how to represent columns and joins on other tables as singular
7 | # JSON documents. Handy for building list comprehensions of models.
8 | # resources/ RESTful API endpoints for interacting with models over HTTP
9 | # controllers/ Miscellaneous utilities used throughout the whole project
10 | # run.py A runner program that inserts a database schema if none is present,
11 | # binds to a network interface and changes UID if asked.
12 | # repl.py An interactive read-eval-print loop for working with the REST interface.
13 | # config.py Defines how to obtain a database URI.
14 | """
15 | A democracy thing for researchers, programmers and news junkies who want personally curated news archives.
16 | Emissary is a web content extractor that has a RESTful API and a scripting system.
17 | Emissary stores the full text of linked articles from RSS feeds or URLs containing links.
18 | """
19 |
20 | from pkgutil import extend_path
21 | __path__ = extend_path(__path__, __name__)
22 | __all__ = ["client", "controllers", "models", "resources", "run", "repl"]
23 |
24 | import time
25 | from flask import Flask
26 | from flask.ext import restful
27 | from flask.ext.sqlalchemy import SQLAlchemy
28 | from multiprocessing import Queue, cpu_count
29 | from sqlalchemy.engine.reflection import Inspector
30 |
31 | app = Flask("emissary")
32 |
33 | # This config is the default and can be overridden by
34 | # using options.config in run.py (python -m emissary.run -c somefile.py)
35 | app.config.from_object("emissary.config")
36 |
37 | app.version = "2.1.1"
38 | app.inbox = Queue()
39 | app.scripts = None
40 | app.feedmanager = None
41 | app.config["HTTP_BASIC_AUTH_REALM"] = "Emissary " + app.version
42 |
43 |
44 | # These are response queues that enable the main thread of execution to
45 | # share data with the REST interface. Mainly for reporting the status of crontabs.
46 | app.queues = []
47 | for i in range(cpu_count() * 2):
48 | q = Queue()
49 | q.access = time.time()
50 | app.queues.append(q)
51 |
52 | db = SQLAlchemy(app)
53 | api = restful.Api(app, prefix='/v1')
54 |
55 | def init():
56 | # Models are imported here to prevent a circular import where we would
57 | # import models and the models would import that db object just above us.
58 |
59 | # They're also imported here in this function because they implicitly
60 | # monkey-patch the threading module, and we might not need that if all we want
61 | # from the namespace is something like app.version, like in repl.py for example.
62 | from models import APIKey
63 | from models import FeedGroup
64 | from models import Feed
65 | from models import Article
66 | from models import Event
67 |
68 | from resources import api_key
69 | from resources import feeds
70 | from resources import feedgroups
71 | from resources import articles
72 |
73 | api.add_resource(api_key.KeyCollection, "/keys")
74 | api.add_resource(api_key.KeyResource, "/keys/<string:name>")
75 |
76 | api.add_resource(feedgroups.FeedGroupCollection, "/feeds")
77 | api.add_resource(feedgroups.FeedGroupResource, "/feeds/<string:groupname>")
78 | api.add_resource(feedgroups.FeedGroupStop, "/feeds/<string:groupname>/stop")
79 | api.add_resource(feedgroups.FeedGroupStart, "/feeds/<string:groupname>/start")
80 | api.add_resource(feedgroups.FeedGroupArticles, "/feeds/<string:groupname>/articles")
81 | api.add_resource(feedgroups.FeedGroupSearch, "/feeds/<string:groupname>/search/<string:terms>")
82 | api.add_resource(feedgroups.FeedGroupCount, "/feeds/<string:groupname>/count")
83 |
84 | api.add_resource(feeds.FeedResource, "/feeds/<string:groupname>/<string:name>")
85 | api.add_resource(feeds.FeedArticleCollection, "/feeds/<string:groupname>/<string:name>/articles")
86 | api.add_resource(feeds.FeedSearch, "/feeds/<string:groupname>/<string:name>/search/<string:terms>")
87 | api.add_resource(feeds.FeedStartResource, "/feeds/<string:groupname>/<string:name>/start")
88 | api.add_resource(feeds.FeedStopResource, "/feeds/<string:groupname>/<string:name>/stop")
89 |
90 | api.add_resource(articles.ArticleCollection, "/articles")
91 | api.add_resource(articles.ArticleResource, "/articles/<string:uid>")
92 | api.add_resource(articles.ArticleSearch, "/articles/search/<string:terms>")
93 | api.add_resource(articles.ArticleCount, "/articles/count")
94 |
95 | # Create the database schema if it's not already laid out.
96 | inspector = Inspector.from_engine(db.engine)
97 | tables = inspector.get_table_names()
98 |
99 | if 'api_keys' not in tables:
100 | db.create_all()
101 | master = APIKey(name = app.config['MASTER_KEY_NAME'])
102 | if app.config['MASTER_KEY']: master.key = app.config['MASTER_KEY']
103 | else: master.key = master.generate_key_str()
104 | print master.key
105 | master.active = True
106 | db.session.add(master)
107 | db.session.commit()
108 |
--------------------------------------------------------------------------------
/emissary/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pprint
3 | import json
4 | import cmd
5 | import os
6 | os.environ['no_proxy'] = '127.0.0.1,localhost'
7 | requests.packages.urllib3.disable_warnings()
8 |
9 | class Client(object):
10 | def __init__(self, key, base_url, verify=True, timeout=2.500):
11 | self.key = key
12 | self.base = base_url
13 | pp = pprint.PrettyPrinter(indent=4)
14 | self.p = pp.pprint
15 | self.verify_https = verify
16 | self.timeout = timeout
17 |
18 | # Defining a username manually on your client objects will
19 | # permit you to use the .can() shortcut for determining
20 | # the username's access rights.
21 | self.username = None
22 |
23 | if not self.base.endswith('/'):
24 | self.base += '/'
25 |
26 | def _send_request(self, url, type='GET', body={}, headers={}):
27 | headers['Authorization'] = "Basic %s" % self.key
28 | url = self.base+url
29 | resp = None
30 | if type=='GET':
31 | resp = requests.get(url, verify=self.verify_https,
32 | headers=headers, timeout=self.timeout)
33 | elif type=='DELETE':
34 | resp = requests.delete(url, verify=self.verify_https,
35 | data=body, headers=headers, timeout=self.timeout)
36 | elif type=='PUT':
37 | resp = requests.put(url, verify=self.verify_https,
38 | data=body, headers=headers, timeout=self.timeout)
39 | elif type=='POST':
40 | resp = requests.post(url, verify=self.verify_https,
41 | data=body, headers=headers, timeout=self.timeout)
42 | try: return resp.json(), resp.status_code
43 | except: return {}, resp.status_code
44 |
45 | def get(self, url, body={}, headers={}):
46 | return self._send_request(url, body=body, headers=headers)
47 |
48 | def put(self, url, body={}, headers={}):
49 | return self._send_request(url, type='PUT', body=body, headers=headers)
50 |
51 | def post(self, url, body={}, headers={}):
52 | return self._send_request(url, type='POST', body=body, headers=headers)
53 |
54 | def delete(self, url, body={}, headers={}):
55 | return self._send_request(url, type='DELETE', body=body, headers=headers)
56 |
57 | def pp(self, url, type='GET', body={}, headers={}):
58 | self.p(self._send_request(url, type, body, headers))
59 |
60 | def keys(self, type='GET', body={}, headers={}):
61 | return self._send_request("keys", type, body, headers)
62 |
63 | def __repr__(self):
64 | return "" % self.base
65 |
--------------------------------------------------------------------------------
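A minimal usage sketch for the Client above (assumes a local instance on
https://localhost:6362 with the self-signed certificate from the README, and
a placeholder API key; verify=False skips certificate validation accordingly):

    from emissary.client import Client

    c = Client("your-api-key-here", "https://localhost:6362/v1/", verify=False)
    response, status = c.get("articles")
    print status
    c.pp("feeds")  # pretty-print the (response, status) tuple for feed groups

--------------------------------------------------------------------------------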
/emissary/config.py:
--------------------------------------------------------------------------------
1 | import os, getpass
2 | if not 'EMISSARY_DATABASE' in os.environ:
3 | print 'You need to export a URI for EMISSARY_DATABASE'
4 | print 'Eg: export EMISSARY_DATABASE="sqlite://///home/%s/.emissary.db"' % getpass.getuser()
5 | raise SystemExit
6 | else:
7 | SQLALCHEMY_DATABASE_URI = (
8 | os.environ['EMISSARY_DATABASE']
9 | )
10 |
11 | MASTER_KEY = None
12 | MASTER_KEY_NAME = "Primary"
13 | PERMIT_NEW = False
14 | GZIP_HERE = True
15 | COMPRESS_ARTICLES = True
16 | ENABLE_CORS = False
17 | if "NO_DUPLICATE_TITLES" in os.environ:
18 | NO_DUPLICATE_TITLES = os.environ['NO_DUPLICATE_TITLES']
19 | else:
20 | NO_DUPLICATE_TITLES = True
21 |
--------------------------------------------------------------------------------
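Because run.py can point app.config.from_object at another module (the
--config option in the README), deployments can override these defaults
without editing this file. A sketch of a hypothetical myconfig.py, loaded
with python -m emissary.run --config myconfig:

    # myconfig.py -- overrides for a low-disk deployment (hypothetical).
    import os

    SQLALCHEMY_DATABASE_URI = os.environ['EMISSARY_DATABASE']
    MASTER_KEY = None
    MASTER_KEY_NAME = "Primary"
    PERMIT_NEW = False
    GZIP_HERE = True
    COMPRESS_ARTICLES = False  # skip snappy compression entirely
    ENABLE_CORS = False
    NO_DUPLICATE_TITLES = True

--------------------------------------------------------------------------------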
/emissary/controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/emissary/controllers/__init__.py
--------------------------------------------------------------------------------
/emissary/controllers/cron.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # From http://stackoverflow.com/questions/373335/suggestions-for-a-cron-like-scheduler-in-python
3 | import gevent
4 | import time, sys
5 | from datetime import datetime, timedelta
6 |
7 | class CronError(Exception):
8 | def __init__(self, message):
9 | self.message = message
10 | def __str__(self):
11 | return repr(self.message)
12 |
13 | class days:
14 | mon = 0
15 | tue = 1
16 | wed = 2
17 | thu = 3
18 | fri = 4
19 | sat = 5
20 | sun = 6
21 |
22 | class months:
23 | jan = 1
24 | feb = 2
25 | mar = 3
26 | apr = 4
27 | may = 5
28 | jun = 6
29 | jul = 7
30 | aug = 8
31 | sep = 9
32 | oct = 10
33 | nov = 11
34 | dec = 12
35 |
36 | # Turn a list of timing data into raw numeric values
37 | def parse_timings(timings):
38 | # minute hour day month weekday
39 | # 0 6,12 * 0-11 mon-sun
40 | # Currently contains off by one errors.
41 | if type(timings) == str:
42 | timings = timings.split()
43 | if len(timings) != 5:
44 | print len(timings), timings
45 | raise CronError('Timings require five fields.')
46 | minute = hour = day = month = weekday = []
47 | if timings[0] == '*': minute = allMatch # range(0,60)
48 | if timings[1] == '*': hour = allMatch # range(0,24)
49 | if timings[2] == '*': day = allMatch # range(0,32)
50 | if timings[3] == '*': month = allMatch # range(0,12)
51 | if timings[4] == '*': weekday = allMatch # range(0,7)
52 | for i, v in enumerate(timings):
53 | if len(v) < 3:
54 | try:
55 | r = int(v)
56 | if i == 0: minute = [r]
57 | if i == 1: hour = [r]
58 | if i == 2: day = [r]
59 | if i == 3: month = [r]
60 | if i == 4: weekday = [r]
61 | except:
62 | pass
63 | if ',' in v: # TODO: Incorporate lists of days and months.
64 | t = v.split(',')
65 | x=[]
66 | for f in t:
67 | x.append(int(f))
68 | if i == 0: minute = x
69 | if i == 1: hour = x
70 | if i == 2: day = x
71 | if i == 3: month = x
72 | if i == 4: weekday = x
73 | del t,f,x
74 | if v.endswith("!") or v.startswith("*/"):
75 | s = ""
76 | for j in v:
77 | if j.isdigit():
78 | s += j
79 | s = int(s)
80 | if i == 0: minute = range(0,60,s)
81 | if i == 1: hour = range(0,24,s)
82 | if i == 2: day = range(0,32,s)
83 | if i == 3: month = range(0,12,s)
84 | if i == 4: weekday = range(0,7,s)
85 | if '-' in v and len(v) > 2:
86 | r = v.split('-')
87 | for n,m in enumerate(r):
88 | try:
89 | r[n] = int(m)
90 | except:
91 | pass
92 | if type(r[n]) == int:
93 | if i == 0: minute = range(r[0],int(r[1])+1)
94 | if i == 1: hour = range(r[0],int(r[1])+1)
95 | if i == 2: day = range(r[0],int(r[1])+1)
96 | if i == 3: month = range(r[0],int(r[1])+1)
97 | if i == 4: weekday = range(r[0],int(r[1])+1)
98 | continue
99 | else:
100 | start = stop = None
101 | if i == 3: # Months
102 | if hasattr(months,r[0]):
103 | start = getattr(months,r[0])
104 | if hasattr(months,r[1]):
105 | stop = getattr(months,r[1])
106 | if start is not None and stop is not None:
107 | month = range(start,stop+1)
108 | del start, stop
109 | else:
110 | raise CronError('Malformed month data.')
111 | if i == 4: # Weekdays
112 | if hasattr(days,r[0]):
113 | start = getattr(days,r[0])
114 | if hasattr(days,r[1]):
115 | stop = getattr(days,r[1])
116 | if start is not None and stop is not None:
117 | weekday = range(start,stop+1)
118 | del start, stop
119 | else:
120 | raise CronError('Malformed day-of-the-week data.')
121 | del v,i,r,n,m,
122 | return minute, hour, day, month, weekday
123 |
124 | def parse_crontab_line(line,lineno=None,tcpd=False):
125 | url=line.split()[0]
126 | f=line.split()[1:]
127 | for i,w in enumerate(f):
128 | if w.endswith("'"): break
129 | name = ' '.join(f[:i+1]).strip("'")
130 | timings = ' '.join(f[i+1:]) # Minutes Hour Day Month Weekday
131 | parse_timings(timings)
132 | if not tcpd:
133 | if lineno:
134 | print "Line %s. %s: %s %s" % (lineno,name,url,timings)
135 | else:
136 | print "%s: %s %s" % (name,url,timings)
137 | return (url,name,timings)
138 |
139 | # Some utility classes / functions first
140 | class AllMatch(set):
141 | """Universal set - match everything"""
142 | def __contains__(self, item): return True
143 |
144 | allMatch = AllMatch()
145 |
146 | def conv_to_set(obj): # Allow single integer to be provided
147 | if isinstance(obj, (int,long)):
148 | return set([obj]) # Single item
149 | if not isinstance(obj, set):
150 | obj = set(obj)
151 | return obj
152 |
153 | class Event(object):
154 | def __init__(self, action, min=allMatch, hour=allMatch,
155 | day=allMatch, month=allMatch, dow=allMatch,
156 | args=(), kwargs={}):
157 | self.mins = conv_to_set(min)
158 | self.hours= conv_to_set(hour)
159 | self.days = conv_to_set(day)
160 | self.months = conv_to_set(month)
161 | self.dow = conv_to_set(dow)
162 | self.action = action
163 | self.args = args
164 | self.kwargs = kwargs
165 | self.running = False
166 | self.name = None
167 |
168 | def matchtime(self, t):
169 | """Return True if this event should trigger at the specified datetime"""
170 | return ((t.minute in self.mins) and
171 | (t.hour in self.hours) and
172 | (t.day in self.days) and
173 | (t.month in self.months) and
174 | (t.weekday() in self.dow))
175 |
176 | def check(self, t):
177 | if self.matchtime(t):
178 | self.running = True
179 | self.action(*self.args, **self.kwargs)
180 | self.running = False
181 |
182 | class CronTab(gevent.Greenlet):
183 | def __init__(self, *events):
184 | self.events = events
185 | self.name = None
186 | gevent.Greenlet.__init__(self)
187 |
188 | def _run(self):
189 | t=datetime(*datetime.now().timetuple()[:5])
190 | while 1:
191 | for e in self.events:
192 | # print zip([i for i in dir(self)], [getattr(self,i) for i in dir(self)])
193 | if self.inbox: # This .get() blocks, preventing duplicate greenlets running
194 | msg = self.inbox.get() # in the same addr due to our use of multiprocessing.Process
195 | e.check(t)
196 | t += timedelta(minutes=1)
197 | n = datetime.now()
198 | while n < t:
199 | s = (t - n).seconds + 1
200 | time.sleep(s)
201 | n = datetime.now()
202 |
203 | def __repr__(self):
204 | if self.name:
205 | return "" % (self.name, hex(id(self)))
206 | else:
207 | return "" % hex(id(self))
208 |
209 | def parse_crontab(db,log):
210 | table = db['feeds']
211 |
212 | crontab = sys.stdin.read()
213 | feedlines={}
214 |
215 | for index, line in enumerate(crontab.split('\n')):
216 | if line.startswith('http'):
217 | index+=1
218 | feedlines['%s' % index] = line
219 | elif (line.startswith('#')) or (line == ''): continue
220 | else: print Utils.parse_option(line,config)
221 |
222 | for lineno, feedline in feedlines.items():
223 | url=name=timings=None
224 | try:
225 | (url,name,timings) = Cron.parse_crontab_line(feedline,lineno)
226 | except EmissaryError, e:
227 | print e
228 |
229 | if url and name and timings:
230 | # Check URL isn't already loaded
231 | feed = Feed.Feed(db,log,url=url)
232 | if 'name' in feed.feed.keys():
233 | if name != feed['name'] or timings != feed['timings']:
234 | feed.adjust(name,timings)
235 | sys.stdout.write("Adjusted %s: %s\n" % (name,feed.feed))
236 | else:
237 | sys.stdout.write('Adding %s\n' % name)
238 | feed = Feed.Feed(db,log).create(name,url,timings)
239 |
240 | raise SystemExit
241 |
242 | #if __name__ == '__main__':
243 | # c = CronTab(Event(lambda: sys.stdout.write("Hello\n"), range(0,60), range(0,24), dow=range(0,5)))
244 | # c.run()
245 |
246 |
--------------------------------------------------------------------------------
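Putting parse_timings, Event and CronTab together; a minimal sketch (the
FeedManager normally does this wiring, including the inbox Queue that
CronTab._run blocks on and the periodic "ping" that unblocks it):

    import sys
    import gevent
    from gevent.queue import Queue
    from emissary.controllers.cron import CronTab, Event, parse_timings

    minute, hour, day, month, weekday = parse_timings("*/15 * * * *")
    evt = Event(lambda: sys.stdout.write("tick\n"),
                minute, hour, day, month, weekday)
    ct = CronTab(evt)
    ct.inbox = Queue()  # _run() consumes from this between event checks

    def ping():  # stand-in for FeedManager.run's ping loop
        while True:
            if ct.inbox.empty():
                ct.inbox.put("ping")
            gevent.sleep(1)

    gevent.spawn(ping)
    ct.start()          # CronTab is a gevent.Greenlet
    gevent.sleep(120)   # let it run for a while

--------------------------------------------------------------------------------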
/emissary/controllers/fetch.py:
--------------------------------------------------------------------------------
1 | import time
2 | import urlparse
3 | import requests
4 | import feedparser
5 | from emissary import app, db
6 | from sqlalchemy import and_, or_
7 | from emissary.models import Article
8 | from emissary.controllers import parser
9 | from emissary.controllers.utils import uid, tconv
10 | requests.packages.urllib3.disable_warnings()
11 |
12 | snappy = None
13 | if app.config['COMPRESS_ARTICLES']:
14 | try:
15 | import snappy
16 | except ImportError:
17 | app.config['COMPRESS_ARTICLES'] = False # snappy missing; store uncompressed
18 |
19 |
20 | # This is a little globally-available (as far as coroutines calling this are concerned)
21 | # dictionary of urls we've already visited. It permits us to only try a url
22 | # four times every half an hour. If we see it again after half an hour we'll
23 | # try it again, otherwise it stays in the seen dictionary. It also needs periodically
24 | # emptying, lest it grow infinitely.
25 | seen = {}
26 |
27 | def get(url):
28 | headers = {"User-Agent": "Emissary "+ app.version}
29 | return requests.get(url, headers=headers, verify=False)
30 |
31 | # Fetch a feed.url, parse the links, visit the links and store articles.
32 | def fetch_feed(feed, log):
33 |
34 | if feed.group:
35 | log("%s: %s: Fetching %s." % \
36 | (feed.key.name, feed.group.name, feed.name))
37 | else:
38 | log("%s: Fetching %s." % (feed.key.name, feed.name))
39 | try:
40 | r = get(feed.url)
41 | except Exception, e:
42 | log("%s: %s: Error fetching %s: %s" % \
43 | (feed.key.name, feed.group.name, feed.name, e.message[0]))
44 | return
45 |
46 | # Fetch the links and create articles
47 | links = parser.extract_links(r)
48 | title = None
49 | for link in links:
50 | # try:
51 | fetch_and_store(link, feed, log)
52 | # except Exception, e:
53 | # log("%s: %s: Error with %s: %s" % \
54 | # (feed.key.name, feed.name, link, e.message), "error")
55 |
56 | def fetch_and_store(link, feed, log, key=None, overwrite=False):
57 | """
58 | Fetches, extracts and stores a URL.
59 | link can be a list of urls or a dictionary of url/title pairs.
60 | """
61 | then = int(time.time())
62 | # If the feed was XML data then we probably have a dictionary of
63 | # url:title pairs, otherwise we have a list of urls.
64 | if type(link) == dict:
65 | for url, title in link.items(): continue
66 | else:
67 | url = link
68 | title = None
69 |
70 | # Skip this url if we've already extracted and stored it for this feed, unless we're overwriting.
71 | if Article.query.filter(and_(Article.url == url, Article.feed == feed)).first():
72 | if overwrite:
73 | log("%s: %s/%s: Preparing to overwrite existing copy of %s" % \
74 | (feed.key.name, feed.group.name, feed.name, url), "debug")
75 | else:
76 | log("%s: %s/%s: Already storing %s" % (feed.key.name, feed.group.name, feed.name, url), "debug")
77 | return
78 |
79 | # Fix links with no schema
80 | if not "://" in url:
81 | url = "http://" + url
82 |
83 | # Store our awareness of this url during this run in a globally available dictionary,
84 | # in the form [counter, timestamp].
85 | if url not in seen:
86 | seen[url] = [1, int(time.time())]
87 | else:
88 | # If we haven't modified the counter for half an hour, reset it.
89 | now = int(time.time())
90 | if (now - seen[url][1]) > 60*30:
91 | seen[url] = [1, int(time.time())]
92 | # If we have tried this URL four times, disregard it.
93 | # We might reset its counter in half an hour anyway.
94 | if seen[url][0] >= 4:
95 | return
96 | # Otherwise increment and continue with storing.
97 | seen[url][0] += 1
98 | seen[url][1] = int(time.time())
99 |
100 | # Prune seen URLs older than a day.
101 | for _ in seen.copy():
102 | if int(time.time()) - seen[_][1] > 86400:
103 | del seen[_]
104 |
105 | try:
106 | document = get(url)
107 | except Exception, e:
108 | log("%s: %s/%s: Error fetching %s: %s" % \
109 | (feed.key.name, feed.group.name, feed.name, url, e.message[0]))
110 | return
111 |
112 | # Mimetype detection.
113 | if 'content-type' in document.headers:
114 | if 'application' in document.headers['content-type']:
115 | if not title:
116 | title = url
117 | article = Article(
118 | url=url,
119 | title=title,
120 | )
121 | if not "://" in article.url:
122 | article.url = "http://" + article.url
123 | commit_to_feed(feed, article)
124 | log("%s: %s/%s: Stored %s, reference to %s (%s)" % \
125 | (feed.key.name, feed.group.name, feed.name, article.uid, url, document.headers['content-type']))
126 | return
127 |
128 | # Document parsing.
129 | try:
130 | article_content = parser.extract_body(document.text)
131 | summary = parser.summarise(article_content)
132 | except Exception, e:
133 | log("%s: %s: Error parsing %s: %s" % (feed.key.name, feed.group.name, url, e.message))
134 | return
135 |
136 | # Ensure a title and disregard dupes
137 | if not title:
138 | title = parser.extract_title(document.text)
139 |
140 | if app.config['NO_DUPLICATE_TITLES']:
141 | if Article.query.filter(
142 | and_(Article.title == title, Article.key == feed.key)
143 | ).first():
144 | return
145 |
146 | # Initial article object
147 | article = Article(
148 | url=url,
149 | title=title,
150 | summary=summary
151 | )
152 |
153 | # Determine whether to store the full content or a compressed copy
154 | if not app.config['COMPRESS_ARTICLES']:
155 | article.content=article_content
156 | else:
157 | article.ccontent = snappy.compress(article_content.encode("utf-8", "ignore"))
158 | article.compressed = True
159 |
160 | #
161 | # We execute scripts before committing articles to the database
162 | # it runs the risk of a singular script halting the entire thing
163 | # in return we get to modify articles (ie machine translation) before storing.
164 |
165 | # Non-blocking IO will result in the most reliable performance within your scripts.
166 | #
167 | for s in app.scripts.scripts.values():
168 | try:
169 | s.execute(env={'article':article, 'feed':feed})
170 | article = s['article']
171 | except Exception, e:
172 | log("Error executing %s: %s" % (s.file, e.message), "error")
173 |
174 | commit_to_feed(feed, article)
175 |
176 | now = int(time.time())
177 | duration = tconv(now-then)
178 | log('%s: %s/%s: Stored %s "%s" (%s)' % \
179 | (feed.key.name, feed.group.name, feed.name, article.uid, article.title, duration))
180 | del then, now, duration, feed, article, url, title
181 | return
182 |
183 | def fetch_feedless_article(key, url, overwrite=False):
184 | """
185 | Given a URL, create an Article and attach it to a Key.
186 | """
187 | then = int(time.time())
188 | log = app.log
189 |
190 | if Article.query.filter(Article.url == url).first():
191 | if overwrite:
192 | log("%s: Preparing to overwrite existing copy of %s" % (key.name,url), "debug")
193 | else:
194 | log("%s: Already storing %s" % (key.name, url), "debug")
195 | return
196 |
197 | try:
198 | response = get(url)
199 | except Exception, e:
200 | log("%s: Error fetching %s: %s." % (key.name, url, e.message))
201 | return
202 |
203 | article_content = parser.extract_body(response.text)
204 | title = parser.extract_title(response.text)
205 | summary = parser.summarise(article_content)
206 | article = Article(
207 | url=url,
208 | title=title,
209 | summary=summary
210 | )
211 |
212 | if not app.config['COMPRESS_ARTICLES']:
213 | article.content = article_content
214 | else:
215 | article.ccontent = snappy.compress(article_content.encode("utf-8", "ignore"))
216 | article.compressed = True
217 |
218 | for s in app.scripts.scripts.values():
219 | try:
220 | s.execute(env={'article':article, 'feed':None})
221 | article = s['article']
222 | except Exception, e:
223 | log("Error executing %s: %s" % (s.file, e.message), "error")
224 |
225 | key.articles.append(article)
226 |
227 | article.uid = uid()
228 |
229 | db.session.add(article)
230 | db.session.add(key)
231 | db.session.commit()
232 |
233 | now = int(time.time())
234 | duration = tconv(now-then)
235 | log('%s: Stored %s "%s" (%s)' % (key.name, article.uid, article.title, duration))
236 | return article
237 |
238 | def commit_to_feed(feed, article):
239 | """
240 | Place a new article on the api key of a feed, the feed itself,
241 | and commit changes.
242 | """
243 |
244 | # We give articles UIDs manually to ensure unique time data is used.
245 | article.uid = uid()
246 |
247 | session = feed._sa_instance_state.session
248 | feed.articles.append(article)
249 | feed.key.articles.append(article)
250 |
251 | session.add(article)
252 | session.add(feed)
253 | session.commit()
254 | del article, feed, session
255 |
--------------------------------------------------------------------------------
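The seen-dictionary policy in fetch_and_store, restated standalone for
clarity: a URL gets at most four attempts per half hour, its counter resets
after thirty quiet minutes, and entries older than a day are pruned. A sketch:

    import time

    seen = {}  # url: [attempts, last-attempt timestamp]

    def should_fetch(url):
        now = int(time.time())
        # Prune entries older than a day.
        for u in seen.copy():
            if now - seen[u][1] > 86400:
                del seen[u]
        # New URL, or quiet for half an hour: (re)start the counter.
        if url not in seen or now - seen[url][1] > 60 * 30:
            seen[url] = [1, now]
            return True
        # Four strikes within the window and it's disregarded.
        if seen[url][0] >= 4:
            return False
        seen[url][0] += 1
        seen[url][1] = now
        return True

--------------------------------------------------------------------------------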
/emissary/controllers/load.py:
--------------------------------------------------------------------------------
1 | # This file contains functions designed for
2 | # loading cron tables and storing new feeds.
3 |
4 | from emissary import db
5 | from sqlalchemy import and_
6 | from emissary.controllers.utils import spaceparse
7 | from emissary.controllers.cron import parse_timings
8 | from emissary.models import APIKey, Feed, FeedGroup
9 |
10 | def create_feed(log, db, key, group, feed):
11 | """
12 | Takes a key object, a group name and a dictionary
13 | describing a feed ({name:,url:,schedule:,active:})
14 | and reliably attaches a newly created feed to the key
15 | and group.
16 | """
17 | if not type(feed) == dict:
18 | log('Unexpected type when creating feed for API key "%s"' % key.name)
19 | return
20 |
21 | for i in ['name', 'schedule', 'active', 'url']:
22 | if not i in feed.keys():
23 | log('%s: Error creating feed. Missing "%s" field from feed definition.' % (key.name, i))
24 | return
25 |
26 | f = Feed.query.filter(and_(Feed.key == key, Feed.name == feed['name'])).first()
27 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == group)).first()
28 |
29 | if f:
30 | if f.group:
31 | log('%s: Error creating feed "%s" in group "%s", feed already exists in group "%s".' % \
32 | (key.name, feed['name'], group, f.group.name))
33 | return
34 | elif fg:
35 | log('%s: %s: Adding feed "%s"' % (key.name, fg.name, f.name))
36 | fg.feeds.append(f)
37 | db.session.add(fg)
38 | db.session.add(f)
39 | db.session.commit()
40 | return
41 |
42 | if not fg:
43 | log('%s: Creating feed group %s.' % (key.name, group))
44 | fg = FeedGroup(name=group)
45 | key.feedgroups.append(fg)
46 |
47 | try:
48 | parse_timings(feed['schedule'])
49 | except Exception, e:
50 | log('%s: %s: Error creating "%s": %s' % (key.name, fg.name, feed['name'], e.message))
51 | return
52 |
53 | log('%s: %s: Creating feed "%s"' % (key.name, fg.name, feed['name']))
54 | f = Feed(
55 | name=feed['name'],
56 | url=feed['url'],
57 | active=feed['active'],
58 | schedule=feed['schedule']
59 | )
60 | fg.feeds.append(f)
61 | key.feeds.append(f)
62 | db.session.add(key)
63 | db.session.add(fg)
64 | db.session.add(f)
65 | db.session.commit()
66 |
67 | def parse_crontab(filename):
68 | """
69 | Get a file descriptor on filename and
70 | create feeds and groups for API keys therein.
71 | """
72 | def log(message):
73 | print message
74 | # read filename into a string named crontab
75 | try:
76 | fd = open(filename, "r")
77 | except IOError:
78 | print "Error opening %s" % filename
79 | raise SystemExit
80 | crontab = fd.read()
81 | fd.close()
82 |
83 | # keep a resident api key on hand
84 | key = None
85 |
86 | for i, line in enumerate(crontab.split('\n')):
87 |
88 | # Set the APIKey we're working with when we find a line starting
89 | # with apikey:
90 | if line.startswith("apikey:"):
91 | if ' ' in line:
92 | key_str = line.split()[1]
93 | key = APIKey.query.filter(APIKey.key == key_str).first()
94 | if not key:
95 | print 'Malformed or unknown API key at line %i in %s: %s' % (i+1, filename, line)
96 | raise SystemExit
97 | else:
98 | print 'Using API key "%s".' % key.name
99 |
100 | if line.startswith("http"):
101 | feed = {'active': True}
102 |
103 | # Grab the URL and set the string to the remainder
104 | feed['url'] = line.split().pop(0)
105 | line = ' '.join(line.split()[1:])
106 |
107 | # Grab names and groups
108 | names = spaceparse(line)
109 | if not names:
110 | print "Error parsing feed or group name at line %i in %s: %s" % (i+1, filename, line)
111 | continue
112 | feed['name'], group = names[:2]
113 |
114 | # The schedule should be the last five items
115 | schedule = line.split()[-5:]
116 | try:
117 | parse_timings(schedule)
118 | except Exception, e:
119 | print "Error parsing schedule at line %i in %s: %s" % (i+1, filename, e.message)
120 | continue
121 |
122 | feed['schedule'] = ' '.join(schedule)
123 |
124 | create_feed(log, db, key, group, feed)
125 |
--------------------------------------------------------------------------------
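What create_feed expects, shown directly; a sketch that bypasses the crontab
parser (assumes init() has run and at least one API key exists):

    from emissary import db
    from emissary.models import APIKey
    from emissary.controllers.load import create_feed

    def log(message):
        print message

    key = APIKey.query.first()
    create_feed(log, db, key, "HN", {
        'name': 'HN',
        'url': 'http://news.ycombinator.com/rss',
        'schedule': '*/15 * * * *',
        'active': True,
    })

--------------------------------------------------------------------------------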
/emissary/controllers/log.py:
--------------------------------------------------------------------------------
1 | """
2 | This file provides a generic logging class.
3 | It could do with automatic file rotation and syslog support.
4 |
5 | Luke Brooks 2015
6 | MIT License.
7 | """
8 | import logging, time
9 |
10 | class Log(object):
11 | def __init__(self, program, log_file=None, log_stdout=False):
12 | self.program = program
13 | self.log = None
14 | self.debug = False
15 |
16 | if log_file or log_stdout:
17 | formatter = logging.Formatter(
18 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%d/%m/%Y %H:%M:%S'
19 | )
20 | self.log = logging.getLogger(program)
21 | self.log.setLevel(logging.DEBUG)
22 |
23 | if log_stdout:
24 | ch = logging.StreamHandler()
25 | ch.setLevel(logging.DEBUG)
26 | ch.setFormatter(formatter)
27 | self.log.addHandler(ch)
28 |
29 | if log_file:
30 | ch = logging.FileHandler(log_file, 'a')
31 | ch.setLevel(logging.DEBUG)
32 | ch.setFormatter(formatter)
33 | self.log.addHandler(ch)
34 |
35 | def __call__(self, data, level='info'):
36 | if self.log:
37 | if level == 'debug': level = 10
38 | if level == 'info': level = 20
39 | if level == 'warning': level = 30
40 | if level == 'error': level = 40
41 | if level == 'critical': level = 50
42 |
43 | if (level > 15) or (self.debug):
44 | self.log.log(level,data)
45 |
--------------------------------------------------------------------------------
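A minimal usage sketch for the Log class above:

    from emissary.controllers.log import Log

    log = Log("Emissary", log_file="./emissary.log", log_stdout=True)
    log("Feeds loaded.")              # 'info' is the default level
    log("Verbose detail.", "debug")   # suppressed until .debug is set
    log.debug = True
    log("Now visible.", "debug")

--------------------------------------------------------------------------------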
/emissary/controllers/manager.py:
--------------------------------------------------------------------------------
1 | from gevent.queue import Queue
2 | import sys, os, time, pwd, optparse, gevent, hashlib
3 |
4 | from sqlalchemy import and_
5 | from emissary.models import Feed, FeedGroup, APIKey
6 | from emissary.controllers import cron
7 | from emissary.controllers import fetch
8 |
9 | class EmissaryError(Exception):
10 | def __init__(self, message):
11 | self.message = message
12 | def __str__(self):
13 | return repr(self.message)
14 |
15 | class FeedManager(object):
16 | """Keeps CronTab objects in rotation"""
17 | def __init__(self, log):
18 | self.log = log
19 | self.app = None
20 | self.running = False
21 | self.crontabs = {}
22 | self.threads = []
23 | self.revived = {} # {name: [amt, time]}
24 |
25 | def load_feeds(self):
26 | """
27 | Currently just starts all feeds flat, by checking if they and their
28 | FeedGroup are active.
29 |
30 |
31 | TODO: Start feeds by API key. Where each CronTab corresponds to a FeedGroup.
32 | """
33 | for key in APIKey.query.all():
34 |
35 | if key.reader:
36 | continue
37 |
38 | if not key.active:
39 | self.log('API key "%s" marked inactive. Skipped.' % (key.name))
40 | continue
41 |
42 | self.log("%s: Processing feed groups." % key.name)
43 | for fg in key.feedgroups:
44 |
45 | if not fg.active:
46 | self.log('%s: Feed group "%s" marked inactive. Skipped.' % \
47 | (key.name, fg.name))
48 | continue
49 |
50 | for feed in fg.feeds:
51 | if not feed.active:
52 | self.log('%s:%s: Feed "%s" marked inactive. Skipped.' % \
53 | (key.name, fg.name, feed.name))
54 | continue
55 |
56 | self.log('%s: %s: Scheduling "%s" (%s)' % \
57 | (key.name, fg.name, feed.name, feed.schedule))
58 |
59 | ct = self.create_crontab(feed)
60 | g = gevent.spawn(ct.run)
61 | g.name = ct.name
62 | self.threads.append(g)
63 | name = self.generate_ct_name(feed)
64 | self.crontabs[name] = ct
65 |
66 | def run(self):
67 | """
68 | Receive inbox messages and revive feeds.
69 | Also block duplicate crontab execution.
70 |
71 | The reason we do this is due to a quirk of
72 | using Gevent with multiprocessing.Process.
73 |
74 | It's why obtaining the article count in the REPL prompt
75 | takes a second, but the tradeoff is that Emissary won't
76 | overutilise your CPU in this loop.
77 |
78 | If you run a greenlet in a subprocess we end up with
79 | CronTab greenlets executing twice but in the same address space...
80 | So I've settled on this solution for now after investigating GIPC,
81 | which works with Flask's built in httpd, but that's not as nimble
82 | as gevent.WSGIServer.
83 | """
84 | self.running = True
85 | while self.running:
86 | while not self.app.inbox.empty():
87 | self.receive(self.app.inbox.get(block=False))
88 | # Run feeds
89 | gevent.sleep()
90 | for ct in self.crontabs.values():
91 | if ct.inbox.empty():
92 | ct.inbox.put("ping")
93 | # Check if revive needed
94 | self.revive(ct)
95 | for i in self.threads:
96 | if i.started == False:
97 | self.threads.remove(i)
98 | # the sleep for 50ms keeps cpu utilisation low
99 | gevent.sleep()
100 | time.sleep(0.05)
101 | self.log("Cleaning up..")
102 |
103 | def create_crontab(self, feed):
104 | t = cron.parse_timings(feed.schedule.split())
105 | evt = cron.Event( # One possible design for these crontabs
106 | fetch.fetch_feed, # is to have them correspond to a FeedGroup
107 | t[0], t[1], t[2], t[3], t[4],# where each event is a member feed
108 | [feed, self.log]) # and stopping the crontab stops the group.
109 | evt.feed = feed
110 | ct = cron.CronTab(evt)
111 | ct.name = self.generate_ct_name(feed)
112 | ct.inbox = Queue()
113 | return ct
114 |
115 | def generate_ct_name(self, feed):
116 | """
117 | Generate a crontab name from a feed object that's
118 | hopefully unique between multiple feeds in multiple groups
119 | on multiple API keys.
120 |
121 | Determining the feed.key.key string here proved to be too expensive,
122 | so instead it's trusted that the name and creation time are unique enough.
123 |
124 | Improvements to this implementation are most welcome.
125 | """
126 | return hashlib.sha1("%s %s" % (feed.name, feed.created)).hexdigest()
127 |
128 | def revive(self, ct):
129 | """
130 | Restart a dead crontab.
131 | Permit a ceiling amount of restarts.
132 | Only restart a feed once per minute.
133 | """
134 | if ct.name in self.revived:
135 | now = time.time()
136 | then = self.revived[ct.name][1]
137 | if (now - then) < 60:
138 | return
139 | self.revived[ct.name][0] += 1
140 | self.revived[ct.name][1] = now
141 | else:
142 | self.revived[ct.name] = [1, time.time()]
143 |
144 | if ct.started == False:
145 | feed = ct.events[0].feed
146 | ct = self.create_crontab(feed)
147 | self[ct.name] = ct
148 | gevent.spawn(ct.run)
149 | # if feed.name in self.crontabs.keys():
150 | # self.log("Restarting %s" % ct.name, "warning")
151 |
152 | # name = self.generate_ct_name(feed)
153 | # self.crontabs[name] = ct
154 | # self.log(self.crontabs)
155 |
156 | def receive(self, payload):
157 | """
158 | The Feed manager is an actor with an inbox that responds to commands
159 | issued by the HTTPD process. We accept a list containing a queue ID,
160 | a command name that corresponds to FeedManager.handle_<command>, and
161 | arguments, even if it's just a None.
162 | """
163 | if type(payload) != list or len(payload) != 3: return
164 | qid, command, args = payload
165 | func = getattr(self, "handle_" + command, None)
166 | # Execute on messages with a Queue ID of zero without emitting a response
167 | if func and not qid: return(func(args))
168 | # Otherwise, use response queues based on access times
169 | elif func:
170 | # We do a double comparison here in order to sort the queue out of the loop
171 | q = [q for q in self.app.queues if hex(id(q)) == qid]
172 | if not q:
173 | self.log("Couldn't find response queue at %s." % id)
174 | return
175 | q=q[0]
176 | # Put our response on the queue and rotate its priority.
177 | try:
178 | q.put(func(args))
179 | except Exception,e:
180 | self.app.log(e.message,'warning')
181 | q.access = time.time()
182 | self.app.queues.sort(key=lambda q: q.access, reverse=True)
183 | return
184 | return
185 |
186 | def handle_check(self, feed):
187 | """
188 | Return whether we have a feed running or not.
189 | """
190 | name = self.generate_ct_name(feed)
191 | if name in self.crontabs and self.crontabs[name].started:
192 | return True
193 | return False
194 |
195 | def handle_start(self, args):
196 | """
197 | Schedule a feed.
198 |
199 | We look the feed up here because for some reason freshly
200 | created ones aren't great at journeying over IPC queues.
201 | """
202 | key, name = args
203 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first()
204 | if not feed: return
205 |
206 | self.app.log('%s: %s: Scheduling "%s" (%s)' % \
207 | (key.name, feed.group.name, feed.name, feed.schedule))
208 | ct = self.create_crontab(feed)
209 | self.crontabs[ct.name] = ct
210 | g = gevent.spawn(ct.run)
211 | g.name = ct.name
212 | self.threads.append(g)
213 | return True
214 |
215 | def handle_stop(self, args):
216 | """
217 | Halt a feed.
218 |
219 | We can't look the feed up from the database here because we may have
220 | already deleted it from our records, so instead we iterate through
221 | all of our green threads until something sticks.
222 | """
223 | key, name = args
224 |
225 | for id, ct in self.crontabs.items():
226 | feed = ct.events[0].feed
227 | if feed.name == name and feed.key.key == key.key:
228 | if self.app.debug:
229 | self.app.log('%s: %s: Unscheduling "%s". [thread %s]' % \
230 | (key.name, feed.group.name, feed.name, id))
231 | else:
232 | self.app.log('%s: %s: Unscheduling "%s".' % \
233 | (key.name, feed.group.name, feed.name))
234 | for t in self.threads:
235 | if t.name == id:
236 | gevent.kill(t)
237 | break
238 | self.threads.remove(t)
239 | del ct
240 | del self.crontabs[id]
241 | return True
242 | return False
243 |
244 | def __setitem__(self, name, crontab):
245 | if name in self.crontabs.keys():
246 | if crontab.name:
247 | self.log("Restarting %s" % crontab.name, "warning")
248 | else:
249 | self.log("Restarting %s" % name, "warning")
250 | crontab.name = name
251 | self.crontabs[name] = crontab
252 | gevent.spawn(crontab)
253 |
254 | def __getitem__(self, name):
255 | if name in self.crontabs.keys():
256 | return self.crontabs[name]
257 | else:
258 | raise KeyError('Invalid CronTab')
259 |
260 | def __delitem__(self, name):
261 | """Halt crontab, delete"""
262 | if name in self.crontabs.keys():
263 | self.crontabs[name].kill()
264 | del self.crontabs[name]
265 |
266 | def keys(self):
267 | return self.crontabs.keys()
268 |
--------------------------------------------------------------------------------
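The message format receive() expects, from the HTTPD side; a sketch (assumes
a FeedManager is consuming app.inbox in the other process, and that a feed
named "HN" exists on the key):

    from emissary import app
    from emissary.models import APIKey

    key = APIKey.query.first()
    q = app.queues[0]

    # Queue ID 0 means fire-and-forget: no response is emitted.
    app.inbox.put([0, "stop", (key, "HN")])

    # Otherwise the reply lands on the response queue we name by id.
    app.inbox.put([hex(id(q)), "start", (key, "HN")])
    print q.get(timeout=5)  # True once the feed is scheduled

--------------------------------------------------------------------------------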
/emissary/controllers/parser.py:
--------------------------------------------------------------------------------
1 | # This file implements routines for extracting links from response objects.
2 | import re
3 | import lxml.html
4 | import urlparse
5 | import feedparser
6 | # We have sought to disperse power, to set men and women free.
7 | # That really means: to help them to discover that they are free.
8 | # Everybody's free. The slave is free.
9 | # The ultimate weapon isn't this plague out in Vegas, or any new super H-bomb.
10 | # The ultimate weapon has always existed. Every man, every woman, and every child owns it.
11 | # It's the ability to say No and take the consequences.
12 | # 'Fear is failure.' 'The fear of death is the beginning of slavery.'
13 | # "Thou hast no right but to do thy will.'
14 | # The goose can break the bottle at any second.
15 | # Socrates took the hemlock to prove it.
16 | # Jesus went to the cross to prove it.
17 | # It's in all history, all myth, all poetry.
18 | # It's right out in the open all the time."
19 | from goose import Goose
20 |
21 | def extract_links(response):
22 | urls = []
23 | if ('content-type' in response.headers.keys()) and ('xml' in response.headers['content-type']):
24 | f = feedparser.parse(response.text)
25 | for entry in f.entries:
26 | urls.append({entry.link: entry.title})
27 | del f
28 | else: # The following is a highly experimental feature.
29 | url = urlparse.urlparse(response.url)
30 | url = url.scheme + "://" + url.netloc
31 | p = Parser(response.text, url=url)
32 | urls = p.parse()
33 | del url, p
34 | return urls
35 |
36 | class Parser(object):
37 | """
38 | Build a list of relevant links from an HTML string and the root URL.
39 |
40 | p = Parser(html_text, root_url)
41 | urls = p.parse()
42 | """
43 | def __init__(self,html=None,doc=None,url=None):
44 | self.html=html
45 | self.doc=doc
46 | try: self.url = urlparse.urlparse(url).netloc
47 | except: self.url = url
48 | self.links=[]
49 |
50 | def root_to_urls(self, doc, titles):
51 | """
52 | Return a list of urls from an lxml root.
53 | """
54 | if doc is None:
55 | return []
56 |
57 | a_tags = doc.xpath('//a')
58 | # tries to find titles of link elements via tag text
59 | if titles:
60 | return [ (a.get('href'), a.text) for a in a_tags if a.get('href') ]
61 | return [ a.get('href') for a in a_tags if a.get('href') ]
62 |
63 | def get_urls(self,_input=None,titles=False,regex=False):
64 | if (not _input) and (not self.html): return []
65 | if not _input: _input = self.html
66 | if regex:
67 | text = re.sub('<[^<]+?>', ' ', _input)
68 | text = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
69 | text = [i.strip() for i in text]
70 | return text or []
71 | if isinstance(_input, str) or isinstance(_input, unicode):
72 | doc = self.fromstring(_input)
73 | else:
74 | doc = _input
75 | return self.root_to_urls(doc, titles)
76 |
77 | def fromstring(self, html):
78 | try:
79 | self.doc = lxml.html.fromstring(html)
80 | except Exception, e:
81 | return None
82 | return self.doc
83 |
84 | def parse(self,html=None,url=None):
85 | """
86 | Whittle a list of urls into things we're interested in.
87 | """
88 | if self.links: self.links=[]
89 | urls = self.get_urls(html)
90 | if not urls: return urls
91 | else: urls = set(urls)
92 | if url: url = "http://%s/" % urlparse.urlparse(url).netloc
93 | for u in urls:
94 | if url:
95 | if u == url: continue
96 | if self.url:
97 | if u == self.url: continue
98 | if u.startswith('#'): continue
99 | if not u.startswith('http'):
100 | if url:
101 | if (url[-1] == '/') and (u[0] == '/'): u = url + u[1:]
102 | else: u = url+u
103 | elif self.url:
104 | if (self.url[-1] == '/') and (u[0] == '/'): u = self.url + u[1:]
105 | else: u = self.url+u
106 | else: continue
107 | self.links.append(u)
108 | return self.links
109 |
110 | def extract_body(html):
111 | """
112 | Extract the body text of a web page
113 | """
114 | g = Goose({'enable_image_fetching':False})
115 | article = g.extract(raw_html=html)
116 | del g
117 | return article.cleaned_text
118 |
119 | def extract_title(html):
120 | """
121 | Extract the body title of a web page
122 | """
123 | g = Goose({'enable_image_fetching':False})
124 | article = g.extract(raw_html=html)
125 | del g
126 | return article.title
127 |
128 |
129 | def summarise(article):
130 | stopnum = c = 0
131 | for i,v in enumerate(article.split()):
132 | if v.endswith('.'):
133 | if c >= 2:
134 | stopnum = i+1
135 | break
136 | else:
137 | c += 1
138 | return ' '.join(article.split()[:stopnum])
139 |
140 |
--------------------------------------------------------------------------------
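A minimal usage sketch for the extraction helpers above (fetch.py drives
these during a feed run; example.com stands in for a real article):

    import requests
    from emissary.controllers import parser

    response = requests.get("http://example.com/")
    links = parser.extract_links(response)  # urls, or {url: title} for XML feeds
    body = parser.extract_body(response.text)
    print parser.extract_title(response.text)
    print parser.summarise(body)  # roughly the first three sentences

--------------------------------------------------------------------------------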
/emissary/controllers/scripts.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This file provides scripting capabilities
3 | import os
4 | from emissary import app
5 | from emissary.controllers.utils import sha1sum
6 |
7 | class Scripts(object):
8 |
9 | def __init__(self, dir):
10 | self.dir = None
11 | self.scripts = {}
12 |
13 | dir = os.path.abspath(dir)
14 | if not os.path.isdir(dir):
15 | app.log("%s isn't a valid system path." % dir, "error")
16 | return
17 |
18 | self.dir = dir
19 |
20 | def reload(self, *args): # args caught for SIGHUP handler
21 |
22 | if self.dir:
23 | if self.scripts:
24 | app.log("Reloading scripts.")
25 | for file in os.listdir(self.dir):
26 | self.unload(file)
27 | self.load(file)
28 |
29 | def load(self, file):
30 |
31 | file = os.path.abspath(os.path.join(self.dir, file))
32 |
33 | for script in self.scripts.values():
34 | if script.file == file: return
35 |
36 | if os.path.isfile(file):
37 | self.scripts[file] = Script(file)
38 | app.log("Loaded %s" % file)
39 |
40 | def unload(self, file):
41 | file = os.path.abspath(os.path.join(self.dir, file))
42 |
43 | if file in self.scripts:
44 | del self.scripts[file]
45 |
46 | class Script(object):
47 | """
48 | Represents the execution environment for a third-party script.
49 | We send custom values into the environment and work with whatever's left.
50 | Scripts can also call any methods on objects put in their environment.
51 | """
52 | def __init__(self, file=None, env={}):
53 | self.read_on_exec = app.debug
54 | self.file = file
55 | self.env = env
56 | self.script = ''
57 | self.code = None
58 | self.hash = None
59 | self.cache = {
60 | 'app': app
61 | }
62 |
63 | def execute(self, env={}):
64 | if not self.code or self.read_on_exec: self.compile()
65 | if env: self.env = env
66 | self.env['cache'] = self.cache
67 | exec self.code in self.env
68 | del self.env['__builtins__']
69 | if 'cache' in self.env.keys():
70 | self.cache = self.env['cache']
71 | return (self.env)
72 |
73 | def compile(self, script=''):
74 | if self.file:
75 | f = file(self.file, 'r')
76 | self.script = f.read()
77 | f.close()
78 | elif script:
79 | self.script = script
80 | if self.script:
81 | hash = sha1sum(self.script)
82 | if self.hash != hash:
83 | self.hash = hash
84 | self.code = compile(self.script, '<string>', 'exec')
85 | self.script = ''
86 |
87 | def __getitem__(self, key):
88 | if key in self.env.keys():
89 | return (self.env[key])
90 | else:
91 | raise (KeyError(key))
92 |
93 | def keys(self):
94 | return self.env.keys()
95 |
--------------------------------------------------------------------------------
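Scripts run via exec with 'article', 'feed' and 'cache' injected into their
environment (see fetch.py), and whatever they leave behind is read back, so a
pre-store script can rewrite an article before it is committed. A sketch of a
hypothetical scripts/tag_titles.py:

    # tag_titles.py -- prefix stored titles with their feed name.
    # 'article', 'feed' and 'cache' arrive via Script.execute();
    # feed is None for feedless articles.
    if article is not None and feed is not None:
        article.title = u"[%s] %s" % (feed.name, article.title)
        cache['tagged'] = cache.get('tagged', 0) + 1

--------------------------------------------------------------------------------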
/emissary/controllers/tui.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from emissary.controllers.utils import tconv
4 | from window import Window, Pane, ALIGN_LEFT, EXPAND, palette
5 |
6 | class EmissaryMenu(Pane):
7 | """
8 | Defines a menu where items call local methods.
9 | """
10 | geometry = [EXPAND, EXPAND]
11 | # Default and selection colours.
12 | col = [-1, -1] # fg, bg
13 | sel = [-1, "blue"]
14 | items = []
15 |
16 | def update(self):
17 | for i, item in enumerate(self.items):
18 | if item[0]:
19 | colours = palette(self.sel[0], self.sel[1])
20 | else:
21 | colours = palette(self.col[0], self.col[1])
22 | text = ' ' + item[1]
23 | spaces = ' ' * (self.width - len(text))
24 | text += spaces
25 | self.change_content(i, text + '\n', ALIGN_LEFT, colours)
26 |
27 | def process_input(self, character):
28 | # Handle the return key and the right arrow key
29 | if character == 10 or character == 13 or character == 261:
30 | for i, item in enumerate(self.items):
31 | if item[0]:
32 | func = getattr(self, item[2].lower(), None)
33 | if func:
34 | func()
35 |
36 | # Handle navigating the menu
37 | elif character in [259, 258, 339, 338]:
38 | for i, item in enumerate(self.items):
39 | if item[0]:
40 | if character == 259: # up arrow
41 | if i == 0: break
42 | item[0] = 0
43 | self.items[i-1][0] = 1
44 | break
45 | if character == 258: # down arrow
46 | if i+1 >= len(self.items): break
47 | item[0] = 0
48 | self.items[i+1][0] = 1
49 | break
50 | if character == 339: # page up
51 | item[0] = 0
52 | self.items[0][0] = 1
53 | break
54 | if character == 338: # page down
55 | item[0] = 0
56 | self.items[-1][0] = 1
57 | break
58 |
59 | class FeedGroups(EmissaryMenu):
60 | geometry = [EXPAND, EXPAND]
61 | def update(self):
62 | if not self.items:
63 | (res, status) = self.window.c.get("feeds")
64 |
65 |
66 | class Feeds(EmissaryMenu):
67 | geometry = [EXPAND, EXPAND]
68 | items = []
69 |
70 |
71 | class Articles(Pane):
72 | """
73 | items for Articles are [1, "text", "uid"]
74 | """
75 | geometry = [EXPAND, EXPAND]
76 | items = []
77 | col = [-1, -1] # fg, bg
78 | sel = ["black", "white"]
79 | avail = ["black", "green"]
80 |
81 | def update(self):
82 | if not self.items:
83 | self.fetch_items()
84 |
85 | for i, item in enumerate(self.items):
86 | if item[0]:
87 | if item[3]:
88 | colours = palette(self.avail[0], self.avail[1])
89 | else:
90 | colours = palette(self.sel[0], self.sel[1])
91 | else:
92 | colours = palette(self.col[0], self.col[1])
93 | text = ' ' + item[1]
94 | spaces = ' ' * (self.width - len(text))
95 | text += spaces
96 | self.change_content(i, text + '\n', ALIGN_LEFT, colours)
97 |
98 | def process_input(self, character):
99 | # Handle the return key and the right arrow key
100 | if character in [10, 13, 261]:
101 | for i, item in enumerate(self.items):
102 | if item[0]:
103 | uid = item[2]
104 | (article, status) = self.window.c.get('articles/' + uid)
105 | statuspane = self.window.get("status")
106 |
107 | if status != 200:
108 | statuspane.status = str(status)
109 | else:
110 | self.reader.article = article
111 | if article['content'] is None:
112 | self.reader.data = ""
113 | else:
114 | self.reader.data = article['content'].encode("ascii", "ignore")
115 | self.reader.active = True
116 | self.active = False
117 |
118 | elif character == 114: # r to refresh
119 | self.fetch_items()
120 |
121 | elif character == 9: # tab to reader
122 | reader = self.window.get("reader")
123 | reader.active = True
124 | self.active = False
125 |
126 | # Handle navigating the menu
127 | elif character in [259, 258, 339, 338]:
128 | for i, item in enumerate(self.items):
129 | if item[0]:
130 | if character == 259: # up arrow
131 | if i == 0: break
132 | item[0] = 0
133 | self.items[i-1][0] = 1
134 | break
135 | if character == 258: # down arrow
136 | if i+1 >= len(self.items): break
137 | item[0] = 0
138 | self.items[i+1][0] = 1
139 | break
140 | if character == 339: # page up
141 | item[0] = 0
142 | self.items[0][0] = 1
143 | break
144 | if character == 338: # page down
145 | item[0] = 0
146 | self.items[-1][0] = 1
147 | break
148 |
149 | def fetch_items(self):
150 | (res, status) = self.window.c.get("articles?per_page=%i" % self.height)
151 | if status == 200:
152 | self.fill_menu(res)
153 | else:
154 | status = self.window.get("status")
155 | status.status = str(res)
156 |
157 | def fill_menu(self, res):
158 | self.items = []
159 | self.content = []
160 | for r in res["data"]:
161 | self.items.append([0, r['title'].encode("ascii", "ignore"), r['uid'], r['content_available']])
162 | if self.items:
163 | self.items[0][0] = 1
164 |
165 | class Reader(Pane):
166 | """
167 | Defines a scrolling pager for long multi-line strings.
168 | """
169 | geometry = [EXPAND, EXPAND]
170 | data = ""
171 | outbuffer = ""
172 | position = 0
173 | article = None
174 |
175 | def update(self):
176 | if self.article:
177 | feed = self.article.get('feed', None)
178 | heading = "%s\n%s (%s %s ago)\n%s\n\n" % \
179 | (self.article['title'].encode("ascii","ignore"), feed if feed else "",
180 | self.article['uid'], tconv(int(time.time()) - int(self.article['created'])),
181 | self.article['url'])
182 | self.change_content(0, heading)
183 | self.outbuffer = self.data.split('\n')[self.position:]
184 | self.change_content(1, '\n'.join(self.outbuffer))
185 |
186 | def process_input(self, character):
187 | self.window.window.clear()
188 | if character == 259: # Up arrow
189 | if self.position != 0:
190 | self.position -= 1
191 | elif character == 258: # Down arrow
192 | self.position += 1
193 | elif character == 339: # Page up
194 | if self.position - self.height < 0:
195 | self.position = 0
196 | else:
197 | self.position -= self.height
198 | elif character == 338: # Page down
199 | if not self.position + self.height > len(self.data.split('\n')):
200 | self.position += self.height
201 |
202 | elif character in [260, 9]: # Left arrow or tab
203 | articles = self.window.get("articles")
204 | articles.active = True
205 | self.active = False
206 |
207 | elif character in [70, 102]: # f/F to fullscreen the pager
208 | articles = self.window.get("articles")
209 | if articles.hidden:
210 | articles.hidden = False
211 | else:
212 | articles.hidden = True
213 |
214 | class StatusLine(Pane):
215 | geometry = [EXPAND, 1]
216 | content = []
217 | buffer = ""
218 | status = ""
219 | searching = False
220 |     tagline = "Thank God."
221 |
222 | def update(self):
223 | if self.searching:
224 | self.change_content(0, "/"+self.buffer, palette("black", "white"))
225 | else:
226 | state = self.tagline
227 |             state += ' ' * ((self.width / 2) - len(self.tagline) - (len(str(self.status)) / 2))
228 | state += str(self.status)
229 | self.change_content(0, state)
230 |
231 | def process_input(self, character):
232 | self.window.window.clear()
233 | if not self.searching and character in [80, 112]: # p/P to enter a python REPL
234 | try: # You might need to
235 | import pprint # "sudo pip install ptpython"
236 | from ptpython.repl import embed # to enable this feature.
237 |
238 | def configure(repl):
239 | repl.prompt_style = "ipython"
240 | repl.vi_mode = True
241 | repl.confirm_exit = False
242 | repl.show_status_bar = False
243 | repl.show_line_numbers = True
244 | repl.show_sidebar_help = False
245 | repl.highlight_matching_parenthesis = True
246 | repl.use_code_colorscheme("native")
247 |
248 | def a(uid):
249 | """
250 | Return raw article text given an article uid.
251 | """
252 | response = self.window.c.get("articles/%s" % uid)
253 | if response[1] == 200:
254 | return response[0]['content']
255 | return ""
256 |
257 | p = pprint.PrettyPrinter()
258 | p = p.pprint
259 | l = {"a": a, "c": self.window.c, "p": p, "window": self.window}
260 | reader = self.window.get("reader")
261 | article = getattr(reader, "article", None)
262 | if article:
263 | l['article'] = article
264 |
265 | self.window.stop()
266 | print("\nStarting REPL. ^D to exit.")
267 | embed(locals=l, configure=configure)
268 | self.window.start()
269 | except ImportError:
270 | pass
271 |
272 | if not self.searching and character == 47: # / to search
273 | articles = self.window.get("articles")
274 | articles.active = False
275 | self.searching = True
276 | return
277 |
278 | if self.searching:
279 | self.window.window.clear()
280 | if character == 23 and self.buffer: # Clear buffer on ^W
281 | self.buffer = ''
282 | elif character == 263: # Handle backspace
283 | if self.buffer:
284 | self.buffer = self.buffer[:-1]
285 | if not self.buffer:
286 | self.searching = False
287 | articles = self.window.get("articles")
288 | articles.active = True
289 |
290 |             elif character in [10, 13]: # Handle the return key
291 | # Pass control back to the articles view
292 | self.searching = False
293 | articles = self.window.get("articles")
294 | articles.active = True
295 | reader = self.window.get("reader")
296 | reader.active = False
297 | self.buffer = ""
298 | else:
299 |                 try: self.buffer += chr(character) # Append input to buffer
300 |                 except ValueError: pass # chr() rejects key codes outside 0-255.
301 | # Perform a search for what's in the current buffer.
302 | articles = self.window.get("articles")
303 | url = "articles/search/"+self.buffer+"?per_page=" + str(articles.height)
304 | (res, status) = self.window.c.get(url)
305 | if status == 200:
306 | articles.fill_menu(res)
307 |
308 |
309 | window = Window(blocking=True)
310 |
311 | feedgroups = FeedGroups("feedgroups")
312 | feedgroups.active = False
313 | feedgroups.hidden = True
314 | feeds = Feeds("feeds")
315 | feeds.active = False
316 | feeds.hidden = True
317 | articles = Articles("articles")
318 | reader = Reader("reader")
319 | reader.wrap = True
320 | reader.active = False
321 | articles.reader = reader
322 | status = StatusLine("status")
323 |
324 | panes = [feedgroups, feeds, articles, reader]
325 | window.add(panes)
326 | window.add(status)
327 |
328 | window.exit_keys.append(4) # ^D to exit
329 |
--------------------------------------------------------------------------------
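Note on the navigation code above: the arrow/page-key handling is duplicated verbatim between EmissaryMenu.process_input and Articles.process_input. A minimal sketch of how the shared logic could be factored out (hypothetical helper, not part of the module; key codes as used above):

    KEY_UP, KEY_DOWN, KEY_PGUP, KEY_PGDN = 259, 258, 339, 338

    def move_selection(items, character):
        """Move the single active flag within a list of [active, ...] items."""
        for i, item in enumerate(items):
            if not item[0]:
                continue
            if character == KEY_UP and i > 0:
                item[0], items[i - 1][0] = 0, 1
            elif character == KEY_DOWN and i + 1 < len(items):
                item[0], items[i + 1][0] = 0, 1
            elif character == KEY_PGUP:
                item[0], items[0][0] = 0, 1
            elif character == KEY_PGDN:
                item[0], items[-1][0] = 0, 1
            break

Both process_input methods could then delegate their character in [259, 258, 339, 338] branch to move_selection(self.items, character).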
/emissary/controllers/utils.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | # Utility helpers: gzipped responses (after a snippet on pocoo.org), CORS headers,
3 | # unique ID generation, time formatting, quoted-string parsing and pagination.
4 | import gzip
5 | import uuid
6 | import urllib
7 | import hashlib
8 | import urlparse
9 | import functools
10 | from emissary import app, db
11 | from sqlalchemy import or_, and_
12 | from cStringIO import StringIO as IO
13 | from flask import after_this_request, request
14 | from emissary.controllers.cron import parse_timings
15 |
16 | def sha1sum(text):
17 | return(hashlib.sha1(text).hexdigest())
18 |
19 | def cors(f):
20 |     if 'ENABLE_CORS' not in app.config or not app.config['ENABLE_CORS']:
21 | return f
22 |
23 | @functools.wraps(f)
24 | def view_func(*args, **kwargs):
25 | @after_this_request
26 | def enable_cors(response):
27 | response.headers['Access-Control-Allow-Headers'] = "Cache-Control, Pragma, Origin, Authorization, Content-Type, X-Requested-With, Accept"
28 | response.headers['Access-Control-Allow-Methods'] = "OPTIONS, GET, POST, PUT, DELETE"
29 | response.headers['Access-Control-Allow-Origin'] = "*"
30 |
31 | return response
32 |
33 | return f(*args, **kwargs)
34 |
35 | return view_func
36 |
37 | def gzipped(f):
38 |     if 'GZIP_HERE' not in app.config or not app.config['GZIP_HERE']:
39 | return f
40 |
41 | @functools.wraps(f)
42 | def view_func(*args, **kwargs):
43 |
44 | @after_this_request
45 | def zipper(response):
46 | accept_encoding = request.headers.get('Accept-Encoding', '')
47 |
48 | if 'gzip' not in accept_encoding.lower():
49 | return response
50 |
51 | response.direct_passthrough = False
52 |
53 | if (response.status_code < 200 or
54 | response.status_code >= 300 or
55 | 'Content-Encoding' in response.headers):
56 | return response
57 | gzip_buffer = IO()
58 | gzip_file = gzip.GzipFile(mode='wb',
59 | fileobj=gzip_buffer)
60 | gzip_file.write(response.data)
61 | gzip_file.close()
62 |
63 | response.data = gzip_buffer.getvalue()
64 | response.headers['Content-Encoding'] = 'gzip'
65 | response.headers['Vary'] = 'Accept-Encoding'
66 |             response.headers['Content-Length'] = len(response.data)
67 |
68 | return response
69 |
70 | return f(*args, **kwargs)
71 |
72 | return view_func
73 |
74 | def uid(): return str(uuid.uuid4())
75 |
76 | def tconv(seconds):
77 | minutes, seconds = divmod(seconds, 60)
78 | hours, minutes = divmod(minutes, 60)
79 | days, hours = divmod(hours, 24)
80 | weeks, days = divmod(days, 7)
81 | s=""
82 | if weeks:
83 | if weeks == 1:
84 | s+= "1 week, "
85 | else:
86 | s+= "%i weeks, " % (weeks)
87 | if days:
88 | if days == 1:
89 | s+= "1 day, "
90 | else:
91 | s+= "%i days, " % (days)
92 | if hours:
93 | if hours == 1:
94 | s+= "1 hour, "
95 | else:
96 | s+= "%i hours, " % (hours)
97 | if minutes:
98 | if minutes == 1:
99 | s+= "1 minute"
100 | else:
101 | s+= "%i minutes" % (minutes)
102 |     if seconds:
103 |         if len(s) > 0:
104 |             if seconds == 1:
105 |                 s = s.rstrip(", ") + " and 1 second"
106 |             else:
107 |                 s = s.rstrip(", ") + " and %i seconds" % (seconds)
108 | else:
109 | if seconds == 1:
110 | s+= "1 second"
111 | else:
112 | s+= "%i seconds" % (seconds)
113 | return s
114 |
115 | def spaceparse(string):
116 | """
117 |     Return quoted substrings as a list, or a dict when they are key="value" assignments.
118 | """
119 | results = []
120 | quotes = string.count('"')
121 | quoted = quotes / 2
122 | keyvalue = False
123 |
124 | # Return an empty resultset if there are an uneven number of quotation marks
125 | if quotes % 2 != 0:
126 | return results
127 |
128 | # for every quoted phrase determine if it's an assignment and include the variable name
129 | # disregard it from the string we're working with and continue onto the next quoted part
130 | for phrase in range(0,quoted+1):
131 |         if not string or '"' not in string: break
132 | start = string.find('"')
133 | end = string.find('"', start+1)
134 |
135 | if start > 0 and string[start-1] == '=':
136 | keyvalue = True
137 | for i in range(start,-1,-1):
138 | if string[i] == ' ' or i == 0:
139 | results.append(string[i:end])
140 | break
141 | else:
142 | results.append(string[start+1:end])
143 | string = string[end+1:]
144 | if keyvalue:
145 | res = {}
146 | for item in results:
147 |             k, v = item.split('=', 1)
148 | if k.startswith(' '):
149 | k=k[1:]
150 | if v.startswith('"'):
151 | v=v[1:]
152 | res[k]=v
153 | return res
154 | return results
155 |
156 | def update_url(url, params):
157 |     url_parts = list(urlparse.urlparse(url))
158 | query = dict(urlparse.parse_qsl(url_parts[4]))
159 | query.update(params)
160 | url_parts[4] = urllib.urlencode(query)
161 | return urlparse.urlunparse(url_parts)
162 |
163 | def make_response(url, query, jsonify=True):
164 | """
165 | Take a paginated SQLAlchemy query and return
166 | a response that's more easily reasoned about
167 | by other programs.
168 | """
169 | response = {}
170 | if jsonify:
171 | response['data'] = [i.jsonify() for i in query.items]
172 |
173 | response['links'] = {}
174 | response['links']['self'] = url
175 | if query.has_next:
176 | response['links']['next'] = update_url(url, {"page": str(query.next_num)})
177 | return response
178 |
--------------------------------------------------------------------------------
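For reference, the helpers above behave roughly as follows (output worth confirming locally; spaceparse switches to a dict as soon as any quoted phrase is a key="value" assignment):

    from emissary.controllers.utils import tconv, spaceparse

    print tconv(90061)                           # 1 day, 1 hour, 1 minute and 1 second
    print spaceparse('say "hello world" please') # ['hello world']
    print spaceparse('name="HN" url="http://x"') # {'name': 'HN', 'url': 'http://x'}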
/emissary/models.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | """
3 | MIT License.
4 | Luke Brooks 2015
5 | Database layout for Emissary.
6 | """
7 | import time
8 | import snappy
9 | from hashlib import sha256
10 | from emissary import db, app
11 | from multiprocessing import Queue
12 | from emissary.controllers.utils import uid
13 |
14 | class APIKey(db.Model):
15 | """
16 | An Emissary API Key.
17 | Reader keys cannot PUT, POST or DELETE.
18 | """
19 | __tablename__ = 'api_keys'
20 | id = db.Column(db.Integer, primary_key=True)
21 | parent_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id"))
22 | name = db.Column(db.String(80))
23 | key = db.Column(db.String(120))
24 | active = db.Column(db.Boolean())
25 | reader = db.Column(db.Boolean(), default=False)
26 | created = db.Column(db.DateTime(timezone=True), default=db.func.now())
27 | parent = db.relationship("APIKey", backref="readers", remote_side=[id])
28 | feedgroups = db.relationship("FeedGroup", backref="key")
29 | feeds = db.relationship("Feed", backref="key")
30 | articles = db.relationship("Article", backref="key")
31 | events = db.relationship("Event", backref="key")
32 |
33 | def generate_key_str(self):
34 | """
35 | Returns a SHA256 of the time as an API Key.
36 | """
37 | return sha256(time.asctime() + str(time.time())).hexdigest()
38 |
39 |     def __repr__(self):
40 |         if not self.name:
41 |             return "<APIKey>"
42 |         return '<APIKey %s>' % self.name
43 |
44 | def jsonify(self, feedgroups=False, with_key_str=False):
45 | response = {}
46 | response['name'] = self.name
47 | if with_key_str:
48 | response['apikey'] = self.key
49 | if feedgroups:
50 | response['feedgroups'] = [group.jsonify() for group in self.feedgroups]
51 | response['active'] = self.active
52 | response['reader'] = self.reader
53 | if self.reader:
54 | response['parent'] = self.parent.name
55 | return response
56 |
57 | class FeedGroup(db.Model):
58 | __tablename__ = "feed_groups"
59 | id = db.Column(db.Integer(), primary_key=True)
60 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id"))
61 |     uid = db.Column(db.String(36), default=uid) # Callable, so each row gets a fresh UUID.
62 | name = db.Column(db.String(80))
63 | feeds = db.relationship('Feed', backref="group")
64 | created = db.Column(db.DateTime(timezone=True), default=db.func.now())
65 | active = db.Column(db.Boolean(), default=True)
66 |
67 |     def __repr__(self):
68 |         if self.name:
69 |             return '<FeedGroup %s (%i feeds)>' % (self.name, len(self.feeds))
70 |         return "<FeedGroup>"
71 |
72 | def jsonify(self):
73 | response = {}
74 | if self.created:
75 | response['name'] = self.name
76 | response['uid'] = self.uid
77 | response['created'] = time.mktime(self.created.timetuple())
78 | response['active'] = self.active
79 | response['feeds'] = [feed.jsonify() for feed in self.feeds]
80 | return response
81 |
82 | class Feed(db.Model):
83 | __tablename__ = "feeds"
84 | id = db.Column(db.Integer(), primary_key=True)
85 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id"))
86 | group_id = db.Column(db.Integer(), db.ForeignKey("feed_groups.id"))
87 |     uid = db.Column(db.String(36), default=uid) # Callable, so each row gets a fresh UUID.
88 | name = db.Column(db.String(100))
89 | url = db.Column(db.String(150))
90 | schedule = db.Column(db.String(80))
91 | active = db.Column(db.Boolean(), default=True)
92 | created = db.Column(db.DateTime(timezone=True), default=db.func.now())
93 | articles = db.relationship('Article', backref="feed")
94 |
95 |     def __repr__(self):
96 |         if self.name:
97 |             return '<Feed %s (%i articles)>' % (self.name, len(self.articles))
98 |         return "<Feed>"
99 |
100 | def is_running(self):
101 | """
102 | Ask the feedmanager what's happening.
103 | """
104 | if not app.inbox:
105 | return None
106 |
107 | response_queue = app.queues[-1]
108 | qid = hex(id(response_queue))
109 | app.inbox.put([qid, "check", self])
110 |
111 | # Wait somewhere around 500ms max for a response
112 | then = time.time()
113 |         while response_queue.empty():
114 |             now = time.time()
115 |             if (now - then) >= 0.5:
116 |                 return None
117 |             time.sleep(0.01) # Poll without spinning a core.
118 | return response_queue.get()
119 |
120 | def jsonify(self, articles=False):
121 | response = {}
122 | if self.created:
123 | response['name'] = self.name
124 | response['uid'] = self.uid
125 | response['url'] = self.url
126 | response['created'] = time.mktime(self.created.timetuple())
127 | response['schedule'] = self.schedule
128 | response['active'] = self.active
129 | response['article_count'] = len(self.articles)
130 | response['running'] = self.is_running()
131 | if self.group:
132 | response['group'] = self.group.name
133 | else:
134 | response['group'] = None
135 | return response
136 |
137 |
138 | class Article(db.Model):
139 | __tablename__ = "articles"
140 | id = db.Column(db.Integer(), primary_key=True)
141 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id"))
142 | uid = db.Column(db.String(36))
143 | feed_id = db.Column(db.Integer(), db.ForeignKey("feeds.id"))
144 | title = db.Column(db.String(80))
145 | url = db.Column(db.String(400))
146 | content = db.Column(db.String(2000))
147 | ccontent = db.Column(db.LargeBinary())
148 | summary = db.Column(db.String(800))
149 | created = db.Column(db.DateTime(timezone=True), default=db.func.now())
150 | compressed = db.Column(db.Boolean(), default=False)
151 |
152 | def text(self):
153 | if self.content:
154 | return self.content.decode("utf-8", "ignore")
155 | if self.ccontent:
156 | return snappy.decompress(self.ccontent).decode("utf-8", "ignore")
157 | return ""
158 |
159 |     def __repr__(self):
160 |         if self.content or self.ccontent:
161 |             return '<Article %s>' % self.title.encode("utf-8", "ignore")
162 |         if self.url and self.title:
163 |             return '<Article (no content) %s>' % self.title.encode("utf-8", "ignore")
164 |         return "<Article>"
165 |
166 | def jsonify(self, summary=False, content=False):
167 | response = {}
168 | if self.title:
169 | response['title'] = self.title.encode("utf-8", "ignore")
170 | response['url'] = self.url.encode("utf-8", "ignore")
171 | response['uid'] = self.uid
172 | response['created'] = time.mktime(self.created.timetuple())
173 | if self.feed:
174 | response['feed'] = self.feed.name
175 | if content:
176 | response['compressed'] = self.compressed
177 | if self.ccontent:
178 | response['content'] = snappy.decompress(self.ccontent)
179 | else:
180 | response['content'] = self.content
181 | if not content:
182 | if self.content or self.ccontent:
183 | response['content_available'] = True
184 | else:
185 | response['content_available'] = False
186 | if summary and self.summary:
187 | response['summary'] = self.summary
188 | return response
189 |
190 | class Event(db.Model):
191 | __tablename__ = "events"
192 | id = db.Column(db.Integer(), primary_key=True)
193 | key_id = db.Column(db.Integer(), db.ForeignKey("api_keys.id"))
194 | created = db.Column(db.DateTime(timezone=True), default=db.func.now())
195 | feed_id = db.Column(db.Integer(), db.ForeignKey("feeds.id"))
196 | success = db.Column(db.Boolean())
197 | message = db.Column(db.String(200))
198 |
199 |     def __repr__(self):
200 |         return "<Event>"
201 |
202 |     def jsonify(self):
203 |         return {} # Stub: events aren't serialized yet.
204 |
--------------------------------------------------------------------------------
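Article bodies are stored either plain in Article.content or snappy-compressed in Article.ccontent. A minimal sketch of the round-trip Article.text() relies on, assuming python-snappy is installed:

    import snappy

    body = u"Full article text.".encode("utf-8")
    ccontent = snappy.compress(body)           # what would live in Article.ccontent
    assert snappy.decompress(ccontent) == body # what Article.text() performs
    print snappy.decompress(ccontent).decode("utf-8", "ignore")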
/emissary/repl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import cmd
4 | import json
5 | import time
6 | import errno
7 | import _curses
8 | import optparse
9 | import textwrap
10 | from emissary import app
11 | from emissary.client import Client
12 | from emissary.models import APIKey
13 | from subprocess import Popen, PIPE
14 | from emissary.controllers.utils import tconv, spaceparse
15 | from emissary.controllers.tui import window
16 |
17 | try:
18 | from pygments import highlight
19 | from pygments.lexers import JsonLexer
20 | from pygments.styles import get_style_by_name, STYLE_MAP
21 | from pygments.formatters.terminal256 import Terminal256Formatter
22 | except ImportError: highlight = False
23 |
24 | class repl(cmd.Cmd):
25 |
26 | prompt = "> "
27 | intro = "Emissary %s\nPsybernetics %i\n" % (app.version, time.gmtime()[0])
28 | ruler = '-'
29 | width = 80
30 |
31 |
32 | def parse_args(self, args):
33 | body = {}
34 | parsed = spaceparse(args)
35 | args = args.split()
36 | for i in args:
37 | try:
38 | x=i.split('=')
39 | if type(parsed) == dict and not x[0] in parsed:
40 | parsed[x[0]] = x[1]
41 | else:
42 | body[x[0]] = x[1]
43 | except: continue
44 | if type(parsed) == dict: body = parsed
45 | return body
46 |
47 |     def formatted_prompt(self):
48 |         """
49 |         Format the first return value of /v1/articles/count
50 |         with thousands separators, e.g. (1,234) >
51 |         """
52 |         try:
53 |             return "({:,}) > ".format(
54 |                 self.c.get("articles/count")[0]
55 |             )
56 |         except Exception:
57 |             return "no connection> "
58 |
59 | def do_setkey(self,key):
60 | "Sets the API key to transmit requests with."
61 | if key:
62 | self.c.key = key
63 | print 'Changed active API key to "%s"' % key
64 | else:
65 | print "Usage: setkey "
66 |
67 | def do_use(self,key):
68 | "Alias of setkey."
69 | self.do_setkey(key)
70 |
71 | def do_getkey(self,line):
72 | "Displays the current API key."
73 | print self.c.key
74 |
75 | def do_get(self,line):
76 | """
77 | Sends GET requests
78 | EG: get articles
79 | get feeds
80 | get feedgroups
81 | """
82 | response = self.c._send_request(line)
83 | self.display(response)
84 |
85 | def do_put(self,line):
86 | """
87 | Creates a new feed or feed group.
88 | EG: put feedgroups name=HN
89 | """
90 |         if ' ' not in line:
91 | print "Need data to transmit."
92 | else:
93 | line, body = line.split(' ',1)
94 | body = self.parse_args(body)
95 | response = self.c._send_request(line, 'PUT', body)
96 | self.display(response)
97 |
98 |
99 | def do_post(self,line):
100 | """
101 | Modifies an existing feed or feed group.
102 | EG: post feeds/SomeFeed schedule="20 3 2! * *"
103 | """
104 |
105 |         if ' ' not in line:
106 | print "Need data to transmit."
107 | else:
108 | line, body = line.split(' ',1)
109 | body = self.parse_args(body)
110 | response = self.c._send_request(line, 'POST', body)
111 | self.display(response)
112 |
113 | def do_exit(self,line):
114 | try:
115 | _curses.endwin()
116 | except _curses.error:
117 | pass
118 | finally:
119 | raise SystemExit
120 |
121 | def do_read(self,line):
122 | """
123 |         Usage: read <uid>
124 | Pipes article content into the system pager.
125 |
126 | Text column width can be configured with the width command.
127 | """
128 | then = time.time()
129 | response = self.c._send_request("articles/" + line)
130 | if response[1] != 200:
131 | print response[1]
132 | return
133 |
134 | data = response[0]
135 |
136 |         if 'content' not in data:
137 | print None
138 | else:
139 |
140 | p = Popen(['less', '-P', data['title']], stdin=PIPE)
141 |
142 | try:
143 | duration = tconv(int(then) - int(data['created']))
144 | p.stdin.write('%s\n(%i paragraphs, fetched %s ago)\n%s\n\n' % \
145 | (data['title'].encode("utf-8", "ignore"),
146 | len(data['content'].encode("utf-8","ignore").split("\n"))/2+1,
147 | duration,
148 | data['url'].encode("utf-8","ignore")))
149 |
150 | content = data['content'].encode("utf-8", "ignore")
151 | # Get TTY width and wrap the text
152 | if self.width == "auto":
153 | s = _curses.initscr()
154 | width = s.getmaxyx()[1]
155 | _curses.endwin()
156 |
157 | else:
158 | width = self.width
159 |
160 | content = '\n'.join(
161 | textwrap.wrap(content, width, break_long_words=False, replace_whitespace=False)
162 | )
163 | p.stdin.write(content)
164 |
165 | except IOError as e:
166 | if e.errno == errno.EPIPE or e.errno == errno.EINVAL:
167 | sys.stderr.write("Error writing to pipe.\n")
168 | else:
169 | raise
170 |
171 | p.stdin.close()
172 | p.wait()
173 | now = time.time()
174 | duration = tconv(now-then)
175 | # print "\n%s" % duration
176 |
177 | def do_delete(self,line):
178 | """
179 | Sends a DELETE request.
180 | EG: delete feeds/somefeed
181 | """
182 | if ' ' in line:
183 | line, body = line.split(' ',1)
184 | body = self.parse_args(body)
185 | else: body = ''
186 | response = self.c._send_request(line, 'DELETE', body)
187 | self.display(response)
188 |
189 | def do_EOF(self,line):
190 | print "^D",
191 | return True
192 |
193 | def postcmd(self, stop, line):
194 | self.prompt = self.formatted_prompt()
195 | return stop
196 |
197 | def emptyline(self):
198 | pass
199 |
200 | def postloop(self):
201 | print
202 |
203 | def do_width(self, line):
204 | """
205 | Set the text width for the read command.
206 | Acceptable values are an integer amount of characters or "auto".
207 | """
208 | if line == "auto":
209 | self.width = "auto"
210 | elif line == "":
211 | print "The current width is set to %s" % str(self.width)
212 | else:
213 |             try:
214 |                 self.width = int(line)
215 |             except ValueError:
216 |                 print "width must be an integer."
217 |
218 | def do_search(self, line):
219 | self.do_get("articles/search/" + line)
220 |
221 | def do_style(self, style):
222 | """
223 |         Usage: style <name>
224 |         Lists the available themes if no name
225 |         is supplied, or sets the theme to use.
226 | """
227 | if not self.highlight:
228 | print "For syntax highlighting you will need to install the Pygments package."
229 | print "sudo pip install pygments"
230 | return
231 | if style:
232 | self.style = style
233 | print 'Changed style to "%s"' % style
234 | else:
235 | print ', '.join(self.AVAILABLE_STYLES)
236 | print 'Currently using "%s"' % self.style
237 |
238 | def display(self, response):
239 | if self.highlight:
240 | print response[1]
241 | print highlight(json.dumps(response[0],indent=4), JsonLexer(), Terminal256Formatter(style=self.style))
242 | else: self.c.p(response)
243 |
244 | def reqwrap(func):
245 |     def wrapper(*args, **kwargs):
246 |         try: return func(*args, **kwargs)
247 |         except Exception: return ({'error': 'Connection refused.'}, 0)
248 |     return wrapper
249 |
250 |
251 | if __name__ == "__main__":
252 | parser = optparse.OptionParser(prog="python -m emissary.repl")
253 | parser.add_option("--host", dest="host", action="store", default='localhost:6362/v1/')
254 | parser.add_option("--ncurses", dest="ncurses", action="store_true", default=False)
255 | (options,args) = parser.parse_args()
256 |
257 | if options.ncurses:
258 | r = window
259 | else:
260 | r = repl()
261 |
262 | r.c = Client('','https://%s' % options.host, verify=False)
263 |
264 | r.c.key = ""
265 |
266 | try:
267 | k = APIKey.query.first()
268 | except Exception, e:
269 | print "Encountered an error: " + e.message
270 | print "This either means there's no URI exported as EMISSARY_DATABASE or you've exported a URI"
271 | print "but haven't given Emissary a first run in order to write the schema and a primary API key."
272 | raise SystemExit
273 |
274 | if k: r.c.key = k.key
275 | r.c.verify_https = False
276 |
277 | if not options.ncurses:
278 | r.highlight = highlight
279 | r.prompt = r.formatted_prompt()
280 | if highlight:
281 | r.AVAILABLE_STYLES = set(STYLE_MAP.keys())
282 | if 'tango' in r.AVAILABLE_STYLES: r.style = 'tango'
283 |         else:
284 |             # Fall back to an arbitrary available style.
285 |             r.style = next(iter(r.AVAILABLE_STYLES))
286 | r.c._send_request = reqwrap(r.c._send_request)
287 |
288 | try:
289 | if options.ncurses:
290 | window.start()
291 | else:
292 | r.cmdloop()
293 | except KeyboardInterrupt:
294 | print "^C"
295 | raise SystemExit
296 |
--------------------------------------------------------------------------------
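The Client the REPL wraps can also be driven non-interactively. A sketch mirroring the calls above (the key string is a placeholder; responses are (body, status) tuples):

    from emissary.client import Client

    c = Client("", "https://localhost:6362/v1/", verify=False)
    c.key = "your-api-key-here"  # placeholder; use a key from your install
    response, status = c.get("articles/count")
    if status == 200:
        print "{:,} articles stored.".format(response)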
/emissary/resources/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LukeB42/Emissary/31629a8baedc91a9b60c551a01b2b45372b9a8c7/emissary/resources/__init__.py
--------------------------------------------------------------------------------
/emissary/resources/api_key.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | # This module determines the behavior of API Keys within the system.
3 | # You may also want to check the definition of API keys in models.py.
4 | import re
5 | from flask import request
6 | from sqlalchemy import and_
7 | from emissary import app, db
8 | from emissary.models import *
9 | from flask.ext import restful
10 | from flask.ext.restful import reqparse, abort
11 | from emissary.controllers.utils import cors, gzipped
12 |
13 | def auth(forbid_reader_keys=False):
14 | """
15 | Here we determine that inactive keys are invalid
16 | and that reader keys are their parent unless forbidden.
17 | """
18 | if 'Authorization' in request.headers:
19 | key_str = request.headers['Authorization'].replace('Basic ', '')
20 | key = APIKey.query.filter(APIKey.key == key_str).first()
21 | if key and key.active:
22 | if key.reader:
23 | if not forbid_reader_keys:
24 | return key.parent
25 | abort(401, message="Forbidden to reader keys.")
26 | return key
27 | abort(401, message="Invalid API Key.")
28 |
29 | class KeyCollection(restful.Resource):
30 |
31 | @cors
32 | @gzipped
33 | def get(self):
34 | key = auth()
35 | response = key.jsonify(feedgroups=False)
36 |
37 |         if key.name == app.config['MASTER_KEY_NAME'] or getattr(key, "systemwide", False): # APIKey has no systemwide column yet.
38 | response['system'] = {}
39 |
40 | if key.name == app.config['MASTER_KEY_NAME']:
41 | keys = []
42 | for i in APIKey.query.all(): keys.append(i.name)
43 | response['system']['keys'] = keys
44 | response['system']['permit_new'] = app.config['PERMIT_NEW']
45 |
46 | return [response]
47 |
48 | @cors
49 | @gzipped
50 | def put(self):
51 | """
52 | This method creates keys under the specified name,
53 | presuming config['PERMIT_NEW'] is enabled or the master key is in use.
54 |
55 | Reader keys (keys that can only perform GET requests) are created by setting
56 | the "reader" parameter to a value in the body of the request.
57 | They are automatically associated with the requesting key.
58 | """
59 | key = None
60 | parser = reqparse.RequestParser()
61 | parser.add_argument("name",type=str, help="Name associated with the key", required=True)
62 | parser.add_argument("reader",type=bool, help="Creates a reader key", default=False)
63 | args = parser.parse_args()
64 |
65 | if 'Authorization' in request.headers:
66 | key_str = request.headers['Authorization'].replace('Basic ', '')
67 | key = APIKey.query.filter(APIKey.key == key_str).first()
68 |             if key and key.reader:
69 | abort(401, message="Reader keys cannot create API keys.")
70 |
71 | # Create a reader key if this request has been made with an existing key
72 | if key and args.name and args.reader:
73 | new_key = APIKey(name=args.name, active=True, reader=True)
74 | new_key.key = new_key.generate_key_str()
75 | key.readers.append(new_key)
76 | db.session.add(key)
77 | db.session.add(new_key)
78 | db.session.commit()
79 |
80 | return new_key.jsonify(with_key_str=True), 201
81 |
82 | if (key and key.name == app.config['MASTER_KEY_NAME']) or app.config['PERMIT_NEW']:
83 |             # Permit only simple names (character limit, alphanumeric)
84 |             if not re.match(r"^[a-zA-Z0-9_]+$", args.name) or len(args.name) > 60:
85 |                 abort(422, message="Invalid key name. Must be alphanumeric, 60 characters max.")
86 | # Determine if already exists
87 | key = APIKey.query.filter(APIKey.name == args.name).first()
88 |
89 | if key: abort(403, message="A key already exists with this name.")
90 |
91 | key = APIKey(name=args.name)
92 | key.key = key.generate_key_str()
93 | key.active = True
94 | db.session.add(key)
95 | db.session.commit()
96 |
97 | return key.jsonify(with_key_str=True), 201
98 |
99 | abort(403, message="This server isn't currently generating new keys.")
100 |
101 | @cors
102 | @gzipped
103 | def post(self):
104 | "This method is for updating existing API keys via the master key."
105 |
106 | key = auth(forbid_reader_keys=True)
107 |
108 | parser = reqparse.RequestParser()
109 | parser.add_argument("key",type=str, help="API Key")
110 | parser.add_argument("name",type=str, help="Name associated with the key")
111 | parser.add_argument("permit_new", type=bool, help="Determines whether new API keys can be created.")
112 | parser.add_argument("active", type=bool, help="Determines whether a key is active or not.", default=None)
113 | args = parser.parse_args()
114 |
115 | if key.name != app.config['MASTER_KEY_NAME']: abort(403)
116 |
117 | response={}
118 | subject = None
119 |
120 |         if args.key and args.name:
121 |             subject = APIKey.query.filter(APIKey.key == args.key).first()
122 |             if APIKey.query.filter(APIKey.name == args.name).first():
123 |                 return {'message':"A key already exists with this name."}, 304
124 |             if subject: subject.name = args.name
125 | elif args.name and not args.key:
126 | subject = APIKey.query.filter(APIKey.name == args.name).first()
127 | elif args.key and not args.name:
128 | subject = APIKey.query.filter(APIKey.key == args.key).first()
129 |
130 | if not subject: abort(404)
131 |
132 | if subject.name == app.config['MASTER_KEY_NAME']: abort(403)
133 |         if args.active is not None:
134 | subject.active = args.active
135 |
136 | response['key'] = subject.jsonify(with_key_str=True)
137 | db.session.add(subject)
138 |
139 |         if args.permit_new is not None and key.name == app.config['MASTER_KEY_NAME']:
140 | app.config['PERMIT_NEW'] = args.permit_new
141 | response['system'] = {}
142 | response['system']['permit_new'] = app.config['PERMIT_NEW']
143 |
144 | db.session.commit()
145 | return response
146 |
147 | @cors
148 | @gzipped
149 | def delete(self):
150 | # http://docs.sqlalchemy.org/en/rel_0_9/orm/tutorial.html#configuring-delete-delete-orphan-cascade
151 | key = auth(forbid_reader_keys=True)
152 |
153 | parser = reqparse.RequestParser()
154 | parser.add_argument("key",type=str, help="API Key")
155 | args = parser.parse_args()
156 |
157 | target = APIKey.query.filter(APIKey.key == args.key).first()
158 | if not target: abort(404, message="Unrecognized key.")
159 |
160 | if args.key != key.key and key.name != app.config['MASTER_KEY_NAME']:
161 | abort(403, message="You do not have permission to remove this key.")
162 | if key.name == app.config['MASTER_KEY_NAME'] and args.key == key.key:
163 | abort(403, message="You are attempting to delete the master key.")
164 |
165 | for fg in target.feedgroups: db.session.delete(fg)
166 | for f in target.feeds: db.session.delete(f)
167 | for a in target.articles: db.session.delete(a)
168 |
169 | db.session.delete(target)
170 | db.session.commit()
171 | return {}, 204
172 |
173 | class KeyResource(restful.Resource):
174 |
175 | @cors
176 | @gzipped
177 | def get(self, name):
178 | """
179 | Permit the administrative key to review another key by name.
180 | """
181 | key = auth(forbid_reader_keys=True)
182 | if key.name != app.config['MASTER_KEY_NAME'] and name != key.name:
183 | abort(403)
184 |
185 | target = APIKey.query.filter_by(name=name).first()
186 | if target:
187 | return target.jsonify(feedgroups=True, with_key_str=True)
188 |
189 | abort(404, message="Unrecognised key.")
190 |
--------------------------------------------------------------------------------
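Keys travel in the Authorization header with a Basic prefix, which auth() strips above. A sketch of creating a reader key with the requests library, assuming the collection is routed at /v1/keys (the authoritative route map lives in resources/__init__.py, elided above):

    import requests

    resp = requests.put(
        "https://localhost:6362/v1/keys",  # assumed route
        headers={"Authorization": "Basic your-existing-key"},
        data={"name": "laptop_reader", "reader": "1"},
        verify=False,
    )
    print resp.status_code, resp.json()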
/emissary/resources/articles.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | # This file determines how articles are accessed.
3 | # You may also want to examine the Article class in emissary/models.py
4 | from emissary import db
5 | from flask import request
6 | from flask.ext import restful
7 | from sqlalchemy import desc, and_
8 | from emissary.models import Article
9 | from emissary.resources.api_key import auth
10 | from emissary.controllers.fetch import fetch_feedless_article
11 | from emissary.controllers.utils import make_response, gzipped, cors
12 |
13 | class ArticleCollection(restful.Resource):
14 |
15 | @cors
16 | def get(self):
17 | """
18 | Review all articles associated with this key.
19 | """
20 | key = auth()
21 |
22 | parser = restful.reqparse.RequestParser()
23 | parser.add_argument("page", type=int, default=1)
24 | parser.add_argument("per_page", type=int, default=10)
25 | parser.add_argument("content", type=bool, default=None)
26 | args = parser.parse_args()
27 |
28 | # Construct a query for Articles ordered by descending creation date and paginated.
29 |         if args.content is True:
30 |             query = Article.query.filter(and_(Article.key == key, Article.content != None))\
31 |                 .order_by(desc(Article.created)).paginate(args.page, args.per_page)
32 |         elif args.content is False:
33 | query = Article.query.filter(and_(Article.key == key, Article.content == None))\
34 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
35 | else:
36 | query = Article.query.filter(Article.key == key)\
37 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
38 |
39 | # Attach links to help consuming applications
40 | response = make_response(request.url, query)
41 | return response
42 |
43 | @cors
44 | def put(self):
45 | """
46 | Fetch an article without an associated feed.
47 | """
48 | key = auth()
49 |
50 | parser = restful.reqparse.RequestParser()
51 | parser.add_argument("url", type=str, required=True)
52 | args = parser.parse_args()
53 |
54 | try:
55 | article = fetch_feedless_article(key, args.url)
56 | except Exception, e:
57 | return {"Error": e.message}
58 |
59 | if not article:
60 | return {"Error": "This URL has already been stored."}, 304
61 |
62 | return article.jsonify(), 201
63 |
64 | class ArticleSearch(restful.Resource):
65 |
66 | @cors
67 | def get(self, terms):
68 | """
69 | The /v1/articles/search/ endpoint.
70 | """
71 | key = auth()
72 |
73 | parser = restful.reqparse.RequestParser()
74 | parser.add_argument("page", type=int, help="", default=1)
75 | parser.add_argument("per_page", type=int, help="", default=10)
76 | parser.add_argument("content", type=bool, help="", default=None)
77 | args = parser.parse_args()
78 |
79 |         if args.content is True:
80 | query = Article.query.filter(
81 | and_(
82 | Article.key == key,
83 | Article.content != None,
84 | Article.title.like("%" + terms + "%")
85 | ))\
86 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
87 |
88 | response = make_response(request.url, query)
89 |
90 |             # Some rows match content != None at the SQL level yet still
91 |             # carry no usable text, so they surface here instead of under
92 |             # content == None. Prune anything the serializer marked as
93 |             # unavailable, or drop this filter to keep those rows.
94 |             response['data'] = [
95 |                 doc for doc in response['data']
96 |                 if doc['content_available']
97 |             ]
98 | return response
99 |
100 |         elif args.content is False:
101 | query = Article.query.filter(
102 | and_(
103 | Article.key == key,
104 | Article.content == None,
105 | Article.title.like("%" + terms + "%")
106 | ))\
107 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
108 | return make_response(request.url, query)
109 |
110 | query = Article.query.filter(
111 | and_(Article.key == key, Article.title.like("%" + terms + "%")))\
112 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
113 | return make_response(request.url, query)
114 |
115 | class ArticleResource(restful.Resource):
116 |
117 | @cors
118 | def get(self, uid):
119 | """
120 | Read an article.
121 | """
122 | key = auth()
123 |
124 | article = Article.query.filter(and_(Article.key == key, Article.uid == uid)).first()
125 | if article:
126 | return article.jsonify(summary=True, content=True)
127 |
128 | restful.abort(404)
129 |
130 | @cors
131 | @gzipped
132 | def delete(self, uid):
133 | """
134 | Delete an article.
135 | """
136 | key = auth(forbid_reader_keys=True)
137 |
138 | article = Article.query.filter(and_(Article.key == key, Article.uid == uid)).first()
139 | if article:
140 | db.session.delete(article)
141 | db.session.commit()
142 | return {}
143 |
144 | restful.abort(404)
145 |
146 | class ArticleCount(restful.Resource):
147 |
148 | @cors
149 | def get(self):
150 | """
151 |         Return the number of articles belonging to an API key.
152 | """
153 | key = auth()
154 | return len(key.articles)
155 |
156 |
--------------------------------------------------------------------------------
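A sketch of walking the paginated listing with the page/per_page parameters parsed above, using the same Client as the REPL (key is a placeholder):

    from emissary.client import Client

    c = Client("", "https://localhost:6362/v1/", verify=False)
    c.key = "your-api-key-here"
    page = 1
    while True:
        res, status = c.get("articles?page=%i&per_page=25" % page)
        if status != 200:
            break
        for article in res["data"]:
            print "%s %s" % (article["uid"], article["title"])
        if "next" not in res["links"]:
            break
        page += 1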
/emissary/resources/feedgroups.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | # This file provides the HTTP endpoints for operating on groups of feeds.
3 | from emissary import app, db
4 | from flask import request
5 | from flask.ext import restful
6 | from sqlalchemy import and_, desc
7 | from emissary.resources.api_key import auth
8 | from emissary.models import FeedGroup, Feed, Article
9 | from emissary.controllers.cron import CronError, parse_timings
10 | from emissary.controllers.utils import cors, gzipped, make_response
11 |
12 | class FeedGroupCollection(restful.Resource):
13 |
14 | @cors
15 | @gzipped
16 | def get(self):
17 | """
18 | Paginate an array of feed groups
19 | associated with the requesting key.
20 | """
21 | key = auth()
22 |
23 | parser = restful.reqparse.RequestParser()
24 | parser.add_argument("page", type=int, default=1)
25 | parser.add_argument("per_page", type=int, default=10)
26 | parser.add_argument("content", type=bool, default=None)
27 | args = parser.parse_args()
28 |
29 | query = FeedGroup.query.filter(FeedGroup.key == key)\
30 | .order_by(desc(FeedGroup.created)).paginate(args.page, args.per_page)
31 |
32 | return make_response(request.url, query)
33 |
34 | @cors
35 | @gzipped
36 | def put(self):
37 | """
38 | Create a new feed group, providing the name isn't already in use.
39 | """
40 | key = auth(forbid_reader_keys=True)
41 |
42 | parser = restful.reqparse.RequestParser()
43 | parser.add_argument("name", type=str, required=True)
44 | parser.add_argument("active", type=bool, default=True, help="Feed is active", required=False)
45 | args = parser.parse_args()
46 |
47 | # Check for this name already existing in the groups on this key
48 | if [fg for fg in key.feedgroups if fg.name == args.name]:
49 | return {"message":"Feed group %s already exists." % args.name}, 304
50 |
51 | fg = FeedGroup(name=args.name, active=args.active)
52 | key.feedgroups.append(fg)
53 | db.session.add(fg)
54 | db.session.add(key)
55 | db.session.commit()
56 |
57 | return fg.jsonify(), 201
58 |
59 | class FeedGroupResource(restful.Resource):
60 |
61 | @cors
62 | @gzipped
63 | def get(self, groupname):
64 | """
65 | Review a specific feed group.
66 | """
67 | key = auth()
68 |
69 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
70 | if not fg:
71 | restful.abort(404)
72 | return fg.jsonify()
73 |
74 | @cors
75 | @gzipped
76 | def put(self, groupname):
77 | """
78 | Create a new feed providing the name and url are unique.
79 | Feeds must be associated with a group.
80 | """
81 | key = auth(forbid_reader_keys=True)
82 |
83 | parser = restful.reqparse.RequestParser()
84 | parser.add_argument("name", type=str, required=True)
85 | parser.add_argument("url", type=str, required=True)
86 | parser.add_argument("schedule", type=str, required=True)
87 | parser.add_argument("active", type=bool, default=True, help="Feed is active", required=False)
88 | args = parser.parse_args()
89 |
90 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
91 | if not fg:
92 | return {"message":"Unknown Feed Group %s" % groupname}, 304
93 |
94 | # Verify the schedule
95 | try:
96 | parse_timings(args.schedule)
97 | except CronError, err:
98 | return {"message": err.message}, 500
99 |
100 | # Check the URL isn't already scheduled on this key
101 | if [feed for feed in key.feeds if feed.url == args.url]:
102 | return {"message": "A feed on this key already exists with this url."}, 500
103 |
104 | # Check the name is unique to this feedgroup
105 | if [feed for feed in fg.feeds if feed.name == args.name]:
106 | return {"message": "A feed in this group already exists with this name."}, 500
107 |
108 | feed = Feed(name=args.name, url=args.url, schedule=args.schedule, active=args.active)
109 |
110 | # We generally don't want to have objects in this system that don't belong to API keys.
111 | fg.feeds.append(feed)
112 | key.feeds.append(feed)
113 |
114 | db.session.add(feed)
115 | db.session.add(fg)
116 | db.session.add(key)
117 | db.session.commit()
118 |
119 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == args.name)).first()
120 | if not feed:
121 | return {"message":"Error saving feed."}, 304
122 |
123 | # Schedule this feed. 0 here is a response
124 | # queue ID (we're not waiting for a reply)
125 | app.inbox.put([0, "start", [key,feed.name]])
126 | return feed.jsonify(), 201
127 |
128 | @cors
129 | @gzipped
130 | def post(self, groupname):
131 | "Rename a feedgroup or toggle active status"
132 |
133 | key = auth(forbid_reader_keys=True)
134 |
135 | parser = restful.reqparse.RequestParser()
136 | parser.add_argument("name", type=str, help="Rename a feed group",)
137 | parser.add_argument("active", type=bool, default=None)
138 | args = parser.parse_args()
139 |
140 | fg = FeedGroup.query.filter(
141 | and_(FeedGroup.key == key, FeedGroup.name == groupname)
142 | ).first()
143 | if not fg:
144 | restful.abort(404)
145 |
146 | if args.name:
147 | if FeedGroup.query.filter(
148 | and_(FeedGroup.key == key, FeedGroup.name == args.name)
149 | ).first():
150 | return {"message":"A feed already exists with this name."}, 304
151 | fg.name = args.name
152 |
153 |         if args.active is not None:
154 | fg.active = args.active
155 |
156 | db.session.add(fg)
157 | db.session.commit()
158 | return fg.jsonify()
159 |
160 | @cors
161 | @gzipped
162 | def delete(self, groupname):
163 | key = auth(forbid_reader_keys=True)
164 |
165 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
166 | if not fg:
167 | restful.abort(404)
168 |         count = 0
169 | for feed in fg.feeds:
170 | for article in feed.articles:
171 | count += 1
172 | db.session.delete(article)
173 | db.session.delete(feed)
174 | db.session.delete(fg)
175 | db.session.commit()
176 | count = "{:,}".format(count)
177 |         app.log('%s: Deleted feed group "%s". (%s articles)' % (key.name, groupname, count)) # fg is expired after the commit.
178 |
179 | return {}
180 |
181 | class FeedGroupArticles(restful.Resource):
182 |
183 | @cors
184 | def get(self, groupname):
185 | """
186 | Retrieve articles by feedgroup.
187 | """
188 | key = auth()
189 |
190 | # Summon the group or 404.
191 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
192 | if not fg: restful.abort(404)
193 |
194 | parser = restful.reqparse.RequestParser()
195 | parser.add_argument("page", type=int, default=1)
196 | parser.add_argument("per_page", type=int, default=10)
197 | parser.add_argument("content", type=bool, default=None)
198 | args = parser.parse_args()
199 |
200 |         if args.content is True:
201 |
202 |             query = Article.query.filter(
203 |                 and_(Article.feed.has(group=fg), Article.content != None))\
204 |                 .order_by(desc(Article.created)).paginate(args.page, args.per_page)
205 |
206 |             response = make_response(request.url, query)
207 |
208 |             # Pruning rows without content is left disabled here; see the
209 |             # equivalent filter in resources/articles.py.
210 |             return response
211 |
212 |
213 |         if args.content is False:
214 | query = Article.query.filter(
215 | and_(Article.feed.has(group=fg), Article.content == None))\
216 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
217 |
218 | return make_response(request.url, query)
219 |
220 | query = Article.query.filter(
221 | Article.feed.has(group=fg))\
222 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
223 |
224 | return make_response(request.url, query)
225 |
226 | class FeedGroupStart(restful.Resource):
227 |
228 | @cors
229 | def post(self, groupname):
230 | """
231 | Start all feeds within a group.
232 | """
233 | key = auth(forbid_reader_keys=True)
234 |
235 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
236 | if not fg:
237 | restful.abort(404)
238 |
239 | for feed in fg.feeds:
240 | app.inbox.put([0, "start", [key,feed.name]])
241 | return {}
242 |
243 | class FeedGroupStop(restful.Resource):
244 |
245 | def post(self, groupname):
246 | key = auth(forbid_reader_keys=True)
247 |
248 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
249 | if not fg:
250 | restful.abort(404)
251 |
252 | for feed in fg.feeds:
253 | app.inbox.put([0, "stop", [key,feed.name]])
254 | return {}
255 |
256 | class FeedGroupSearch(restful.Resource):
257 |
258 | def get(self, groupname, terms):
259 | """
260 | Return articles on feeds in this group with our search terms in the title.
261 | """
262 | key = auth()
263 |
264 | parser = restful.reqparse.RequestParser()
265 | parser.add_argument("page", type=int, default=1)
266 | parser.add_argument("per_page", type=int, default=10)
267 | # parser.add_argument("content", type=bool, default=None)
268 | args = parser.parse_args()
269 |
270 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
271 | if not fg:
272 | restful.abort(404)
273 |
274 | query = Article.query.filter(
275 | and_(Article.feed.has(group=fg), Article.title.like("%" + terms + "%")))\
276 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
277 | return make_response(request.url, query)
278 |
279 | class FeedGroupCount(restful.Resource):
280 |
281 | def get(self, groupname):
282 | key = auth()
283 |
284 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
285 | if not fg:
286 | restful.abort(404)
287 |
288 | return sum(len(f.articles) for f in fg.feeds)
289 |
--------------------------------------------------------------------------------
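Putting the two PUT endpoints together: a sketch that creates a group and schedules a feed inside it, assuming routes of the form /v1/feedgroups and /v1/feedgroups/<name> (see resources/__init__.py for the authoritative map) and a five-field schedule accepted by parse_timings:

    import requests

    auth = {"Authorization": "Basic your-api-key"}  # placeholder key
    base = "https://localhost:6362/v1/"

    requests.put(base + "feedgroups", headers=auth, verify=False,
                 data={"name": "HN"})
    requests.put(base + "feedgroups/HN", headers=auth, verify=False,
                 data={"name": "Hacker News",
                       "url": "https://news.ycombinator.com/rss",
                       "schedule": "0 * * * *"})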
/emissary/resources/feeds.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | # This file provides the HTTP endpoints for operating on individual feeds
3 | from emissary import app, db
4 | from flask import request
5 | from flask.ext import restful
6 | from sqlalchemy import desc, and_
7 | from emissary.models import Feed, FeedGroup, Article
8 | from emissary.resources.api_key import auth
9 | from emissary.controllers.cron import CronError, parse_timings
10 | from emissary.controllers.utils import make_response, gzipped, cors
11 |
12 | class FeedResource(restful.Resource):
13 |
14 | @cors
15 | @gzipped
16 | def get(self, groupname, name):
17 | """
18 | Review a feed.
19 | """
20 | key = auth()
21 |
22 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first()
23 | if feed:
24 | return feed.jsonify()
25 | restful.abort(404)
26 |
27 | @cors
28 | @gzipped
29 | def post(self, groupname, name):
30 | """
31 | Modify an existing feed.
32 | """
33 | key = auth(forbid_reader_keys=True)
34 |
35 | parser = restful.reqparse.RequestParser()
36 | parser.add_argument("name", type=str)
37 | parser.add_argument("group", type=str)
38 | parser.add_argument("url", type=str)
39 | parser.add_argument("schedule", type=str)
40 | parser.add_argument("active", type=bool, default=None, help="Feed is active")
41 | args = parser.parse_args()
42 |
43 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first()
44 | if not feed:
45 | restful.abort(404)
46 |
47 | if args.name:
48 | if Feed.query.filter(and_(Feed.key == key, Feed.name == args.name)).first():
49 | return {"message":"A feed already exists with this name."}, 304
50 | feed.name = args.name
51 |
52 |         if args.group:
53 |             pass # TODO: moving a feed to another group isn't implemented yet.
54 |
55 |         if args.active is not None:
56 | feed.active = args.active
57 |
58 | if args.url:
59 | feed.url = args.url
60 |
61 | if args.schedule:
62 | try:
63 | parse_timings(args.schedule)
64 | except CronError, err:
65 | return {"message": err.message}, 500
66 | feed.schedule = args.schedule
67 |
68 | db.session.add(feed)
69 | db.session.commit()
70 |
71 | if args.url or args.schedule:
72 | app.inbox.put([0, "stop", [feed.key, feed.name]])
73 | app.inbox.put([0, "start", [feed.key, feed.name]])
74 |
75 | return feed.jsonify()
76 |
77 | @cors
78 | @gzipped
79 | def delete(self, groupname, name):
80 | """
81 | Halt and delete a feed.
82 | Default to deleting its articles.
83 | """
84 | key = auth(forbid_reader_keys=True)
85 | feed = Feed.query.filter(and_(Feed.key == key, Feed.name == name)).first()
86 | if not feed:
87 | restful.abort(404)
88 | app.inbox.put([0, "stop", [key, feed.name]])
89 | app.log('%s: %s: Deleting feed "%s".' % (feed.key.name, feed.group.name, feed.name))
90 | for a in feed.articles:
91 | db.session.delete(a)
92 |
93 | db.session.delete(feed)
94 | db.session.commit()
95 |
96 | return {}
97 |
98 | class FeedArticleCollection(restful.Resource):
99 |
100 | @cors
101 | def get(self, groupname, name):
102 | """
103 | Review the articles for a specific feed on this key.
104 | """
105 | key = auth()
106 |
107 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first()
108 |         if not feed: restful.abort(404)
109 |
110 | parser = restful.reqparse.RequestParser()
111 | parser.add_argument("page", type=int, default=1)
112 | parser.add_argument("per_page", type=int, default=10)
113 | parser.add_argument("content", type=bool, default=None)
114 | args = parser.parse_args()
115 |
116 | # Return a list of the JSONified Articles ordered by descending creation date and paginated.
117 |         if args.content is True:
118 | query = Article.query.filter(and_(Article.key == key, Article.content != None, Article.feed == feed))\
119 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
120 |
121 | return make_response(request.url, query)
122 |
123 |         elif args.content is False:
124 | query = Article.query.filter(and_(Article.key == key, Article.content == None, Article.feed == feed))\
125 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
126 |
127 | return make_response(request.url, query)
128 |
129 | query = Article.query.filter(and_(Article.key == key, Article.feed == feed))\
130 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
131 |
132 | return make_response(request.url, query)
133 |
134 | class FeedSearch(restful.Resource):
135 |
136 | @cors
137 | def get(self, groupname, name, terms):
138 | """
139 | Search for articles within a feed.
140 | """
141 | key = auth()
142 |
143 | parser = restful.reqparse.RequestParser()
144 | parser.add_argument("page", type=int, default=1)
145 | parser.add_argument("per_page", type=int, default=10)
146 | # parser.add_argument("content", type=bool, default=None)
147 | args = parser.parse_args()
148 |
149 | fg = FeedGroup.query.filter(and_(FeedGroup.key == key, FeedGroup.name == groupname)).first()
150 | if not fg:
151 | restful.abort(404)
152 |
153 | f = [f for f in fg.feeds if f.name == name]
154 |         if not f: restful.abort(404)
155 |
156 | f = f[0]
157 |
158 | query = Article.query.filter(
159 | and_(Article.feed == f, Article.title.like("%" + terms + "%")))\
160 | .order_by(desc(Article.created)).paginate(args.page, args.per_page)
161 |
162 | return make_response(request.url, query)
163 |
164 | class FeedStartResource(restful.Resource):
165 |
166 | @cors
167 | def post(self, groupname, name):
168 | key = auth(forbid_reader_keys=True)
169 |
170 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first()
171 | if feed:
172 | app.inbox.put([0, "start", [key, feed.name]])
173 | return feed.jsonify()
174 | restful.abort(404)
175 |
176 | class FeedStopResource(restful.Resource):
177 |
178 | @cors
179 | def post(self, groupname, name):
180 | key = auth(forbid_reader_keys=True)
181 |
182 | feed = Feed.query.filter(and_(Feed.name == name, Feed.key == key)).first()
183 | if feed:
184 | app.inbox.put([0, "stop", [key, feed.name]])
185 | return feed.jsonify()
186 | restful.abort(404)
187 |
188 |
--------------------------------------------------------------------------------
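The start/stop resources above talk to the feed manager through app.inbox using [response_queue_id, command, payload] messages, where a queue id of 0 means no reply is expected (compare Feed.is_running() in models.py, which does wait on a reply). A runnable toy model of the convention:

    from multiprocessing import Queue

    inbox = Queue()
    inbox.put([0, "start", ["some-key", "Hacker News"]])  # fire and forget
    inbox.put([0, "stop", ["some-key", "Hacker News"]])

    while not inbox.empty():
        qid, command, payload = inbox.get()
        print "%s %s %s" % (qid, command, payload)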
/emissary/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # _*_ coding: utf-8 _*_
3 |
4 | # The reason we don't patch threading is that
5 | # our IPC queues rely on it for locking. We can't have them
6 | # be greenlets, otherwise they would need the HTTPD to yield
7 | # before data from the fetch process could be transmitted.
8 | from gevent import monkey; monkey.patch_all(thread=False)
9 | import gevent
10 | from gevent.queue import Queue
11 | from gevent.socket import socket
12 | from gevent.wsgi import WSGIServer
13 |
14 | import os
15 | import sys
16 | import pwd
17 | import time
18 | import signal
19 | import _socket
20 | import optparse
21 | from multiprocessing import Process
22 |
23 | from emissary import app, init, db
24 | from emissary.models import APIKey
25 | from emissary.controllers.log import Log
26 | from emissary.controllers.scripts import Scripts
27 | from emissary.controllers.load import parse_crontab
28 | from emissary.controllers.manager import FeedManager
29 |
30 | try:
31 | import setproctitle
32 | setproctitle.setproctitle("emissary")
33 | except ImportError:
34 | pass
35 |
36 | def Daemonise(pidfile):
37 | try:
38 | pid = os.fork()
39 | if pid > 0:
40 | sys.exit(0) # End parent
41 | except OSError, e:
42 | sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
43 | sys.exit(-2)
44 | os.setsid()
45 | os.umask(0)
46 | try:
47 | pid = os.fork()
48 | if pid > 0:
49 | try:
50 | # TODO: Read the file first and determine if already running.
51 |                 f = open(pidfile, 'w')
52 | f.write(str(pid))
53 | f.close()
54 | except IOError, e:
55 |                 # No logger exists this early; report to stderr.
56 |                 sys.stderr.write(repr(e))
57 | sys.exit(0) # End parent
58 | except OSError, e:
59 | sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
60 | sys.exit(-2)
61 | for fd in (0, 1, 2):
62 | try:
63 | os.close(fd)
64 | except OSError:
65 | pass
66 |
67 | def export_crontab(filename):
68 | """
69 | Defined here to prevent circular imports.
70 | """
71 | crontab = ""
72 | fd = open(filename, "w")
73 | keys = [k for k in APIKey.query.all() if not k.reader]
74 | for key in keys:
75 | crontab += "apikey: %s\n\n" % key.key
76 | for feed in key.feeds:
77 | crontab += '%s "%s" "%s" %s\n' % (feed.url, feed.name, feed.group.name, feed.schedule)
78 | crontab += '\n\n'
79 | fd.write(crontab)
80 | fd.close()
81 |
82 | if __name__ == "__main__":
83 | prog = "Emissary"
84 | description = "A microservice for archiving the news."
85 | epilog = "Psybernetics %s." % time.asctime().split()[-1]
86 |     parser = optparse.OptionParser(prog=prog, version=app.version, description=description, epilog=epilog)
87 |
88 | parser.set_usage('python -m emissary.run [options]')
89 | parser.add_option("-c", "--crontab", dest="crontab", action="store", default=None, help="Crontab to parse")
90 | parser.add_option("--config", dest="config", action="store", default=None, help="(defaults to emissary.config)")
91 | parser.add_option("-a", "--address", dest="address", action="store", default='0.0.0.0', help="(defaults to 0.0.0.0)")
92 | parser.add_option("-p", "--port", dest="port", action="store", default='6362', help="(defaults to 6362)")
93 | parser.add_option("--key", dest="key", action="store", default=None, help="SSL key file")
94 | parser.add_option("--cert", dest="cert", action="store", default=None, help="SSL certificate")
95 | parser.add_option("--export", dest="export", action="store", default=False, help="Write out current database as a crontab")
96 | parser.add_option("--pidfile", dest="pidfile", action="store", default="emissary.pid", help="(defaults to ./emissary.pid)")
97 | parser.add_option("--logfile", dest="logfile", action="store", default="emissary.log", help="(defaults to ./emissary.log)")
98 | parser.add_option("--stop", dest="stop", action="store_true", default=False)
99 | parser.add_option("--debug", dest="debug", action="store_true", default=False, help="Log to stdout")
100 | parser.add_option("-d", dest="daemonise", action="store_true", default=False, help="Run in the background")
101 | parser.add_option("--run-as", dest="run_as", action="store",default=None, help="(defaults to the invoking user)")
102 | parser.add_option("--scripts-dir", dest="scripts_dir", action="store", default="scripts", help="(defaults to ./scripts/)")
103 | (options,args) = parser.parse_args()
104 |
105 | if options.config:
106 | app.config.from_object(options.config)
107 |
108 | if options.crontab:
109 | parse_crontab(options.crontab)
110 | raise SystemExit
111 |
112 | app.debug = options.debug
113 |
114 |     # Build the logger from the command-line options.
115 |     log = Log("Emissary", log_file=options.logfile, log_stdout=not options.daemonise)
116 | log.debug = options.debug
117 | app.log = log
118 |
119 | log("Starting Emissary %s." % app.version)
120 |
121 | if options.stop:
122 |         pids = []
123 |         try:
124 |             f = open(options.pidfile, 'r')
125 | pids = f.readline().split()
126 | f.close()
127 | os.unlink(options.pidfile)
128 | except ValueError, e:
129 | sys.stderr.write('Error in pid file "%s". Aborting\n' % options.pidfile)
130 | sys.exit(-1)
131 | except IOError, e:
132 | pass
133 | if pids:
134 | for pid in pids:
135 | os.kill(int(pid), 15)
136 | print "Killed process with ID %s." % pid
137 | else:
138 | sys.stderr.write('Emissary not running or no PID file found\n')
139 | sys.exit(0)
140 |
141 | if options.export:
142 | try:
143 |             export_crontab(options.export)
144 |             log('Crontab written to "%s".' % options.export)
145 | except Exception, e:
146 | log('Error writing crontab: %s' % e.message)
147 | raise SystemExit
148 |
149 |
150 |     if not options.key or not options.cert:
151 | print "SSL cert and key required. (--key and --cert)"
152 | print "Keys and certs can be generated with:"
153 | print "$ openssl genrsa 1024 > key"
154 | print "$ openssl req -new -x509 -nodes -sha1 -days 365 -key key > cert"
155 | raise SystemExit
156 |
157 | if '~' in options.cert: options.cert = os.path.expanduser(options.cert)
158 | if '~' in options.key: options.key = os.path.expanduser(options.key)
159 |
160 | if not os.path.isfile(options.cert):
161 | sys.exit("Certificate not found at %s" % options.cert)
162 |
163 | if not os.path.isfile(options.key):
164 | sys.exit("Key not found at %s" % options.key)
165 |
166 |     if os.getuid() == 0 and not options.run_as:
167 | print "Running as root is not permitted.\nExecute this as a different user."
168 | raise SystemExit
169 |
170 | sock = (options.address, int(options.port))
171 |
172 | if options.run_as:
173 | sock = socket(family=_socket.AF_INET)
174 | try:
175 | sock.bind((options.address, int(options.port)))
176 | except _socket.error:
177 | ex = sys.exc_info()[1]
178 | strerror = getattr(ex, 'strerror', None)
179 | if strerror is not None:
180 | ex.strerror = strerror + ': ' + repr(options.address+':'+options.port)
181 | raise
182 | sock.listen(50)
183 | sock.setblocking(0)
184 | uid = pwd.getpwnam(options.run_as)[2]
185 | try:
186 | os.setuid(uid)
187 | log("Now running as %s." % options.run_as)
188 |         except OSError, e: sys.exit("Unable to switch to user %s: %s" % (options.run_as, e))
189 |
190 | # Create the database schema and insert an administrative key
191 | init()
192 |
193 | if options.daemonise: Daemonise(options.pidfile)
194 |
195 | # Load scripts
196 | app.scripts = Scripts(options.scripts_dir)
197 | app.scripts.reload()
198 |
199 | # Trap SIGHUP to reload scripts
200 | signal.signal(signal.SIGHUP, app.scripts.reload)
201 |
202 |
203 | # Initialise the feed manager with the logger, provide IPC access and load feeds.
204 | fm = FeedManager(log)
205 | fm.db = db
206 | fm.app = app # Queue access
207 | fm.load_feeds()
208 |
209 | # Start the REST interface
210 | httpd = WSGIServer(sock, app, certfile=options.cert, keyfile=options.key)
211 | httpd.loop.reinit()
212 | httpd_process = Process(target=httpd.serve_forever)
213 | log("Binding to %s:%s" % (options.address, options.port))
214 | httpd_process.start()
215 |
216 | if options.daemonise:
217 |         f = open(options.pidfile, 'a')
218 | f.write(' %i' % httpd_process.pid)
219 | f.close()
220 |
221 | try:
222 | fm.run()
223 | except KeyboardInterrupt:
224 | log("Stopping...")
225 | httpd_process.terminate()
226 |
--------------------------------------------------------------------------------
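
The header comment in run.py deserves unpacking: the HTTPD is started in a
separate process while the feed manager runs in the parent, so the queues
carrying commands between them rely on real thread locks. Monkey-patching
threading would turn those locks into greenlets that only make progress when
the HTTPD yields. A minimal sketch of the same pattern, independent of
Emissary:

    # gevent with threading left unpatched, plus a child process.
    from gevent import monkey; monkey.patch_all(thread=False)
    from multiprocessing import Process, Queue  # uses real threads internally

    def worker(q):
        q.put("fetched article")  # the child hands data back to the parent

    if __name__ == "__main__":
        q = Queue()
        p = Process(target=worker, args=(q,))
        p.start()
        print q.get()  # arrives without waiting on a greenlet to yield
        p.join()
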
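export_crontab also documents, by construction, the crontab format that
parse_crontab consumes: a line naming an API key, then one line per feed
giving its URL, quoted name, quoted group and schedule. With placeholder
values (see emissary/controllers/load.py for the accepted schedule syntax),
an exported file looks roughly like:

    apikey: <an administrative API key>

    http://feeds.bbci.co.uk/news/rss.xml "bbc" "news" <schedule>
    http://example.com/links.html "links" "reading" <schedule>
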
/scripts/hello.py:
--------------------------------------------------------------------------------
1 | # _*_ coding: utf-8 _*_
2 | #
3 | # This script creates a named pipe (if it doesn't exist)
4 | # and writes the feed name, article title and url to it
5 | # whenever an article is saved to the database.
6 | #
7 | # This is useful for composing systems that constantly read
8 | # the FIFO and do things like emit the data to IRC channels.
9 | #
10 | # You could, for instance, perform fuzzy pattern matching and be
11 | # notified when certain keywords are in the news.
12 | #
13 | # Transmission to a natural language processing/translation service
14 | # can also be done in a script or by reading a FIFO like the one here.
15 | #
16 | # Whether you use this system to profit, perform intelligence analysis
17 | # or inform your next vote is hopefully up to you!
18 | #
19 | # Luke Brooks, 2015
20 | # MIT License
21 | # Many big thanks to God, lord of universes.
22 | fifo = "/tmp/emissary.pipe"
23 |
24 | import os, stat
25 | if not os.path.exists(fifo):
26 | try:
27 | os.mkfifo(fifo)
28 | except Exception, e:
29 | cache['app'].log("Error creating %s: %s" % (fifo, e.message))
30 |
31 | # Emissary always executes scripts with an article and its feed in the namespace.
32 |
33 | # There is also a dictionary named cache, containing the app object.
34 | # Through the app object you can access the logging interface and the feed manager.
35 | try:
36 |     # READER BEWARE: use non-blocking IO here, or a missing reader will block the write and nothing gets stored.
37 | fd = os.open(fifo, os.O_CREAT | os.O_WRONLY | os.O_NONBLOCK)
38 | os.write(fd, "%s: %s\n%s\n" % (feed.name, article.title, article.url))
39 | os.close(fd)
40 | del fd
41 | except Exception, e: # Usually because no reader currently has the FIFO open.
42 | pass
43 |
44 | del os, stat, fifo
45 |
--------------------------------------------------------------------------------
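
The comments in hello.py suggest pairing it with a long-running process on
the read side of the pipe. A minimal companion reader, assuming the same
/tmp/emissary.pipe path: open() blocks until hello.py writes, and EOF arrives
each time the script closes its end, so the loop simply reopens and waits.

    # Minimal reader for the FIFO written by scripts/hello.py.
    fifo = "/tmp/emissary.pipe"

    while True:
        f = open(fifo)    # blocks until a writer opens the other end
        for line in f:
            print line,   # e.g. "bbc: Some headline" then the URL
        f.close()         # writer went away; reopen and wait again
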
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # _*_ coding: utf-8 _*_
3 | from setuptools import setup, find_packages
4 | import os
5 | import imp
6 |
7 | def non_python_files(path):
8 | """ Return all non-python-file filenames in path """
9 | result = []
10 | all_results = []
11 | module_suffixes = [info[0] for info in imp.get_suffixes()]
12 | ignore_dirs = ['cvs']
13 | for item in os.listdir(path):
14 | name = os.path.join(path, item)
15 | if (
16 | os.path.isfile(name) and
17 | os.path.splitext(item)[1] not in module_suffixes
18 | ):
19 | result.append(name)
20 | elif os.path.isdir(name) and item.lower() not in ignore_dirs:
21 | all_results.extend(non_python_files(name))
22 | if result:
23 | all_results.append((path, result))
24 | return all_results
25 |
26 | data_files = (
27 | # non_python_files('emissary') +
28 | # non_python_files(os.path.join('Emissary', 'doc'))
29 | )
30 |
31 | setup(name='Emissary',
32 | version="2.1.1",
33 | description='A microservice for indexing the plain text of articles and essays',
34 | author='Luke Brooks',
35 | author_email='luke@psybernetics.org.uk',
36 | url='http://psybernetics.org.uk/emissary',
37 |       download_url = 'https://github.com/LukeB42/Emissary/tarball/2.1.1',
38 | data_files = data_files,
39 | packages=['emissary', 'emissary.resources', 'emissary.controllers'],
40 | include_package_data=True,
41 | install_requires=[
42 | "setproctitle",
43 | "goose-extractor",
44 | "lxml",
45 | "gevent",
46 | "Flask-RESTful",
47 | "Flask-SQLAlchemy",
48 | "cssselect",
49 | "BeautifulSoup",
50 | "feedparser",
51 | "python-snappy",
52 | "requests",
53 | "pygments",
54 | "window",
55 | ],
56 | keywords=["text extraction","document archival","document retrieval"]
57 | )
58 |
--------------------------------------------------------------------------------
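
For reference, non_python_files in setup.py returns (directory, [files])
pairs in the shape setuptools' data_files expects; re-enabling the
commented-out calls in data_files would bundle the non-Python assets, such
as the screenshots under doc. Illustratively, non_python_files('doc') would
return something like:

    [('doc', ['doc/emissary2.png', 'doc/emissary3.png',
              'doc/emissary4.png', 'doc/emissary5.png'])]
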