14 | Read More
15 | {% if e.read_later %}
16 | {% else %}
17 | {% endif %}
18 | {% endfor %}
19 |
20 | {% endblock %}
21 |
--------------------------------------------------------------------------------
/tweets/views.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # NOTE: no longer tested or maintained due to change in Twitter/X API
3 |
4 | from __future__ import unicode_literals
5 |
6 | from django.http import Http404, JsonResponse
7 | from django.views.generic.detail import DetailView
8 | from django.views.generic.list import ListView
9 | from django_filters.views import FilterView
10 |
11 | from tweets.models import Tweet
12 |
13 | from datetime import datetime
14 | import django_filters
15 |
16 |
17 | class TweetFilter(django_filters.FilterSet):
18 |
19 | class Meta:
20 | model = Tweet
21 | fields = ['read', 'read_later', 'published', 'twitter_list__name']
22 |
23 |
24 | class TweetList(FilterView):
25 | template_name = "entry_list.html"
26 | paginate_by = 30
27 | model = Tweet
28 | context_object_name = 'entry_list'
29 | filterset_class = TweetFilter
30 |
31 |
32 | class TweetDetailView(DetailView):
33 | model = Tweet
34 | template_name = "entry_detail.html"
35 |
--------------------------------------------------------------------------------
/feeds/models.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models
5 | from base import Entry
6 |
7 | class FeedCategory(models.Model):
8 | name = models.CharField(max_length=100)
9 |
10 | def __str__(self):
11 | return u'Category: {}'.format(self.name)
12 |
13 | class Feed(models.Model):
14 | title = models.CharField(max_length=255)
15 | url = models.URLField()
16 | is_alive = models.BooleanField(default=True)
17 | category = models.ForeignKey(FeedCategory,
18 | on_delete=models.SET_NULL,
19 | null=True)
20 | updated = models.DateTimeField(null=True, default=None)
21 | last_entry = models.DateTimeField(null=True, default=None)
22 |
23 | def __str__(self):
24 | return u'Feed: {}'.format(self.title)
25 |
26 | class Meta:
27 | ordering = ('-last_entry',)
28 |
29 | class FeedEntry(Entry):
30 | feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
31 |
32 | @property
33 | def entry_type(self):
34 | return "feeds"
35 |
36 | @property
37 | def source(self):
38 | return self.feed.title
39 |
--------------------------------------------------------------------------------
/example_env:
--------------------------------------------------------------------------------
1 | DEBUG=True # set to false for production
2 |
3 | SECRET_KEY=_pleaseputalongrandomstringhere_
4 | ALLOWED_HOSTS=127.0.0.1,YOUR_SERVER_IP_OR_DNS_HERE
5 | DB_USERNAME=changethisplease
6 | DB_PASSWORD=changethisplease
7 | DB_NAME=changethisplease
8 | DB_CONNSTR=enter_your_db_connection_string
9 |
10 | # These differ from the DB_* values above because of a redundancy I wanted in my setup; you can make them the same again in the settings.
11 | LOCAL_DB_USERNAME=changethisplease
12 | LOCAL_DB_PASSWORD=changethisplease
13 | LOCAL_DB_NAME=changethisplease
14 | LOCAL_DB_CONNSTR=enter_your_db_connection_string
15 |
16 |
17 | # For the following, you'll need a Reddit account and Reddit API credentials.
18 | REDDIT_USERNAME=____
19 | REDDIT_CLIENT_ID=____
20 | REDDIT_CLIENT_SECRET=____
21 | REDDIT_PASSWORD=____
22 | REDDIT_USER_AGENT="my version of priveedly via praw"
23 |
24 | PIPELINE_FILE=___enter the path to your pipeline file if you trained your own via scikit-learn___
25 |
26 |
27 | # I don't know if this works anymore :(
28 | TWITTER_CONSUMER_KEY=____
29 | TWITTER_CONSUMER_SECRET=____
30 | TWITTER_ACCESS_TOKEN=____
31 | TWITTER_ACCESS_TOKEN_SECRET=____
32 |
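One way to fill in SECRET_KEY above is Django's bundled generator, since Django is already a dependency. A quick sketch (run in any Python shell with the project's environment active):

    # Prints a random string suitable for SECRET_KEY.
    from django.core.management.utils import get_random_secret_key
    print(get_random_secret_key())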
--------------------------------------------------------------------------------
/priveedly/urls.py:
--------------------------------------------------------------------------------
1 |
2 | from django.urls import include, re_path
3 | from django.contrib import admin
4 |
5 | from feeds.views import EntryList, EntryDetailView, main_feed, read_later_feed, recommended_feed, mark_read, mark_read_later, mark_interesting, unmark_read_later
6 | from sites.views import SiteList, SiteDetailView, RedditList, RedditDetailView
7 |
8 | urlpatterns = [
9 | re_path(r'^admin/', admin.site.urls),
10 | re_path(r'^accounts/', include("django.contrib.auth.urls")),
11 | re_path(r'^$', main_feed),
12 | re_path(r'^read-later/$', read_later_feed),
13 | re_path(r'^recommended/$', recommended_feed),
14 | re_path(r'^feeds/$', EntryList.as_view()),
15 |     re_path(r'^feeds/(?P<pk>\d+)/$', EntryDetailView.as_view()),
16 | re_path(r'^sites/$', SiteList.as_view()),
17 |     re_path(r'^sites/(?P<pk>\d+)/$', SiteDetailView.as_view()),
18 | re_path(r'^reddit/$', RedditList.as_view()),
19 |     re_path(r'^reddit/(?P<pk>\d+)/$', RedditDetailView.as_view()),
20 | re_path(r'^feeds/mark-read/', mark_read),
21 | re_path(r'^feeds/mark-interesting/', mark_interesting),
22 | re_path(r'^feeds/mark-read-later/', mark_read_later),
23 | re_path(r'^feeds/unmark-read-later/', unmark_read_later),
24 | ]
25 |
--------------------------------------------------------------------------------
/feeds/management/commands/parse_all.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand
2 | from feeds.management_commands import parse_all_feeds
3 | from sites.management_commands import parse_lobsters, parse_hackernews, parse_all_subreddits
4 |
5 | from datetime import datetime
6 | import logging
7 |
8 |
9 | logging.basicConfig(filename='/var/log/priveedly/parse.log',
10 | encoding='utf-8',
11 |                     level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s',
12 | datefmt='%Y-%m-%d %H:%M:%S')
13 |
14 | class Command(BaseCommand):
15 |
16 |
17 | def handle(self, *args, **kwargs):
18 | start = datetime.now()
19 | try:
20 | parse_all_feeds()
21 | except Exception as e:
22 | logging.error('feed error {}'.format(e))
23 | try:
24 | parse_all_subreddits()
25 | except Exception as e:
26 | logging.error('reddit error {}'.format(e))
27 | try:
28 | parse_hackernews()
29 | except Exception as e:
30 | logging.error('hackernews error {}'.format(e))
31 | try:
32 | parse_lobsters()
33 | except Exception as e:
34 | logging.error('lobsters error {}'.format(e))
35 | logging.info('finished parsing in {}'.format(datetime.now() - start))
36 |
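Because this module lives under feeds/management/commands/, Django exposes it as `manage.py parse_all`. A hypothetical cron entry to run it on a schedule, with paths assumed from the deployment playbook rather than taken from the repo (adjust both to your install):

    # Hypothetical crontab line: fetch new content every 30 minutes.
    */30 * * * * /var/www/venv/priveedly/bin/python /var/www/priveedly/manage.py parse_all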
--------------------------------------------------------------------------------
/base.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models
5 | from django.utils.html import strip_tags
6 |
7 | class Entry(models.Model):
8 |
9 | class EntryCategory(models.TextChoices):
10 | RSS = 'RS', 'RSS Feed'
11 | TW = 'TW', 'Twitter'
12 | LS = 'LS', 'Lobste.rs'
13 | RD = 'RD', 'Reddit'
14 | HN = 'HN', 'Hacker News'
15 |
16 | title = models.CharField(max_length=355)
17 | url = models.URLField(max_length=400)
18 | description = models.TextField()
19 | image_url = models.URLField(null=True, blank=True, default='')
20 | published = models.DateTimeField()
21 | created = models.DateTimeField(auto_now_add=True)
22 | read = models.BooleanField(default=False)
23 | to_delete = models.BooleanField(default=False)
24 | read_later = models.BooleanField(default=False)
25 | interesting = models.BooleanField(default=False)
26 | recommended = models.FloatField(default=0)
27 | entry_category = models.CharField(
28 | max_length=2,
29 | choices=EntryCategory.choices,
30 | )
31 |
32 | @property
33 | def safe_text(self):
34 | return strip_tags(self.description)
35 |
36 | class Meta:
37 | ordering = ('published',)
38 | abstract = True
39 |
40 | def __str__(self):
41 | return u'{} Entry: {}'.format(self.entry_category, self.title)
42 |
--------------------------------------------------------------------------------
/sites/views.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.http import Http404, JsonResponse
5 | from django.views.generic.detail import DetailView
6 | from django.views.generic.list import ListView
7 | from django_filters.views import FilterView
8 |
9 | from sites.models import SitePost, RedditPost
10 |
11 | from datetime import datetime
12 | import django_filters
13 |
14 |
15 | class SitesFilter(django_filters.FilterSet):
16 |
17 | class Meta:
18 | model = SitePost
19 | fields = ['read', 'read_later', 'published', 'site_name', 'recommended']
20 |
21 | class RedditFilter(django_filters.FilterSet):
22 |
23 | class Meta:
24 | model = RedditPost
25 | fields = ['read', 'read_later', 'published', 'subreddit__name', 'recommended']
26 |
27 |
28 | class SiteList(FilterView):
29 | template_name = "entry_list.html"
30 | paginate_by = 30
31 | model = SitePost
32 | context_object_name = 'entry_list'
33 | filterset_class = SitesFilter
34 |
35 | class SiteDetailView(DetailView):
36 | model = SitePost
37 | template_name = "entry_detail.html"
38 |
39 | class RedditList(FilterView):
40 | template_name = "entry_list.html"
41 | paginate_by = 30
42 | model = RedditPost
43 | context_object_name = 'entry_list'
44 | filterset_class = RedditFilter
45 |
46 |
47 | class RedditDetailView(DetailView):
48 | model = RedditPost
49 | template_name = "entry_detail.html"
50 |
--------------------------------------------------------------------------------
/static/custom.js:
--------------------------------------------------------------------------------
1 | function mark_read_later(entry_id, entry_type) {
2 | $.ajaxSetup({
3 | headers: {
4 | 'X-CSRFToken': Cookies.get('csrftoken'),
5 | }
6 | });
7 |
8 | $.post("/feeds/mark-read-later/",
9 | {'entry_id': entry_id, 'entry_type': entry_type});
10 | }
11 |
12 | function unmark_read_later(entry_id, entry_type) {
13 | $.ajaxSetup({
14 | headers: {
15 | 'X-CSRFToken': Cookies.get('csrftoken'),
16 | }
17 | });
18 |
19 | $.post("/feeds/unmark-read-later/",
20 | {'entry_id': entry_id, 'entry_type': entry_type});
21 | }
22 |
23 | function mark_interesting(entry_id, entry_type) {
24 | $.ajaxSetup({
25 | headers: {
26 | 'X-CSRFToken': Cookies.get('csrftoken'),
27 | }
28 | });
29 |
30 | $.post("/feeds/mark-interesting/",
31 | {'entry_id': entry_id, 'entry_type': entry_type});
32 | }
33 |
34 | function mark_read() {
35 | var IDs = [];
36 | var entry_types = [];
37 | $("h3").each(function(){
38 | IDs.push(this.id);
39 | entry_types.push($(this).attr("entry_type")); });
40 | var IDsString = IDs.join(",");
41 | var typesString = entry_types.join(",");
42 | $.ajaxSetup({
43 | headers: {
44 | 'X-CSRFToken': Cookies.get('csrftoken'),
45 | }
46 | });
47 |
48 | $.post("/feeds/mark-read/",
49 | {"id_list": IDsString,
50 | "entry_types": typesString})
51 | .done(function(){
52 | document.body.scrollTop = document.documentElement.scrollTop = 0;
53 | location.reload();
54 | });
55 | }
56 |
57 |
58 |
--------------------------------------------------------------------------------
/static/js.cookie.min.js:
--------------------------------------------------------------------------------
1 | /*! js-cookie v3.0.5 | MIT */
2 | !function(e,t){"object"==typeof exports&&"undefined"!=typeof module?module.exports=t():"function"==typeof define&&define.amd?define(t):(e="undefined"!=typeof globalThis?globalThis:e||self,function(){var n=e.Cookies,o=e.Cookies=t();o.noConflict=function(){return e.Cookies=n,o}}())}(this,(function(){"use strict";function e(e){for(var t=1;t twlist.since_id:
69 | twlist.last_entry = e.published
70 | twlist.since_id = tweet.id
71 |
72 | twlist.updated = pytz.utc.localize(datetime.utcnow())
73 | twlist.save()
74 | logging.info("Parsed Twitter list: {} and found {} new items".format(
75 | twlist.name,
76 | entry_count))
77 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | anyio==4.4.0
2 | appnope==0.1.4
3 | argon2-cffi==23.1.0
4 | argon2-cffi-bindings==21.2.0
5 | arrow==1.3.0
6 | asgiref==3.6.0
7 | asttokens==2.2.1
8 | async-lru==2.0.4
9 | attrs==24.2.0
10 | babel==2.16.0
11 | backcall==0.2.0
12 | beautifulsoup4==4.12.2
13 | bleach==6.1.0
14 | certifi==2022.12.7
15 | cffi==1.17.1
16 | charset-normalizer==3.1.0
17 | click==8.1.7
18 | comm==0.2.2
19 | contourpy==1.3.0
20 | cycler==0.12.1
21 | debugpy==1.8.5
22 | decorator==5.1.1
23 | defusedxml==0.7.1
24 | Django==4.2
25 | django-filter==23.1
26 | django-login-required-middleware==0.9.0
27 | django-querysetsequence==0.16
28 | executing==1.2.0
29 | fastjsonschema==2.20.0
30 | feedparser==6.0.10
31 | fonttools==4.53.1
32 | fqdn==1.5.1
33 | gevent==24.10.3
34 | greenlet==3.1.1
35 | h11==0.14.0
36 | HackerNews==2.0.0
37 | hackernews-python==0.3.2
38 | httpcore==1.0.5
39 | httpx==0.27.2
40 | idna==3.4
41 | imageio==2.35.1
42 | imbalanced-learn==0.12.3
43 | ipykernel==6.29.5
44 | ipython==8.13.1
45 | ipywidgets==8.1.5
46 | isoduration==20.11.0
47 | jedi==0.18.2
48 | Jinja2==3.1.4
49 | joblib==1.4.2
50 | json5==0.9.25
51 | jsonpointer==3.0.0
52 | jsonschema==4.23.0
53 | jsonschema-specifications==2023.12.1
54 | jupyter==1.1.1
55 | kiwisolver==1.4.7
56 | lazy_loader==0.4
57 | lime==0.2.0.1
58 | MarkupSafe==2.1.5
59 | matplotlib==3.9.2
60 | matplotlib-inline==0.1.6
61 | mistune==3.0.2
62 | nbclient==0.10.0
63 | nbconvert==7.16.4
64 | nbformat==5.10.4
65 | nest-asyncio==1.6.0
66 | networkx==3.2
67 | nltk==3.9.1
68 | notebook==7.2.2
69 | notebook_shim==0.2.4
70 | numpy==2.0.0
71 | oauthlib==3.2.2
72 | overrides==7.7.0
73 | packaging==24.1
74 | pandas==2.2.2
75 | pandocfilters==1.5.1
76 | parso==0.8.3
77 | pexpect==4.8.0
78 | pickleshare==0.7.5
79 | pillow==10.4.0
80 | platformdirs==4.3.2
81 | praw==7.7.0
82 | prawcore==2.3.0
83 | prometheus_client==0.20.0
84 | prompt-toolkit==3.0.38
85 | psutil==6.0.0
86 | ptyprocess==0.7.0
87 | pure-eval==0.2.2
88 | pycparser==2.22
89 | Pygments==2.15.1
90 | pyparsing==3.1.4
91 | python-dateutil==2.8.2
92 | python-dotenv==1.0.0
93 | python-json-logger==2.0.7
94 | pytz==2023.3
95 | PyYAML==6.0.2
96 | pyzmq==26.2.0
97 | referencing==0.35.1
98 | regex==2024.7.24
99 | rfc3339-validator==0.1.4
100 | rfc3986-validator==0.1.1
101 | rpds-py==0.20.0
102 | scikit-image==0.24.0
103 | scikit-learn==1.5.1
104 | scipy==1.13.1
105 | seaborn==0.13.2
106 | Send2Trash==1.8.3
107 | sentry-sdk==2.17.0
108 | setuptools==72.1.0
109 | sgmllib3k==1.0.0
110 | six==1.16.0
111 | sniffio==1.3.1
112 | soupsieve==2.4.1
113 | SQLAlchemy==2.0.34
114 | sqlparse==0.4.4
115 | stack-data==0.6.2
116 | terminado==0.18.1
117 | threadpoolctl==3.5.0
118 | tifffile==2024.8.30
119 | tinycss2==1.3.0
120 | tornado==6.4.1
121 | tqdm==4.66.5
122 | traitlets==5.9.0
123 | types-python-dateutil==2.9.0.20240906
124 | typing_extensions==4.12.2
125 | tzdata==2024.1
126 | update-checker==0.18.0
127 | uri-template==1.3.0
128 | urllib3==2.0.2
129 | wcwidth==0.2.6
130 | webcolors==24.8.0
131 | webencodings==0.5.1
132 | wheel==0.43.0
133 | widgetsnbextension==4.0.13
134 | zope.event==5.0
135 | zope.interface==7.1.1
136 |
--------------------------------------------------------------------------------
/feeds/management_commands.py:
--------------------------------------------------------------------------------
1 | from feeds.models import Feed, FeedEntry
2 |
3 | import feedparser
4 | import logging
5 | import pytz
6 |
7 | from datetime import datetime, timedelta, timezone
8 | from dateutil import parser as date_parser
9 | from django.utils.timezone import is_aware
10 | from lxml.html import fromstring
11 |
12 |
13 | logging.basicConfig(filename='/var/log/priveedly/parse.log',
14 | encoding='utf-8',
15 |                     level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s',
16 | datefmt='%Y-%m-%d %H:%M:%S')
17 |
18 | def test_feeds_for_zombies():
19 | all_feeds = Feed.objects.filter(is_alive=True)
20 | zombies = 0
21 | utcnow = pytz.utc.localize(datetime.utcnow())
22 | for feed in all_feeds:
23 |         if feed.last_entry and feed.last_entry < utcnow - timedelta(days=90):
24 | feed.is_alive = False
25 | feed.save()
26 | zombies += 1
27 | logging.info('Found {} zombies.'.format(zombies))
28 |
29 |
30 | def parse_all_feeds():
31 | all_feeds = Feed.objects.filter(is_alive=True)
32 | for feed in all_feeds:
33 | try:
34 | parse_feed(feed)
35 | except Exception as e:
36 | logging.error(e)
37 | logging.error('Problem parsing %s' % feed)
38 | logging.info('Finished parsing {} feeds.'.format(len(all_feeds)))
39 |
40 | def get_pub_date(entry):
41 | if hasattr(entry, 'published'):
42 | pub_date = date_parser.parse(entry.published)
43 | elif hasattr(entry, 'updated'):
44 | pub_date = date_parser.parse(entry.updated)
45 | else:
46 | pub_date = pytz.utc.localize(datetime.utcnow())
47 |
48 | if not is_aware(pub_date):
49 | pub_date = pytz.utc.localize(pub_date)
50 |
51 | return pub_date
52 |
53 | def get_title(entry):
54 | if hasattr(entry, 'title'):
55 | return entry.title[:354]
56 | return 'no title'
57 |
58 | def get_description(entry):
59 | # this prefers HTML and longer content over shorter !!
60 | if hasattr(entry, 'content'):
61 | if len(entry.content) == 1:
62 | return entry.content[0]['value']
63 | else:
64 |             for ec in entry.content:
65 |                 if 'html' in ec['type']:
66 |                     return ec['value']
67 |             return entry.content[-1]['value']  # fall back to the last block if none is HTML
68 | elif hasattr(entry, 'description'):
69 | return entry.description
70 | return 'no description'
71 |
72 | def get_image(description):
73 | doc = fromstring(description)
74 |     # grab the first <img> src in the description, if any
75 |     images = doc.xpath('//img/@src')
76 |     if len(images):
77 |         return images[0]
78 |
79 | def parse_feed(feed):
80 | entry_count = 0
81 | entries = feedparser.parse(feed.url)
82 | for entry in entries.entries:
83 | pub_date = get_pub_date(entry)
84 |
85 | # this speeds up queries once you have many entries
86 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120)
87 | if not FeedEntry.objects.filter(published__gte=filter_date, url=entry.link) and pub_date >= filter_date:
88 | title = get_title(entry)
89 | desc = get_description(entry)
90 |
91 | e = FeedEntry(
92 | feed=feed,
93 | title=title,
94 | url=entry.link,
95 | description=desc,
96 | entry_category='RS',
97 | published=pub_date,
98 | )
99 |
100 | img = get_image(desc)
101 | if img:
102 | e.image_url = img
103 |
104 | e.save()
105 | entry_count += 1
106 | if not feed.last_entry or e.published > feed.last_entry:
107 | feed.last_entry = e.published
108 |
109 | feed.updated = pytz.utc.localize(datetime.utcnow())
110 | feed.save()
111 | logging.info("Parsed feed: {} and found {} new items".format(
112 | feed.title,
113 | entry_count))
114 |
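The dedupe check in parse_feed filters on published and url for every incoming entry, which is what the "speeds up queries" comment is about. A hedged companion tweak, assuming you are willing to edit base.py (db_index is not set in the original model):

    # Hypothetical change to the Entry model in base.py: index the published
    # column so the published__gte=filter_date lookups can use an index scan.
    published = models.DateTimeField(db_index=True)

Remember to run makemigrations and migrate after a model change like this.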
--------------------------------------------------------------------------------
/feeds/views.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals
3 |
4 | from django.http import Http404, JsonResponse
5 | from django.views.generic.detail import DetailView
6 | from django.views.generic.list import ListView
7 | from django.shortcuts import render
8 | from django_filters.views import FilterView
9 | from queryset_sequence import QuerySetSequence
10 |
11 | from feeds.models import FeedCategory, Feed, FeedEntry
12 | from sites.models import SitePost, RedditPost
13 |
14 | from datetime import datetime
15 | import django_filters
16 |
17 |
18 | class EntryFilter(django_filters.FilterSet):
19 |
20 | class Meta:
21 | model = FeedEntry
22 | fields = ['read', 'read_later', 'feed__title', 'published', 'recommended']
23 |
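With django-filter, each name in fields becomes an exact-match query-string parameter on the FilterView below, so the reading lists can be narrowed straight from the URL. For example, with the paths from priveedly/urls.py:

    /feeds/?read=False&read_later=True
    /feeds/?feed__title=Some+Feed+Title

The same pattern applies to SitesFilter, RedditFilter and TweetFilter in the other apps.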
24 |
25 | class EntryList(FilterView):
26 | template_name = "entry_list.html"
27 | paginate_by = 30
28 | model = FeedEntry
29 | context_object_name = 'entry_list'
30 | filterset_class = EntryFilter
31 |
32 |
33 | class EntryDetailView(DetailView):
34 | model = FeedEntry
35 | template_name = "entry_detail.html"
36 |
37 |
38 | def main_feed(request):
39 | group_qs = QuerySetSequence(
40 | FeedEntry.objects.filter(read=False),
41 | SitePost.objects.filter(read=False),
42 | RedditPost.objects.filter(read=False)).order_by('published')
43 |
44 | return render(request,
45 | 'entry_list.html', {
46 | 'entry_list': group_qs[:30],
47 | 'total_unread': len(group_qs)})
48 |
49 |
50 | def read_later_feed(request):
51 | group_qs = QuerySetSequence(
52 | FeedEntry.objects.filter(read_later=True),
53 | SitePost.objects.filter(read_later=True),
54 | RedditPost.objects.filter(read_later=True)).order_by('published')[:20]
55 | return render(request,
56 | 'entry_list.html', {'entry_list': group_qs})
57 |
58 |
59 | def recommended_feed(request):
60 | group_qs = QuerySetSequence(
61 | FeedEntry.objects.filter(recommended__gte=0.5, read=False),
62 | SitePost.objects.filter(recommended__gte=0.5, read=False),
63 | RedditPost.objects.filter(recommended__gte=0.5, read=False)).order_by('published')[:20]
64 | return render(request,
65 | 'entry_list.html', {'entry_list': group_qs})
66 |
67 | def mark_read(request):
68 | if request.method == 'POST':
69 | entry_ids = request.POST.get('id_list').split(',')
70 | entry_types = request.POST.get('entry_types').split(',')
71 | for etype, ein in zip(entry_types, entry_ids):
72 | if etype == 'sites':
73 | e = SitePost.objects.get(id=ein)
74 | elif etype == 'feeds':
75 | e = FeedEntry.objects.get(id=ein)
76 | elif etype == 'reddit':
77 | e = RedditPost.objects.get(id=ein)
78 | e.read = True
79 | e.save()
80 | return JsonResponse({'success': True})
81 | return JsonResponse({'success': False,
82 | 'error': 'Please send a list of ids'})
83 |
84 | def mark_read_later(request):
85 | if request.method == 'POST':
86 | entry_id = request.POST.get('entry_id')
87 | entry_type = request.POST.get('entry_type')
88 | if entry_type == 'sites':
89 | e = SitePost.objects.get(id=entry_id)
90 | elif entry_type == 'feeds':
91 | e = FeedEntry.objects.get(id=entry_id)
92 | elif entry_type == 'reddit':
93 | e = RedditPost.objects.get(id=entry_id)
94 | e.read_later = True
95 | e.save()
96 | return JsonResponse({'success': True})
97 | return JsonResponse({'success': False,
98 | 'error': 'Please send an entry id'})
99 |
100 | def unmark_read_later(request):
101 | if request.method == 'POST':
102 | entry_id = request.POST.get('entry_id')
103 | entry_type = request.POST.get('entry_type')
104 | if entry_type == 'sites':
105 | e = SitePost.objects.get(id=entry_id)
106 | elif entry_type == 'feeds':
107 | e = FeedEntry.objects.get(id=entry_id)
108 | elif entry_type == 'reddit':
109 | e = RedditPost.objects.get(id=entry_id)
110 | e.read_later = False
111 | e.save()
112 | return JsonResponse({'success': True})
113 | return JsonResponse({'success': False,
114 | 'error': 'Please send an entry id'})
115 |
116 | def mark_interesting(request):
117 | if request.method == 'POST':
118 | entry_id = request.POST.get('entry_id')
119 | entry_type = request.POST.get('entry_type')
120 | if entry_type == 'sites':
121 | e = SitePost.objects.get(id=entry_id)
122 | elif entry_type == 'feeds':
123 | e = FeedEntry.objects.get(id=entry_id)
124 | elif entry_type == 'reddit':
125 | e = RedditPost.objects.get(id=entry_id)
126 | e.read_later = False
127 | e.interesting = True
128 | e.save()
129 | return JsonResponse({'success': True})
130 | return JsonResponse({'success': False,
131 | 'error': 'Please send an entry id'})
132 |
--------------------------------------------------------------------------------
/sites/management_commands.py:
--------------------------------------------------------------------------------
1 | from sites.models import Subreddit, RedditPost, SitePost
2 | from django.conf import settings
3 |
4 | import logging
5 | import praw
6 | import pytz
7 | import random
8 | import requests
9 |
10 | from hackernews import HackerNews
11 | from datetime import datetime, timedelta, timezone
12 |
13 | logging.basicConfig(filename='/var/log/priveedly/parse.log',
14 | encoding='utf-8',
15 |                     level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s',
16 | datefmt='%Y-%m-%d %H:%M:%S')
17 |
18 | def parse_all_subreddits():
19 | all_subreddits = Subreddit.objects.all()
20 | for subreddit in all_subreddits:
21 | try:
22 | parse_reddit(subreddit)
23 | except Exception as e:
24 | logging.error(e)
25 | logging.error('Error parsing subreddit {}'.format(subreddit))
26 | logging.info('Finished parsing {} subreddits.'.format(
27 | len(all_subreddits)))
28 |
29 |
30 | def get_lobster_posts(url="https://lobste.rs/hottest.json"):
31 |     return requests.get(url).json()
32 |
33 | def parse_lobsters():
34 | entry_count = 0
35 |     posts = get_lobster_posts()
36 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120)
37 | for post in posts:
38 | if not SitePost.objects.filter(published__gte=filter_date, url=post.get('url')):
39 | e = SitePost(
40 | entry_category='LS',
41 | site_name='lobsters',
42 | title=post.get('title')[:354],
43 | url=post.get('url'),
44 | description=post.get('description') + ' Tags: {}'.format(
45 | ' '.join(post.get('tags'))),
46 | published=post.get('created_at'),
47 | )
48 | e.save()
49 | entry_count += 1
50 | logging.info("Parsed lobsters and found {} new items".format(entry_count))
51 |
52 |
53 | def get_text(hn_item):
54 | if hasattr(hn_item, 'text'):
55 | return hn_item.text
56 | return ''
57 |
58 |
59 | def parse_hackernews():
60 | entry_count = 0
61 | hn = HackerNews()
62 | stories = [hn.item(x) for x in hn.top_stories()]
63 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120)
64 |
65 | for post in stories:
66 | if not hasattr(post, 'url'):
67 | continue
68 | if not SitePost.objects.filter(published__gte=filter_date, url=post.url):
69 | text = post.title[:354]
70 | if hasattr(post, 'text'):
71 | text = post.text
72 | elif hasattr(post, 'kids'):
73 |                 sample_size = min(len(post.kids), 4)
74 | text = '\n '.join(
75 | [get_text(hn.item(x)) for x in
76 | random.sample(post.kids, sample_size)])
77 | e = SitePost(
78 | entry_category='HN',
79 | site_name='hackernews',
80 | title=post.title,
81 | url=post.url,
82 | description=text,
83 | published=pytz.utc.localize(post.time)
84 | )
85 | e.save()
86 | entry_count += 1
87 | logging.info("Parsed hackernews and found {} new items".format(entry_count))
88 |
89 |
90 | def get_praw():
91 | return praw.Reddit(
92 | client_id=settings.REDDIT_CLIENT_ID,
93 | client_secret=settings.REDDIT_CLIENT_SECRET,
94 | password=settings.REDDIT_PASSWORD,
95 | user_agent=settings.REDDIT_USER_AGENT,
96 | username=settings.REDDIT_USERNAME)
97 |
98 | def parse_reddit(subreddit):
99 | entry_count = 0
100 | api = get_praw()
101 | posts = api.subreddit(subreddit.name).new(limit=500)
102 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120)
103 |
104 | for post in posts:
105 | if not RedditPost.objects.filter(published__gte=filter_date, url=post.url):
106 | pub_date = pytz.utc.localize(datetime.utcfromtimestamp(post.created_utc))
107 | if pub_date <= filter_date:
108 | continue
109 | text = post.selftext
110 | if not text:
111 |                 comments = list(post.comments)
112 |                 sample_size = min(len(comments), 4)
113 |                 text = '\n '.join(
114 |                     [comment.body for comment in
115 |                      random.sample(comments, sample_size)])
116 |
117 | e = RedditPost(
118 | entry_category='RD',
119 | subreddit=subreddit,
120 | title=post.title[:354],
121 | url=post.url,
122 | description=text,
123 | published=pub_date
124 | )
125 | e.save()
126 | entry_count += 1
127 |
128 | subreddit.updated = pytz.utc.localize(datetime.utcnow())
129 | subreddit.save()
130 | logging.info("Parsed subreddit: {} and found {} new items".format(
131 | subreddit.name,
132 | entry_count))
133 |
--------------------------------------------------------------------------------
/priveedly/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for priveedly project.
3 |
4 | Generated by 'django-admin startproject' using Django 4.2.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/4.2/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/4.2/ref/settings/
11 | """
12 |
13 | from pathlib import Path
14 | from dotenv import load_dotenv
15 | import os
16 |
17 |
18 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
19 | BASE_DIR = Path(__file__).resolve().parent.parent
20 |
21 | load_dotenv()
22 |
23 | # Quick-start development settings - unsuitable for production
24 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
25 |
26 | # SECURITY WARNING: keep the secret key used in production secret!
27 | SECRET_KEY = os.environ.get('SECRET_KEY')
28 |
29 | # SECURITY WARNING: don't run with debug turned on in production!
30 | DEBUG = (os.getenv('DEBUG') == 'True')
31 |
32 | ALLOWED_HOSTS = os.environ.get('ALLOWED_HOSTS').split(',')
33 | SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https')
34 | CSRF_TRUSTED_ORIGINS = ['https://{}'.format(x) for x in os.environ.get('ALLOWED_HOSTS').split(',') if '.com' in x]
35 |
36 | # Application definition
37 |
38 | INSTALLED_APPS = [
39 | 'django.contrib.admin',
40 | 'django.contrib.auth',
41 | 'django.contrib.contenttypes',
42 | 'django.contrib.sessions',
43 | 'django.contrib.messages',
44 | 'django.contrib.staticfiles',
45 | 'django_filters',
46 | 'feeds',
47 | 'sites',
48 | ]
49 |
50 | MIDDLEWARE = [
51 | 'django.middleware.security.SecurityMiddleware',
52 | 'django.contrib.sessions.middleware.SessionMiddleware',
53 | 'django.middleware.common.CommonMiddleware',
54 | 'django.middleware.csrf.CsrfViewMiddleware',
55 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
56 | 'login_required.middleware.LoginRequiredMiddleware',
57 | 'django.contrib.messages.middleware.MessageMiddleware',
58 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
59 | ]
60 |
61 | ROOT_URLCONF = 'priveedly.urls'
62 |
63 | TEMPLATES = [
64 | {
65 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
66 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
67 | 'APP_DIRS': True,
68 | 'OPTIONS': {
69 | 'context_processors': [
70 | 'django.template.context_processors.debug',
71 | 'django.template.context_processors.request',
72 | 'django.contrib.auth.context_processors.auth',
73 | 'django.contrib.messages.context_processors.messages',
74 | ],
75 | },
76 | },
77 | ]
78 |
79 | WSGI_APPLICATION = 'priveedly.wsgi.application'
80 |
81 |
82 | LOGIN_REQUIRED_IGNORE_PATHS = [
83 | r'/admin',
84 | r'/accounts/login/$',
85 | r'/accounts/logout/$',
86 | ]
87 |
88 | LOGIN_REDIRECT_URL = "/"
89 |
90 | # Database
91 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases
92 |
93 | DATABASES = {
94 |
95 | # 'default': {
96 | # 'ENGINE': 'django.db.backends.sqlite3',
97 | # 'NAME': BASE_DIR / 'db.sqlite3',
98 | #
99 | # },
100 | #}
101 |
102 |
103 | "default": {
104 | "ENGINE": "django.db.backends.postgresql",
105 | "NAME": os.environ.get('LOCAL_DB_NAME'),
106 | "USER": os.environ.get('LOCAL_DB_USERNAME'),
107 | "PASSWORD": os.environ.get('LOCAL_DB_PASSWORD'),
108 | "HOST": "localhost",
109 | "PORT": "",
110 |
111 | }
112 |
113 | }
114 |
115 |
116 | # Password validation
117 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
118 |
119 | AUTH_PASSWORD_VALIDATORS = [
120 | {
121 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
122 | },
123 | {
124 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
125 | },
126 | {
127 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
128 | },
129 | {
130 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
131 | },
132 | ]
133 |
134 |
135 | # Internationalization
136 | # https://docs.djangoproject.com/en/4.2/topics/i18n/
137 |
138 | LANGUAGE_CODE = 'en-us'
139 |
140 | TIME_ZONE = 'UTC'
141 |
142 | USE_I18N = True
143 |
144 | USE_TZ = True
145 |
146 |
147 | # Static files (CSS, JavaScript, Images)
148 | # https://docs.djangoproject.com/en/4.2/howto/static-files/
149 |
150 | STATIC_ROOT = os.path.join(BASE_DIR, 'static')
151 | STATIC_URL = 'static/'
152 |
153 | # Default primary key field type
154 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
155 |
156 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
157 |
158 |
159 | TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
160 | TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
161 | TWITTER_ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN')
162 | TWITTER_ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
163 | REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
164 | REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
165 | REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD')
166 | REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')
167 | REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME')
168 |
--------------------------------------------------------------------------------
/feeds/management/commands/rate_all.py:
--------------------------------------------------------------------------------
1 | from django.core.management.base import BaseCommand
2 |
3 | import pandas as pd
4 | from sqlalchemy import create_engine
5 | from urllib.parse import urlparse
6 | from nltk.corpus import stopwords
7 | from nltk import tokenize
8 | import re
9 | import string
10 | import html
11 | import logging
12 | import joblib
13 |
14 | from sites.models import RedditPost, SitePost
15 | from feeds.models import FeedEntry
16 | from bs4 import BeautifulSoup
17 |
18 | import os
19 | from dotenv import load_dotenv, dotenv_values
20 | load_dotenv()
21 |
22 | import nltk
23 | nltk.download('stopwords')
24 | nltk.download('punkt_tab')
25 |
26 | logging.basicConfig(filename='/var/log/priveedly/rate.log',
27 | encoding='utf-8',
28 |                     level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s',
29 | datefmt='%Y-%m-%d %H:%M:%S')
30 |
31 | CLEAN_HTML = re.compile('<.*?>')
32 | CLEAN_NUMBERS = re.compile('[0-9,\\.$\\%]+')
33 | CLEAN_NUMBERS_AND_ONE_LETTER = re.compile('([a-z]\\d+)|(\\d+[a-z])|(\\d+[a-z]\\d+)')
34 | CLEAN_REPEATED_PUNCTUATION = re.compile('[!\\-\\/:-@-`’–{-~"“”\\[\\]]+')
35 |
36 | def tokenize_url(url_str):
37 | parsed_url = urlparse(url_str)
38 | return parsed_url.netloc, ' '.join(parsed_url.path.split('/')).replace('-', ' '), parsed_url.query.replace('?', ' ').replace('=', ' ')
39 |
40 | def prepare_content(pandas_row):
41 | netloc, path, query = tokenize_url(pandas_row.url)
42 | return ' '.join([pandas_row.title, pandas_row.description, pandas_row.site_name, netloc, path, query])
43 |
44 | # Update this if you change preprocessing!
45 | def remove_tags_and_lowercase(text):
46 | # some parts from https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
47 | if BeautifulSoup(text, "html.parser").find():
48 | try:
49 |             soup = BeautifulSoup(text, "html.parser")
50 |             text = soup.get_text()
51 |         except Exception:
52 | pass
53 | cleantext = html.unescape(text).encode('unicode_escape').decode('unicode_escape')
54 | # you can try this line or other similar things if you want to be more deliberate about cleaning!
55 | #cleantext = re.sub(CLEAN_NUMBERS_AND_ONE_LETTER, '', cleantext)
56 | cleantext = re.sub(CLEAN_NUMBERS, '', cleantext)
57 | cleantext = re.sub(CLEAN_REPEATED_PUNCTUATION, '', cleantext)
58 | return cleantext.lower()
59 |
60 | # Update this if you change preprocessing!
61 | def tokenize_content(text):
62 | removal = set(stopwords.words('english')).union(set(string.punctuation))
63 | return [w for w in tokenize.word_tokenize(remove_tags_and_lowercase(text))
64 | if w.lower() not in removal]
65 |
66 |
67 | def get_engine():
68 | db_str = "postgresql://{}:{}@localhost:5432/{}".format(
69 | os.environ.get('DB_USERNAME'),
70 | os.environ.get('DB_PASSWORD'),
71 | os.environ.get('DB_NAME'))
72 | return create_engine(db_str)
73 |
74 | # Update this if you change preprocessing!
75 | def create_content_df(engine):
76 | sites_df = pd.read_sql(
77 | "select id, title, url, description, site_name from sites_sitepost WHERE read is False and interesting is False",
78 | con=engine)
79 | sites_df['type'] = 'sites'
80 | feeds_df = pd.read_sql(
81 | "select feeds_feedentry.id as id, feeds_feedentry.title as title, feeds_feedentry.url as url, feeds_feedentry.description as description, feeds_feed.title as site_name from feeds_feedentry JOIN feeds_feed ON feeds_feed.id = feed_id WHERE read is False and interesting is False",
82 | con=engine)
83 | feeds_df['type'] = 'feeds'
84 | reddit_df = pd.read_sql(
85 | "select sites_redditpost.id as id, sites_redditpost.title as title, sites_redditpost.url as url, sites_redditpost.description as description, sites_subreddit.name as site_name from sites_redditpost JOIN sites_subreddit ON sites_redditpost.id = sites_subreddit.id WHERE read is False and interesting is False",
86 | con=engine)
87 | reddit_df['type'] = 'reddit'
88 | return pd.concat([reddit_df, sites_df, feeds_df])
89 |
90 |
91 | def update_score(pandas_row):
92 | if pandas_row.type == 'sites':
93 | obj = SitePost.objects.get(pk=pandas_row.id)
94 | elif pandas_row.type == 'feeds':
95 | obj = FeedEntry.objects.get(pk=pandas_row.id)
96 | else:
97 | obj = RedditPost.objects.get(pk=pandas_row.id)
98 | obj.recommended = pandas_row.y
99 | obj.save()
100 |
101 |
102 | class Command(BaseCommand):
103 |
104 | def handle(self, *args, **kwargs):
105 | try:
106 | # Update this if you change preprocessing!
107 | engine = get_engine()
108 | content_df = create_content_df(engine)
109 | logging.info('about to rate {} items'.format(content_df.shape[0]))
110 | content_df['full_text'] = content_df.apply(prepare_content, axis=1)
111 | content_df['cleaned_text'] = content_df['full_text'].map(lambda x: ' '.join(tokenize_content(x)))
112 | pipeline = joblib.load(os.getenv('PIPELINE_FILE'))
113 | if hasattr(pipeline, 'predict_proba'):
114 | proba = pipeline.predict_proba(content_df['cleaned_text'])
115 | # take only positive class
116 | y = proba[:, 1]
117 | else:
118 | y = pipeline.predict(content_df['cleaned_text'])
119 | content_df['y'] = y
120 | content_df.apply(update_score, axis=1)
121 | except Exception as e:
122 | logging.exception(e)
123 |             logging.error('failed to rate incoming content')
124 |
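The handler above loads whatever PIPELINE_FILE points to with joblib.load and calls it directly on the cleaned_text strings, so any scikit-learn pipeline that accepts raw text end-to-end will work. A minimal training sketch for a compatible pipeline; the sample data, labels and file name are placeholders, not part of the repo:

    # Hypothetical training script. Assumes you export text for entries you
    # marked interesting (label 1) and ones you did not (label 0).
    import joblib
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline

    texts = ["post about django filters", "celebrity gossip roundup"]  # placeholder data
    labels = [1, 0]                                                    # placeholder labels

    # rate_all.py calls predict_proba(content_df['cleaned_text']) on strings,
    # so the saved object must vectorize internally.
    pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
    pipeline.fit(texts, labels)

    joblib.dump(pipeline, "my_pipeline.joblib")  # point PIPELINE_FILE at this path

For scores that match what the command computes, clean your training text with the same remove_tags_and_lowercase/tokenize_content helpers; the "Update this if you change preprocessing!" markers above exist precisely because the two sides must stay in sync.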
--------------------------------------------------------------------------------
/priveedly/prod_settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for priveedly project.
3 |
4 | Generated by 'django-admin startproject' using Django 4.2.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/4.2/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/4.2/ref/settings/
11 | """
12 |
13 | from pathlib import Path
14 | from dotenv import load_dotenv
15 | import os
16 |
17 | import sentry_sdk
18 | from sentry_sdk.integrations.django import DjangoIntegration
19 |
20 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
21 | BASE_DIR = Path(__file__).resolve().parent.parent
22 |
23 | load_dotenv()
24 |
25 | # Quick-start development settings - unsuitable for production
26 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
27 |
28 | # SECURITY WARNING: keep the secret key used in production secret!
29 | SECRET_KEY = os.environ.get('SECRET_KEY')
30 |
31 | # SECURITY WARNING: don't run with debug turned on in production!
32 | DEBUG = (os.getenv('DEBUG', 'False') == 'True')
33 |
34 | ALLOWED_HOSTS = os.environ.get('ALLOWED_HOSTS').split(',')
35 | SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https')
36 | CSRF_TRUSTED_ORIGINS = ['https://{}'.format(x) for x in os.environ.get('ALLOWED_HOSTS').split(',') if '.com' in x]
37 |
38 | # Application definition
39 |
40 | INSTALLED_APPS = [
41 | 'django.contrib.admin',
42 | 'django.contrib.auth',
43 | 'django.contrib.contenttypes',
44 | 'django.contrib.sessions',
45 | 'django.contrib.messages',
46 | 'django.contrib.staticfiles',
47 | 'django_filters',
48 | 'feeds',
49 | 'sites',
50 | ]
51 |
52 | MIDDLEWARE = [
53 | 'django.middleware.security.SecurityMiddleware',
54 | 'django.contrib.sessions.middleware.SessionMiddleware',
55 | 'django.middleware.common.CommonMiddleware',
56 | 'django.middleware.csrf.CsrfViewMiddleware',
57 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
58 | 'login_required.middleware.LoginRequiredMiddleware',
59 | 'django.contrib.messages.middleware.MessageMiddleware',
60 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
61 | ]
62 |
63 | ROOT_URLCONF = 'priveedly.urls'
64 |
65 | TEMPLATES = [
66 | {
67 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
68 | 'DIRS': [os.path.join(BASE_DIR, 'templates')],
69 | 'APP_DIRS': True,
70 | 'OPTIONS': {
71 | 'context_processors': [
72 | 'django.template.context_processors.debug',
73 | 'django.template.context_processors.request',
74 | 'django.contrib.auth.context_processors.auth',
75 | 'django.contrib.messages.context_processors.messages',
76 | ],
77 | },
78 | },
79 | ]
80 |
81 | WSGI_APPLICATION = 'priveedly.wsgi.application'
82 |
83 |
84 | LOGIN_REQUIRED_IGNORE_PATHS = [
85 | r'/admin',
86 | r'/accounts/login/$',
87 | r'/accounts/logout/$',
88 | ]
89 |
90 | LOGIN_REDIRECT_URL = "/"
91 |
92 | # Database
93 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases
94 |
95 | DATABASES = {
96 |
97 | # 'default': {
98 | # 'ENGINE': 'django.db.backends.sqlite3',
99 | # 'NAME': BASE_DIR / 'db.sqlite3',
100 | #
101 | # },
102 | #}
103 |
104 |
105 |
106 |
107 | "default": {
108 | "ENGINE": "django.db.backends.postgresql",
109 | "NAME": os.environ.get('DB_NAME'),
110 | "USER": os.environ.get('DB_USERNAME'),
111 | "PASSWORD": os.environ.get('DB_PASSWORD'),
112 | "HOST": "localhost",
113 | "PORT": "",
114 |
115 | }
116 |
117 | }
118 |
119 |
120 | # Password validation
121 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
122 |
123 | AUTH_PASSWORD_VALIDATORS = [
124 | {
125 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
126 | },
127 | {
128 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
129 | },
130 | {
131 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
132 | },
133 | {
134 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
135 | },
136 | ]
137 |
138 |
139 | # Internationalization
140 | # https://docs.djangoproject.com/en/4.2/topics/i18n/
141 |
142 | LANGUAGE_CODE = 'en-us'
143 |
144 | TIME_ZONE = 'UTC'
145 |
146 | USE_I18N = True
147 |
148 | USE_TZ = True
149 |
150 |
151 | # Static files (CSS, JavaScript, Images)
152 | # https://docs.djangoproject.com/en/4.2/howto/static-files/
153 |
154 | STATIC_ROOT = os.path.join(BASE_DIR, 'static')
155 | STATIC_URL = 'static/'
156 |
157 | # Default primary key field type
158 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
159 |
160 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
161 |
162 | sentry_sdk.init(
163 | dsn=os.environ.get('SENTRY_DSN'),
164 | integrations=[
165 | DjangoIntegration(),
166 | ],
167 |
168 | # Set traces_sample_rate to 1.0 to capture 100%
169 | # of transactions for performance monitoring.
170 | # We recommend adjusting this value in production.
171 | traces_sample_rate=0.1,
172 |
173 | # If you wish to associate users to errors (assuming you are using
174 | # django.contrib.auth) you may enable sending PII data.
175 | send_default_pii=False
176 | )
177 |
178 | TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
179 | TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
180 | TWITTER_ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN')
181 | TWITTER_ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
182 | REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
183 | REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
184 | REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD')
185 | REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT')
186 | REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME')
187 |
--------------------------------------------------------------------------------
/deployment/example_initial_ansible.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - hosts: YOUR_HOST_NAME
3 | become: yes
4 |
5 | tasks:
6 |
7 | - name: update
8 | apt: update_cache=yes
9 |
10 | - name: install fail2ban
11 | apt: pkg=fail2ban state=present
12 |
13 | - name: install git
14 | apt: name=git state=present
15 |
16 | - name: build essential
17 | apt: name=build-essential state=present
18 |
19 | - name: nginx
20 | apt: name=nginx state=present
21 |
22 | - name: install py3
23 | apt: name=python3 state=present
24 |
25 | - name: install pip
26 | apt: name=python3-pip state=present
27 |
28 | - name: install uwsgi
29 | apt: name=uwsgi state=present
30 |
31 | - name: install emperor
32 | apt: name=uwsgi-emperor state=present
33 |
34 | - name: install uwsgi py3
35 | apt: name=uwsgi-plugin-python3 state=present
36 |
37 |     - name: install certbot nginx plugin
38 | apt: name=python3-certbot-nginx state=present
39 |
40 | - name: install postgres
41 | apt: name=postgresql state=present
42 |
43 | - name: install aws
44 | pip: name=awscli state=present
45 |
46 | - name: mk aws dir
47 |       become_user: "{{your_user}}"
48 | file: path=/home/{{your_user}}/.aws mode=0700 recurse=yes state=directory
49 |
50 | - name: copy aws creds
51 |       become_user: "{{your_user}}"
52 | copy: src=/home/{{your_user}}/.aws/credentials dest=/home/{{your_user}}/.aws/credentials
53 |
54 | - name: copy aws config
55 |       become_user: "{{your_user}}"
56 | copy: src=/home/{{your_user}}/.aws/config dest=/home/{{your_user}}/.aws/config
57 |
58 | - name: install psycopg2
59 | apt: name=python3-psycopg2 state=present
60 |
61 | - name: install certbot
62 | apt: name=certbot state=present install_recommends=yes
63 |
64 | - name: install letsencrypt
65 | apt: name=letsencrypt state=present install_recommends=yes
66 |
67 | - name: change pr home perms to fetch
68 | file: path=/var/www/priveedly mode=0777 state=directory recurse=yes
69 | ignore_errors: yes
70 |
71 | - name: run certbot
72 | shell: certbot certonly --nginx -w /var/www/priveedly -d {{your_dns}} -n -m {{your_email}} --keep-until-expiring --agree-tos
73 | ignore_errors: yes
74 |
75 | - name: fetch python application
76 |       become_user: "{{your_user}}"
77 | git: repo=git@github.com:YOUR_GITHUB/priveedly.git dest=/var/www/priveedly key_file=~/.ssh/id_ecdsa accept_hostkey=yes force=yes
78 |
79 | - name: change home perms to fetch
80 | file: path=/var/www/venv mode=0777 state=directory recurse=yes
81 | ignore_errors: yes
82 |
83 | - name: install virtualenv
84 | pip: executable=pip3 name=virtualenv state=present
85 |
86 | - name: install py requirements
87 | pip: requirements=/var/www/priveedly/requirements.txt virtualenv=/var/www/venv/priveedly virtualenv_python=python3
88 |
89 | - name: copy .venv
90 | copy: src=.prod_env dest=/var/www/priveedly/priveedly/.env
91 |
92 | - name: mk uwsgi dir
93 | file: path=/etc/uwsgi/vassals state=directory recurse=yes mode=0644
94 |
95 | - name: copy uwsgi
96 | copy: src=priveedly.ini dest=/etc/uwsgi-emperor/vassals/priveedly.ini
97 |
98 | - name: mk scripts dir
99 | file: path=/home/{{your_user}}/scripts state=directory recurse=yes mode=0777
100 |
101 | - name: copy cron
102 | copy: src=backup.sh dest=/home/{{your_user}}/scripts/priveedly_backup.sh
103 |
104 | - name: copy emperor config
105 | copy: src=emperor.ini dest=/etc/uwsgi/emperor.ini
106 |
107 | - name: copy systemd emperor config
108 | copy: src=templates/emperor.uwsgi.service dest=/etc/systemd/system/emperor.uwsgi.service
109 |
110 | - name: Create database
111 | become: yes
112 | become_user: postgres
113 | become_method: sudo
114 | postgresql_db: name=priveedly encoding='UTF-8' lc_collate='en_US.UTF-8' lc_ctype='en_US.UTF-8' state=present
115 |
116 | - name: copy database
117 | copy: src=templates/priveedly/dump.sql dest=/tmp/backup.sql
118 |
119 | - name: Importing data
120 | become_user: postgres
121 | shell: psql priveedly < /tmp/backup.sql
122 |
123 | - name: Create role for database
124 | become_user: postgres
125 | postgresql_user: db=priveedly user={{your_db_user}} password={{your_db_password}} priv=ALL state=present
126 |
127 |     - name: Grant database permissions
128 | become_user: postgres
129 | postgresql_privs: database=priveedly state=present privs=ALL type=database roles=priveedly grant_option=no objs=priveedly
130 |
131 | #- name: copy pgpass
132 | #become_user: {{your_user}}
133 | #copy: src=templates/pg_pgpass.txt dest=/home/{{your_user}}/.pgpass mode=0600
134 |
135 | #- name: alter pg_hba
136 | #become_user: postgres
137 | #copy: src=templates/pg_hba.conf dest=/etc/postgresql/9.6/main/pg_hba.conf
138 | #notify:
139 | # - restart postgres
140 |
141 | - name: backup daily cron for sql
142 | cron: name="backup daily cron sql" user="{{your_user}}" minute="0" hour="4" job="bash /home/{{your_user}}/scripts/priveedly_backup.sh -t daily"
143 |
144 | - name: backup weekly cron for sql
145 | cron: name="backup weekly cron sql" user="{{your_user}}" minute="0" hour="4" weekday="0" job="bash /home/{{your_user}}/scripts/priveedly_backup.sh -t weekly"
146 |
147 | - name: backup monthly cron for sql
148 | cron: name="backup monthly cron sql" user="{{your_user}}" special_time=monthly job="bash /home/{{your_user}}/scripts/priveedly_backup.sh -t monthly"
149 |
150 | - name: restart uwsgi emperor
151 | shell: systemctl restart emperor.uwsgi.service
152 |
153 | - name: copy nginx files
154 | copy: src=nginx.conf dest=/etc/nginx/sites-available/priveedly
155 |
156 | - name: symlink and restart
157 | file: src=/etc/nginx/sites-available/priveedly dest=/etc/nginx/sites-enabled/priveedly state=link
158 |
159 | - name: restart server
160 | shell: /bin/true
161 | notify:
162 | - restart nginx
163 |
164 | handlers:
165 | - name: restart ssh
166 | service: name=ssh state=restarted
167 |
168 | - name: restart nginx
169 | service: name=nginx state=restarted
170 |
171 | - name: restart postgres
172 | service: name=postgresql state=restarted
173 |
--------------------------------------------------------------------------------
/feeds/tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from django.contrib.auth.models import User
3 | from django.test import TestCase, Client
4 | from feeds.models import FeedCategory, Feed, FeedEntry
5 | from datetime import datetime, timezone, timedelta
6 |
7 | from feeds.management_commands import get_pub_date, get_title, get_description, get_image, parse_feed
8 | import feedparser
9 |
10 | class LogInTest(TestCase):
11 | def setUp(self):
12 | self.credentials = {
13 | 'username': 'testuser',
14 | 'password': 'secret'}
15 | user = User.objects.create_user(**self.credentials)
16 | user.save()
17 |
18 | def test_login(self):
19 | # try to reach home logged out
20 | response = self.client.get('/', follow=True)
21 | self.assertEqual(response.status_code, 200)
22 | last_url, status_code = response.redirect_chain[-1]
23 |
24 | self.assertEqual(last_url, '/accounts/login/?next=/')
25 |
26 | # send login
27 | response = self.client.post('/accounts/login/?next=/', self.credentials, follow=True)
28 |
29 | # logged in now
30 | last_url, status_code = response.redirect_chain[-1]
31 | self.assertEqual(last_url, "/")
32 | self.assertTrue(response.context['user'].is_active)
33 |
34 |
35 | class MainPageTesting(TestCase):
36 |
37 | def setUp(self):
38 | # Every test needs a client.
39 | self.client = Client()
40 |
41 | #user account
42 | self.credentials = {
43 | 'username': 'testuser',
44 | 'password': 'secret'}
45 | User.objects.create_user(**self.credentials)
46 |
47 | # feeds
48 | feed_cat = FeedCategory(name="testing")
49 | feed_cat.save()
50 | feed = Feed(title='test feed', url="https://example.com", category=feed_cat)
51 | feed.save()
52 | self.posts = []
53 | for idx in range(3):
54 | feed_post = FeedEntry(feed=feed,
55 | title="Here is a post #{}".format(idx),
56 | url="https://localhost:8000/post-{}".format(idx),
57 | published=datetime.utcnow().replace(tzinfo=timezone(timedelta(0))))
58 | feed_post.save()
59 | self.posts.append(feed_post)
60 |
61 | def test_main_page(self):
62 | response = self.client.login(**self.credentials)
63 | response = self.client.get('/', follow=True)
64 | self.assertEqual(response.status_code, 200)
65 |
66 | for post in self.posts:
67 | self.assertIn(post.title, str(response.content))
68 | self.assertIn(post.url, str(response.content))
69 |
70 |
71 | def test_mark_read(self):
72 | response = self.client.login(**self.credentials)
73 | id_list = '{},{}'.format(self.posts[0].id, self.posts[1].id)
74 | entry_types = '{},{}'.format(self.posts[0].entry_type, self.posts[1].entry_type)
75 | response = self.client.post('/feeds/mark-read/',
76 | {'id_list': id_list,
77 | 'entry_types': entry_types
78 | })
79 | self.assertEqual(response.status_code, 200)
80 | self.assertJSONEqual(
81 | str(response.content, encoding='utf8'),
82 | {'success': True}
83 | )
84 |
85 | self.assertTrue(FeedEntry.objects.get(pk=self.posts[0].id).read)
86 | self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).read)
87 |
88 | response = self.client.get('/')
89 |
90 | self.assertIn(self.posts[2].title, str(response.content))
91 | self.assertIn(self.posts[2].url, str(response.content))
92 |
93 | self.assertNotIn(self.posts[1].title, str(response.content))
94 | self.assertNotIn(self.posts[1].url, str(response.content))
95 |
96 | self.assertNotIn(self.posts[0].title, str(response.content))
97 | self.assertNotIn(self.posts[0].url, str(response.content))
98 |
99 |
100 | def test_mark_read_later(self):
101 | response = self.client.login(**self.credentials)
102 | response = self.client.post('/feeds/mark-read-later/',
103 | {'entry_id': self.posts[1].id,
104 | 'entry_type': self.posts[1].entry_type
105 | })
106 | self.assertEqual(response.status_code, 200)
107 | self.assertJSONEqual(
108 | str(response.content, encoding='utf8'),
109 | {'success': True}
110 | )
111 |
112 | self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).read_later)
113 |
114 |
115 | response = self.client.get('/read-later/')
116 |
117 | self.assertIn(self.posts[1].title, str(response.content))
118 | self.assertIn(self.posts[1].url, str(response.content))
119 |
120 | self.assertNotIn(self.posts[2].title, str(response.content))
121 | self.assertNotIn(self.posts[2].url, str(response.content))
122 |
123 | self.assertNotIn(self.posts[0].title, str(response.content))
124 | self.assertNotIn(self.posts[0].url, str(response.content))
125 |
126 |
127 | response = self.client.post('/feeds/unmark-read-later/',
128 | {'entry_id': self.posts[1].id,
129 | 'entry_type': self.posts[1].entry_type
130 | })
131 | self.assertEqual(response.status_code, 200)
132 | self.assertJSONEqual(
133 | str(response.content, encoding='utf8'),
134 | {'success': True}
135 | )
136 |
137 | self.assertFalse(FeedEntry.objects.get(pk=self.posts[1].id).read_later)
138 |
139 | response = self.client.get('/read-later/')
140 |
141 | self.assertNotIn(self.posts[1].title, str(response.content))
142 | self.assertNotIn(self.posts[1].url, str(response.content))
143 |
144 | def test_mark_interesting(self):
145 | response = self.client.login(**self.credentials)
146 | response = self.client.post('/feeds/mark-interesting/',
147 | {'entry_id': self.posts[1].id,
148 | 'entry_type': self.posts[1].entry_type
149 | })
150 | self.assertEqual(response.status_code, 200)
151 | self.assertJSONEqual(
152 | str(response.content, encoding='utf8'),
153 | {'success': True}
154 | )
155 |
156 | self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).interesting)
157 | self.assertFalse(FeedEntry.objects.get(pk=self.posts[1].id).read_later)
158 |
159 |
160 | class ParseFeedTests(TestCase):
161 |
162 | def setUp(self):
163 | # Every test needs a client.
164 | self.client = Client()
165 |
166 | #user account
167 | self.credentials = {
168 | 'username': 'testuser',
169 | 'password': 'secret'}
170 | User.objects.create_user(**self.credentials)
171 | self.feed_files = [
172 | 'feeds/tests/iapp.rss',
173 | 'feeds/tests/nomnom.rss',
174 | 'feeds/tests/jvns_ca.xml',
175 | ]
176 |
177 | def test_feed_parsing_units(self):
178 | for feed_file in self.feed_files:
179 | parser = feedparser.parse(feed_file)
180 | file_contents = open(feed_file, 'r').read()
181 | for entry in parser.entries:
182 |
183 | # test pub date
184 | pub_date = get_pub_date(entry)
185 | self.assertTrue(pub_date.year >= 2023)
186 | self.assertTrue(isinstance(pub_date, datetime))
187 |
188 | # test get_title
189 | title = get_title(entry)
190 | self.assertTrue(isinstance(title, str))
191 |                 self.assertTrue(len(title) <= 355)
192 | if hasattr(entry, 'title'):
193 | self.assertEqual(title, entry.title)
194 |
195 | # test get_description
196 | description = get_description(entry)
197 | self.assertTrue(isinstance(description, str))
198 | if hasattr(entry, 'content'):
199 | self.assertIn(description,
200 | ''.join([ce.get('value') for ce in entry.content]))
201 | elif hasattr(entry, 'description'):
202 | self.assertEqual(description, entry.description)
203 |
204 | # test get_image
205 | image = get_image(description)
206 | if hasattr(entry, 'content'):
207 | if image:
208 | self.assertTrue(isinstance(image, str))
209 | self.assertIn(image,
210 | ''.join([ce.get('value') for ce in entry.content]))
211 | else:
212 | self.assertEqual(image, None)
213 |
214 | def test_feed_reader(self):
215 | response = self.client.login(**self.credentials)
216 | initial_response = self.client.get('/')
217 | self.assertEqual(initial_response.status_code, 200)
218 |
219 | feed = Feed(title='test', url=self.feed_files[1]) # WARNING: this has to be updated
220 | # for timely parsing (i.e. update RSS feed)
221 | feed.save()
222 | parse_feed(feed)
223 |
224 | response = self.client.get('/')
225 |
226 | self.assertEqual(response.status_code, 200)
227 |
228 | self.assertNotEqual(initial_response.content, response.content)
229 | self.assertTrue(len(response.content) > len(initial_response.content))
230 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Hello! Welcome to Priveedly
2 |
3 | Priveedly is a personal content and feed reader with the ability to build your own small, personalized machine learning models. Priveedly helps you follow, read, and save what you like, all on your own server.
4 |
5 | Currently supported content:
6 |
7 | - Any valid RSS feed
8 | - Subreddits
9 | - [HackerNews](https://news.ycombinator.com/) hottest
10 | - [Lobste.rs](https://lobste.rs/) hottest
11 |
12 | No longer supported but might work:
13 |
14 | - Twitter/X API
15 |
16 | The project is built using the [Django web framework](https://www.djangoproject.com/) and [scikit-learn](https://scikit-learn.org/stable/), with very minimal JavaScript and Python-based feed scraping. It is currently only useful for you if you know Python and can navigate setting up your own server.
17 |
18 | > A short video [introduction to Priveedly is on YouTube](https://youtu.be/_aHZlSUO8Qs)
19 |
20 | If this is too advanced for you, stay tuned! I plan to find some one-click install setups for non-dev/tech folks. 🙂
21 |
22 | ### Why run your own content and feed-reader?
23 |
24 | - 🎯 **Autonomy**: Decide what types of content you want to read and update yourself without an algorithm.
25 | - 🔐 **Privacy**: It's a private service, for you and by you. Unless you give someone your login, they won't read your feed.
26 | - 💸 **No ads**: Because why do you want ads in the middle of your reading?
27 | - 🤓 **Self-study**: Because training ML models for yourself and by yourself can be a fun way to safely do data science and ML without contributing your data to a large-scale content platform.
28 |
29 | You can read more [about my experience and motivation on my blog](https://blog.kjamistan.com/priveedly-your-private-and-personal-content-reader-and-recommender.html).
30 |
31 | ## Table of Contents
32 | - [Installation](#installation)
33 | - [Usage](#usage)
34 | - [Contributing](#contributing)
35 | - [License](#license)
36 |
37 | ## Installation
38 |
39 | This repository doesn't yet have an easy one-click launch, but I am hoping someone might contribute that!
40 |
41 | For those who already know Python, the basic setup is as follows:
42 |
43 | 1. Clone the repository:
44 | ```bash
45 | git clone https://github.com/kjam/priveedly.git
46 | ```
47 |
48 | 2. Create a virtual or Conda environment (the project is tested on Python 3.9; see the note below).
49 | ```bash
50 | conda create -n priveedly python=3.9
51 | ```
52 |
53 | 3. Install dependencies:
54 | ```bash
55 | conda install pip
56 | pip install -r requirements.txt
57 | ```
58 |
59 | 4. Create a database that can be used with Django, such as PostgreSQL (or your favorite supported database).
60 |
61 | 5. Add an environment file with the necessary variables (see the [example environment file](example_env)) and save it as '.env' in the main directory.
62 |
63 | 6. Migrate the database.
64 | ```bash
65 | python manage.py makemigrations
66 | python manage.py migrate
67 | ```
68 |
69 | 7. Create a super user.
70 | ```bash
71 | python manage.py createsuperuser
72 | ```
73 |
74 | 8. Run the server and navigate to /admin to log in.
75 | ```bash
76 | python manage.py runserver
77 | ```
78 |
79 | The project is currently only tested on Python 3.9, but appears to work for other versions. Due to the dependencies, you might need to hold back your Python version if you plan on using tools like Jupyter and scikit-learn to train your models.
80 |
81 | ## Usage
82 |
83 | ### Local use
84 |
85 | First, get everything installed and working as described above. Then, enter a few feeds and subreddits you might like to read by adding them directly in the admin site (navigate there after running the runserver command).
86 |
87 | Then, you can test whether the parsing works by opening a new terminal in your virtual environment and running the following command.
88 |
89 | ```bash
90 | python manage.py parse_all
91 | ```
92 |
93 | This command parses your saved feeds and subreddits, along with the HackerNews and Lobste.rs top stories. If you'd like to change the parsing, please update the /feeds/management/commands/parse_all.py file.
94 |
95 | You can then navigate to the homepage when logged in (http://127.0.0.1:8000) and see the parsed feeds. The general reading flow is as follows:
96 |
97 | - ⏳ **Progress Bar**: On the top you have a bar telling you how far you are along in your reading backlog. It's very minimal and if you want to change it or redesign it, feel free!
98 |
99 | - 📚 **Articles**: Each page loads an oldest-first list of all of your different feeds, subreddits and other stories from HackerNews and Lobste.rs. If you click the title of any article, it will navigate to that article (use CTRL+click if you want to open it in a new tab).
100 |
101 | - ✅️ **Read Later**: If you see something you want to read later, click Read Later and it will be saved for later access. To see your read-later list, navigate to http://127.0.0.1:8000/read-later
102 |
103 | - 📊 To train your **own recommendation model**, you'll need to save things to read-later, and then mark the content you like as interesting. If you want to use a different workflow, you could also change the main view to expose the interesting button on the main page.
104 |
105 | - 🗂️ **Recommended**: Once you have your own recommendation model running, you can visit http://127.0.0.1:8000/recommended to skip to the articles you might like the most.
106 |
107 | I wouldn't recommend running it locally if you want to use it regularly, because it can parse and run in the background if you get it set up on a server (see below).
108 |
109 | For the first few thousand entries, I wouldn't bother trying to train or use the machine learning parts, because there won't be enough data for a useful model. Once you have many thousands of posts, it's worth using the machine learning example. If you are new to building language classification models, I recommend starting by watching [my video](https://youtu.be/AMy3K3NbrLw) and then trying it for yourself by running Jupyter in the notebooks folder and following along.
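
If you haven't used Jupyter before, a minimal way to get it going (assuming your priveedly environment is active and Jupyter isn't already installed in it) is:

```bash
pip install jupyterlab
cd notebooks
jupyter lab
```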
110 |
111 | To get Reddit working, you'll need to [sign up to get an API key](https://praw.readthedocs.io/en/stable/getting_started/configuration.html#configuration) and store it in your environment file; the [example environment file](example_env) shows the variables you'll need. As above, that file should be saved as '.env' in the main directory.
112 |
113 | ### Personal server use
114 |
115 | Ideally, you have access to a server and can get Priveedly set up on that server. If you are familiar with [Ansible](https://docs.ansible.com/ansible/latest/index.html), you can see several reference scripts for your use in the deployment folder but they probably require updates or modifications based on your operating system and cloud provider.
116 |
117 | If you are willing to contribute a Dockerfile to ease deployment for those unfamiliar with Ansible, I would greatly appreciate any contributions.
118 |
119 | It's important to note that when using a server, you'll want:
120 |
121 | - Enough storage to store all of your favorite articles and run the parser
122 | - Eventually enough RAM to run ML classification tasks
123 | - Good connectivity/throughput for parsing
124 |
125 | I recommend running the parse_all command every 2-3 hours if you plan on using the app relatively frequently.
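
As a sketch of what that could look like (the schedule, paths, and Python binary here are placeholders to adapt to your own server):

```bash
# crontab entry: parse all feeds every 3 hours
0 */3 * * * cd /path/to/priveedly && /path/to/env/bin/python manage.py parse_all
```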
126 |
127 | Once you have enough data to train your own model, you'll want to do that locally and then deploy it to the server.
128 |
129 | ### Training your own model
130 |
131 | If machine learning is new to you, you can get started by watching [my YouTube video](https://youtu.be/AMy3K3NbrLw) and then trying it for yourself by running Jupyter in the notebooks folder and following along.
132 |
133 | > Note: When/if you modify the data preparation steps, you must also modify the rate_all.py script in the feeds/management/commands folder. The data going into the model on the server must match how the model was trained.
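
As a rough illustration of the idea (a sketch only, not the actual rate_all.py, and assuming you've copied the notebook's tokenize_content helper into the script):

```python
import joblib

# load the pipeline you trained in the notebook and uploaded to the server
pipeline = joblib.load("pipeline.pkl")

def rate(entry_text):
    # clean the text exactly the way the training data was cleaned,
    # otherwise the model sees differently prepared input
    cleaned = " ".join(tokenize_content(entry_text))
    return int(pipeline.predict([cleaned])[0])
```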
134 |
135 | Once your model is trained, I recommend running the rate_all command about once a day, or every 6 hours if you have an especially busy feed reader.
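
Extending the crontab sketch above (again, the paths are placeholders):

```bash
# crontab entry: rate new entries once a day at 06:00
0 6 * * * cd /path/to/priveedly && /path/to/env/bin/python manage.py rate_all
```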
136 |
137 | I'm also happy to host more notebook and training contributions if you find a different model type that works well for you, or if you have another notebook that works better for a different set of languages or offers a more production-ready setup. You can also post your own notebook and explanations on your own site/repo for others to learn from!
138 |
139 | ### Some additional notes
140 |
141 | 1. So long, tweets: Twitter changed their API and moved to paid-only access after I had already been using this reader for a year or so. 😩 Therefore, I am not sure whether the /tweets section still works. If you are an active X user and want to test it out and let me know, I'd appreciate the feedback!
142 |
143 | 2. One-click deploy: I'd be really happy if someone wants to figure out an easy way for people to one-click deploy this. If you offer a service like this, please let me know and I'll see if I can get the repository in a shape to get it working!
144 |
145 | 3. Monkeypatching with Django: I originally started monkeypatching some of the site parsing to add tests, only to find that Django testing and monkeypatching are a bit of a pain when used together. If you have experience making these play nice, I'd love some help.
146 |
147 | 4. Supporting other languages and classifiers with beginner-friendly instructions: Because I'd like this to be useful for people of all ML-levels and also folks who like reading non-English texts, it'd be awesome to have more Jupyter contributions and accompanying posts/videos to help folks test out different types of one-person-use recommenders. Contributions are very welcome!
148 |
149 | ## Contributing
150 |
151 | I heartily welcome contributions that would benefit others. First and foremost, please use the project yourself before making significant contributions.
152 |
153 | I also suggest looking through the Issues for open asks from myself and other users.
154 |
155 | In general, please use the following workflow:
156 |
157 | 1. Fork the repository.
158 | 2. Create a new branch: `git checkout -b feature-name`.
159 | 3. Make your changes. Please include tests if providing significant new functionality.
160 | 4. Push your branch: `git push origin feature-name`.
161 | 5. Create a pull request.
162 |
163 | ## License
164 |
165 | The project is shared under the [GNU General Public License](LICENSE.md).
166 |
--------------------------------------------------------------------------------
/static/simple.css:
--------------------------------------------------------------------------------
1 | /* Global variables. */
2 | :root,
3 | ::backdrop {
4 | /* Set sans-serif & mono fonts */
5 | --sans-font: -apple-system, BlinkMacSystemFont, "Avenir Next", Avenir,
6 | "Nimbus Sans L", Roboto, "Noto Sans", "Segoe UI", Arial, Helvetica,
7 | "Helvetica Neue", sans-serif;
8 | --mono-font: Consolas, Menlo, Monaco, "Andale Mono", "Ubuntu Mono", monospace;
9 | --standard-border-radius: 5px;
10 |
11 | /* Default (light) theme */
12 | --bg: #fff;
13 | --accent-bg: #f5f7ff;
14 | --text: #212121;
15 | --text-light: #585858;
16 | --border: #898EA4;
17 | --accent: #C15CF6;
18 | --code: #d81b60;
19 | --preformatted: #444;
20 | --marked: #ffdd33;
21 | --disabled: #efefef;
22 | }
23 |
24 | /* Dark theme */
25 | @media (prefers-color-scheme: dark) {
26 | :root,
27 | ::backdrop {
28 | color-scheme: dark;
29 | --bg: #212121;
30 | --accent-bg: #2b2b2b;
31 | --text: #dcdcdc;
32 | --text-light: #ababab;
33 | --accent: #C15CF6;
34 | --code: #f06292;
35 | --preformatted: #ccc;
36 | --disabled: #111;
37 | }
38 | /* Add a bit of transparency so light media isn't so glaring in dark mode */
39 | img,
40 | video {
41 | opacity: 0.8;
42 | }
43 | }
44 |
45 | /* Reset box-sizing */
46 | *, *::before, *::after {
47 | box-sizing: border-box;
48 | }
49 |
50 | /* Reset default appearance */
51 | textarea,
52 | select,
53 | input,
54 | progress {
55 | appearance: none;
56 | -webkit-appearance: none;
57 | -moz-appearance: none;
58 | }
59 |
60 | html {
61 | /* Set the font globally */
62 | font-family: var(--sans-font);
63 | scroll-behavior: smooth;
64 | }
65 |
66 | /* Make the body a nice central block */
67 | body {
68 | color: var(--text);
69 | background-color: var(--bg);
70 | font-size: 1.15rem;
71 | line-height: 1.5;
72 | display: grid;
73 | grid-template-columns: 1fr min(45rem, 90%) 1fr;
74 | margin: 0;
75 | }
76 | body > * {
77 | grid-column: 2;
78 | }
79 |
80 | /* Make the header bg full width, but the content inline with body */
81 | body > header {
82 | background-color: var(--accent-bg);
83 | border-bottom: 1px solid var(--border);
84 | text-align: center;
85 | padding: 0 0.5rem 2rem 0.5rem;
86 | grid-column: 1 / -1;
87 | }
88 |
89 | body > header h1 {
90 | max-width: 1200px;
91 | margin: 1rem auto;
92 | }
93 |
94 | body > header p {
95 | max-width: 40rem;
96 | margin: 1rem auto;
97 | }
98 |
99 | /* Add a little padding to ensure spacing is correct between content and header > nav */
100 | main {
101 | padding-top: 1.5rem;
102 | }
103 |
104 | body > footer {
105 | margin-top: 4rem;
106 | padding: 2rem 1rem 1.5rem 1rem;
107 | color: var(--text-light);
108 | font-size: 0.9rem;
109 | text-align: center;
110 | border-top: 1px solid var(--border);
111 | }
112 |
113 | /* Format headers */
114 | h1 {
115 | font-size: 3rem;
116 | }
117 |
118 | h2 {
119 | font-size: 2.6rem;
120 | margin-top: 3rem;
121 | }
122 |
123 | h3 {
124 | font-size: 2rem;
125 | margin-top: 3rem;
126 | }
127 |
128 | h4 {
129 | font-size: 1.44rem;
130 | }
131 |
132 | h5 {
133 | font-size: 1.15rem;
134 | }
135 |
136 | h6 {
137 | font-size: 0.96rem;
138 | }
139 |
140 | /* Prevent long strings from overflowing container */
141 | p, h1, h2, h3, h4, h5, h6 {
142 | overflow-wrap: break-word;
143 | }
144 |
145 | /* Fix line height when title wraps */
146 | h1,
147 | h2,
148 | h3 {
149 | line-height: 1.1;
150 | }
151 |
152 | /* Reduce header size on mobile */
153 | @media only screen and (max-width: 720px) {
154 | h1 {
155 | font-size: 2.5rem;
156 | }
157 |
158 | h2 {
159 | font-size: 2.1rem;
160 | }
161 |
162 | h3 {
163 | font-size: 1.75rem;
164 | }
165 |
166 | h4 {
167 | font-size: 1.25rem;
168 | }
169 | }
170 |
171 | /* Format links & buttons */
172 | a,
173 | a:visited {
174 | color: var(--accent);
175 | }
176 |
177 | a:hover {
178 | text-decoration: none;
179 | }
180 |
181 | button,
182 | [role="button"],
183 | input[type="submit"],
184 | input[type="reset"],
185 | input[type="button"],
186 | label[type="button"] {
187 | border: none;
188 | border-radius: var(--standard-border-radius);
189 | background-color: var(--accent);
190 | font-size: 1rem;
191 | color: var(--bg);
192 | padding: 0.7rem 0.9rem;
193 | margin: 0.5rem 0;
194 | }
195 |
196 | button[disabled],
197 | [role="button"][aria-disabled="true"],
198 | input[type="submit"][disabled],
199 | input[type="reset"][disabled],
200 | input[type="button"][disabled],
201 | input[type="checkbox"][disabled],
202 | input[type="radio"][disabled],
203 | select[disabled] {
204 | cursor: not-allowed;
205 | }
206 |
207 | input:disabled,
208 | textarea:disabled,
209 | select:disabled,
210 | button[disabled] {
211 | cursor: not-allowed;
212 | background-color: var(--disabled);
213 |   color: var(--text-light);
214 | }
215 |
216 | input[type="range"] {
217 | padding: 0;
218 | }
219 |
220 | /* Set the cursor to '?' on an abbreviation and style the abbreviation to show that there is more information underneath */
221 | abbr[title] {
222 | cursor: help;
223 | text-decoration-line: underline;
224 | text-decoration-style: dotted;
225 | }
226 |
227 | button:enabled:hover,
228 | [role="button"]:not([aria-disabled="true"]):hover,
229 | input[type="submit"]:enabled:hover,
230 | input[type="reset"]:enabled:hover,
231 | input[type="button"]:enabled:hover,
232 | label[type="button"]:hover {
233 | filter: brightness(1.4);
234 | cursor: pointer;
235 | }
236 |
237 | button:focus-visible:where(:enabled, [role="button"]:not([aria-disabled="true"])),
238 | input:enabled:focus-visible:where(
239 | [type="submit"],
240 | [type="reset"],
241 | [type="button"]
242 | ) {
243 | outline: 2px solid var(--accent);
244 | outline-offset: 1px;
245 | }
246 |
247 | /* Format navigation */
248 | header > nav {
249 | font-size: 1rem;
250 | line-height: 2;
251 | padding: 1rem 0 0 0;
252 | }
253 |
254 | /* Use flexbox to allow items to wrap, as needed */
255 | header > nav ul,
256 | header > nav ol {
257 | align-content: space-around;
258 | align-items: center;
259 | display: flex;
260 | flex-direction: row;
261 | flex-wrap: wrap;
262 | justify-content: center;
263 | list-style-type: none;
264 | margin: 0;
265 | padding: 0;
266 | }
267 |
268 | /* List items are inline elements, make them behave more like blocks */
269 | header > nav ul li,
270 | header > nav ol li {
271 | display: inline-block;
272 | }
273 |
274 | header > nav a,
275 | header > nav a:visited {
276 | margin: 0 0.5rem 1rem 0.5rem;
277 | border: 1px solid var(--border);
278 | border-radius: var(--standard-border-radius);
279 | color: var(--text);
280 | display: inline-block;
281 | padding: 0.1rem 1rem;
282 | text-decoration: none;
283 | }
284 |
285 | header > nav a:hover {
286 | border-color: var(--accent);
287 | color: var(--accent);
288 | cursor: pointer;
289 | }
290 |
291 | /* Reduce nav size on mobile */
292 | @media only screen and (max-width: 720px) {
293 | header > nav a {
294 | border: none;
295 | padding: 0;
296 | text-decoration: underline;
297 | line-height: 1;
298 | }
299 | }
300 |
301 | /* Consolidate box styling */
302 | aside, details, pre, progress {
303 | background-color: var(--accent-bg);
304 | border: 1px solid var(--border);
305 | border-radius: var(--standard-border-radius);
306 | margin-bottom: 1rem;
307 | }
308 |
309 | aside {
310 | font-size: 1rem;
311 | width: 30%;
312 | padding: 0 15px;
313 | margin-left: 15px;
314 | float: right;
315 | }
316 |
317 | /* Make aside full-width on mobile */
318 | @media only screen and (max-width: 720px) {
319 | aside {
320 | width: 100%;
321 | float: none;
322 | margin-left: 0;
323 | }
324 | }
325 |
326 | article, fieldset, dialog {
327 | border: 1px solid var(--border);
328 | padding: 1rem;
329 | border-radius: var(--standard-border-radius);
330 | margin-bottom: 1rem;
331 | }
332 |
333 | article h2:first-child,
334 | section h2:first-child {
335 | margin-top: 1rem;
336 | }
337 |
338 | section {
339 | border-top: 1px solid var(--border);
340 | border-bottom: 1px solid var(--border);
341 | padding: 2rem 1rem;
342 | margin: 3rem 0;
343 | }
344 |
345 | /* Don't double separators when chaining sections */
346 | section + section,
347 | section:first-child {
348 | border-top: 0;
349 | padding-top: 0;
350 | }
351 |
352 | section:last-child {
353 | border-bottom: 0;
354 | padding-bottom: 0;
355 | }
356 |
357 | details {
358 | padding: 0.7rem 1rem;
359 | }
360 |
361 | summary {
362 | cursor: pointer;
363 | font-weight: bold;
364 | padding: 0.7rem 1rem;
365 | margin: -0.7rem -1rem;
366 | word-break: break-all;
367 | }
368 |
369 | details[open] > summary + * {
370 | margin-top: 0;
371 | }
372 |
373 | details[open] > summary {
374 | margin-bottom: 0.5rem;
375 | }
376 |
377 | details[open] > :last-child {
378 | margin-bottom: 0;
379 | }
380 |
381 | /* Format tables */
382 | table {
383 | border-collapse: collapse;
384 | margin: 1.5rem 0;
385 | }
386 |
387 | td,
388 | th {
389 | border: 1px solid var(--border);
390 | text-align: left;
391 | padding: 0.5rem;
392 | }
393 |
394 | th {
395 | background-color: var(--accent-bg);
396 | font-weight: bold;
397 | }
398 |
399 | tr:nth-child(even) {
400 | /* Set every other cell slightly darker. Improves readability. */
401 | background-color: var(--accent-bg);
402 | }
403 |
404 | table caption {
405 | font-weight: bold;
406 | margin-bottom: 0.5rem;
407 | }
408 |
409 | /* Format forms */
410 | textarea,
411 | select,
412 | input {
413 | font-size: inherit;
414 | font-family: inherit;
415 | padding: 0.5rem;
416 | margin-bottom: 0.5rem;
417 | color: var(--text);
418 | background-color: var(--bg);
419 | border: 1px solid var(--border);
420 | border-radius: var(--standard-border-radius);
421 | box-shadow: none;
422 | max-width: 100%;
423 | display: inline-block;
424 | }
425 | label {
426 | display: block;
427 | }
428 | textarea:not([cols]) {
429 | width: 100%;
430 | }
431 |
432 | /* Add arrow to drop-down */
433 | select:not([multiple]) {
434 | background-image: linear-gradient(45deg, transparent 49%, var(--text) 51%),
435 | linear-gradient(135deg, var(--text) 51%, transparent 49%);
436 | background-position: calc(100% - 15px), calc(100% - 10px);
437 | background-size: 5px 5px, 5px 5px;
438 | background-repeat: no-repeat;
439 | padding-right: 25px;
440 | }
441 |
442 | /* checkbox and radio button style */
443 | input[type="checkbox"],
444 | input[type="radio"] {
445 | vertical-align: middle;
446 | position: relative;
447 | width: min-content;
448 | }
449 |
450 | input[type="checkbox"] + label,
451 | input[type="radio"] + label {
452 | display: inline-block;
453 | }
454 |
455 | input[type="radio"] {
456 | border-radius: 100%;
457 | }
458 |
459 | input[type="checkbox"]:checked,
460 | input[type="radio"]:checked {
461 | background-color: var(--accent);
462 | }
463 |
464 | input[type="checkbox"]:checked::after {
465 | /* Creates a rectangle with colored right and bottom borders which is rotated to look like a check mark */
466 | content: " ";
467 | width: 0.18em;
468 | height: 0.32em;
469 | border-radius: 0;
470 | position: absolute;
471 | top: 0.05em;
472 | left: 0.17em;
473 | background-color: transparent;
474 | border-right: solid var(--bg) 0.08em;
475 | border-bottom: solid var(--bg) 0.08em;
476 | font-size: 1.8em;
477 | transform: rotate(45deg);
478 | }
479 | input[type="radio"]:checked::after {
480 | /* creates a colored circle for the checked radio button */
481 | content: " ";
482 | width: 0.25em;
483 | height: 0.25em;
484 | border-radius: 100%;
485 | position: absolute;
486 | top: 0.125em;
487 | background-color: var(--bg);
488 | left: 0.125em;
489 | font-size: 32px;
490 | }
491 |
492 | /* Makes input fields wider on smaller screens */
493 | @media only screen and (max-width: 720px) {
494 | textarea,
495 | select,
496 | input {
497 | width: 100%;
498 | }
499 | }
500 |
501 | /* Set a height for color input */
502 | input[type="color"] {
503 | height: 2.5rem;
504 | padding: 0.2rem;
505 | }
506 |
507 | /* do not show border around file selector button */
508 | input[type="file"] {
509 | border: 0;
510 | }
511 |
512 | /* Misc body elements */
513 | hr {
514 | border: none;
515 | height: 1px;
516 | background: var(--border);
517 | margin: 1rem auto;
518 | }
519 |
520 | mark {
521 | padding: 2px 5px;
522 | border-radius: var(--standard-border-radius);
523 | background-color: var(--marked);
524 | color: black;
525 | }
526 |
527 | img,
528 | video {
529 | max-width: 100%;
530 | height: auto;
531 | border-radius: var(--standard-border-radius);
532 | }
533 |
534 | figure {
535 | margin: 0;
536 | display: block;
537 | overflow-x: auto;
538 | }
539 |
540 | figcaption {
541 | text-align: center;
542 | font-size: 0.9rem;
543 | color: var(--text-light);
544 | margin-bottom: 1rem;
545 | }
546 |
547 | blockquote {
548 | margin: 2rem 0 2rem 2rem;
549 | padding: 0.4rem 0.8rem;
550 | border-left: 0.35rem solid var(--accent);
551 | color: var(--text-light);
552 | font-style: italic;
553 | }
554 |
555 | cite {
556 | font-size: 0.9rem;
557 | color: var(--text-light);
558 | font-style: normal;
559 | }
560 |
561 | dt {
562 | color: var(--text-light);
563 | }
564 |
565 | /* Use mono font for code elements */
566 | code,
567 | pre,
568 | pre span,
569 | kbd,
570 | samp {
571 | font-family: var(--mono-font);
572 | color: var(--code);
573 | }
574 |
575 | kbd {
576 | color: var(--preformatted);
577 | border: 1px solid var(--preformatted);
578 | border-bottom: 3px solid var(--preformatted);
579 | border-radius: var(--standard-border-radius);
580 | padding: 0.1rem 0.4rem;
581 | }
582 |
583 | pre {
584 | padding: 1rem 1.4rem;
585 | max-width: 100%;
586 | overflow: auto;
587 | color: var(--preformatted);
588 | }
589 |
590 | /* Fix embedded code within pre */
591 | pre code {
592 | color: var(--preformatted);
593 | background: none;
594 | margin: 0;
595 | padding: 0;
596 | }
597 |
598 | /* Progress bars */
599 | /* Declarations are repeated because you */
600 | /* cannot combine vendor-specific selectors */
601 | progress {
602 | width: 100%;
603 | }
604 |
605 | progress:indeterminate {
606 | background-color: var(--accent-bg);
607 | }
608 |
609 | progress::-webkit-progress-bar {
610 | border-radius: var(--standard-border-radius);
611 | background-color: var(--accent-bg);
612 | }
613 |
614 | progress::-webkit-progress-value {
615 | border-radius: var(--standard-border-radius);
616 | background-color: var(--accent);
617 | }
618 |
619 | progress::-moz-progress-bar {
620 | border-radius: var(--standard-border-radius);
621 | background-color: var(--accent);
622 | transition-property: width;
623 | transition-duration: 0.3s;
624 | }
625 |
626 | progress:indeterminate::-moz-progress-bar {
627 | background-color: var(--accent-bg);
628 | }
629 |
630 | dialog {
631 | max-width: 40rem;
632 | margin: auto;
633 | }
634 |
635 | dialog::backdrop {
636 | background-color: var(--bg);
637 | opacity: 0.8;
638 | }
639 |
640 | @media only screen and (max-width: 720px) {
641 | dialog {
642 | max-width: 100%;
643 | margin: auto 1em;
644 | }
645 | }
646 |
647 | /* Classes for buttons and notices */
648 | .button,
649 | .button:visited {
650 | display: inline-block;
651 | text-decoration: none;
652 | border: none;
653 | border-radius: 5px;
654 | background: var(--accent);
655 | font-size: 1rem;
656 | color: var(--bg);
657 | padding: 0.7rem 0.9rem;
658 | margin: 0.5rem 0;
659 | }
660 |
661 | .button:hover,
662 | .button:focus {
663 | filter: brightness(1.4);
664 | cursor: pointer;
665 | }
666 |
667 | .notice {
668 | background: var(--accent-bg);
669 | border: 2px solid var(--border);
670 | border-radius: 5px;
671 | padding: 1.5rem;
672 | margin: 2rem 0;
673 | }
674 |
--------------------------------------------------------------------------------
/notebooks/Training and Testing Simple Recommendation Classifiers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "dd4c3a25-bf29-4236-b89e-f1926867c11f",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "from sqlalchemy import create_engine\n",
13 | "from urllib.parse import urlparse\n",
14 | "from nltk.corpus import stopwords\n",
15 | "from nltk import tokenize\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from collections import Counter\n",
18 | "from imblearn.over_sampling import RandomOverSampler\n",
19 | "from bs4 import BeautifulSoup\n",
20 | "\n",
21 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
22 | "from sklearn.naive_bayes import ComplementNB\n",
23 | "from sklearn.linear_model import LogisticRegression\n",
24 | "from sklearn.model_selection import RandomizedSearchCV\n",
25 | "from sklearn.pipeline import Pipeline\n",
26 | "from sklearn.svm import SVC\n",
27 | "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
28 | "\n",
29 | "import joblib\n",
30 | "import os\n",
31 | "import re\n",
32 | "import string\n",
33 | "import html\n",
34 | "\n",
35 | "from pprint import pprint\n",
36 | "from time import time\n",
37 | "from datetime import datetime"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "id": "76fca79d-1422-4b82-b973-4aefd64167f4",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "import nltk\n",
48 | "nltk.download('punkt_tab')\n",
49 | "nltk.download('stopwords')"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "85f91db7-9646-4975-bac6-42f91be92251",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import os\n",
60 | "from dotenv import load_dotenv, dotenv_values \n",
61 | "load_dotenv() "
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "id": "4474b034-a67a-457d-9204-e798e2672469",
67 | "metadata": {},
68 | "source": [
69 | "## Priveedly: Training a Simple Content Recommender (Classifier) for Personal Use\n",
70 | "\n",
71 | "This notebook is originally for use with [Priveedly](https://blog.kjamistan.com/priveedly-your-private-and-personal-content-reader-and-recommender.html), a personal use content aggregator system available on [GitHub](https://github.com/kjam/priveedly).\n",
72 | "\n",
73 | "- There is a YouTube video to walk you through the notebook at a high level, in case it is helpful! \n",
74 | "- There are some links below to learn more about how to use scikit-learn.\n",
75 | "- I welcome feedback and contributions via GitHub!\n",
76 | "- Most importantly: HAVE FUN playing with ML concepts!\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "b9b5a9fd-64eb-4038-a10e-24af5695aae1",
82 | "metadata": {},
83 | "source": [
84 | "# Getting text from Postgres into Pandas"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "id": "64328592-1b29-4cca-a4e3-90dd4c3b0320",
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "if os.path.isfile('data/cleaned.csv'):\n",
95 | " print (\"SKIP TO LOADING CLEANED DF!!!\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "df266446-4771-4440-9c4b-8463703c509e",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "engine = create_engine(os.getenv('LOCAL_DB_CONNSTR'))"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "268bea0d-9996-4bf4-9ad7-040e15e2dd63",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "sites_df = pd.read_sql(\n",
116 | " \"select title, url, description, site_name, interesting from sites_sitepost WHERE published::date >= '2023-01-01'\", \n",
117 | " con=engine)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "id": "6c072a4f-b809-44c6-b8e8-0ced8610b9fb",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "feeds_df = pd.read_sql(\n",
128 | " \"select feeds_feedentry.title as title, feeds_feedentry.url as url, feeds_feedentry.description as description, feeds_feed.title as site_name, interesting from feeds_feedentry JOIN feeds_feed ON feeds_feed.id = feed_id WHERE published::date >= '2023-01-01'\", \n",
129 | " con=engine)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "id": "71050239-d0f4-49a6-bae0-e7e9fd943dd5",
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "reddit_df = pd.read_sql(\n",
140 |     "    \"select sites_redditpost.title as title, sites_redditpost.url as url, sites_redditpost.description as description, sites_subreddit.name as site_name, interesting from sites_redditpost JOIN sites_subreddit ON sites_subreddit.id = subreddit_id WHERE published::date >= '2023-01-01'\", \n",
141 | " con=engine)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "e2ee417e-18b6-4f3a-a31b-64544a81e8cb",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "content_df = pd.concat([reddit_df, sites_df, feeds_df])"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "id": "d9729773-70ae-4951-b543-6a050f9d3615",
157 | "metadata": {},
158 | "source": [
159 | "# Evaluating target"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "id": "92887ef8-85d8-4ba3-a446-a20e6072ce98",
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "content_df.interesting = content_df.interesting.astype(int)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "id": "2a77a6b1-6b09-4b04-8791-de63d7b8678e",
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "content_df.interesting.value_counts()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "id": "53a4276c-049e-4c58-bffc-8aaa2e94658b",
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "content_df.interesting.value_counts().iloc[0] / content_df.shape[0]"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "id": "9965b795-cd98-4d8f-a57e-e44e641a37f8",
196 | "metadata": {
197 | "scrolled": true
198 | },
199 | "outputs": [],
200 | "source": [
201 | "content_df.interesting.value_counts().plot.bar()"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "id": "088f5ad6-ac5c-49c4-be23-294a466f4e48",
207 | "metadata": {},
208 | "source": [
209 | "# Preparing the text data\n",
210 | "\n",
211 |     "You'll need to take this code and put it into the priveedly rate_all.py script (see feeds/management/commands/rate_all.py) once you are running your pipeline in production. \n",
212 | "\n",
213 | "If you are using non-English languages, you probably want to play around and adjust this preparation to fit what works for you. I would love if you want to contribute any interesting additional notebooks to the repo! :)"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "id": "5e72236a-7432-46a6-ad96-f23d12ef071c",
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "def tokenize_url(url_str):\n",
224 | " parsed_url = urlparse(url_str)\n",
225 | " return parsed_url.netloc, ' '.join(parsed_url.path.split('/')).replace('-', ' '), parsed_url.query.replace('?', ' ').replace('=', ' ')\n",
226 | "\n",
227 | "def prepare_content(pandas_row):\n",
228 |     "    netloc, path, query = tokenize_url(pandas_row.url)  # currently unused; add these to the join below if you want URL tokens as features\n",
229 | " return ' '.join([pandas_row.title, pandas_row.description, pandas_row.site_name])\n",
230 | "\n",
231 | "CLEAN_NUMBERS = re.compile('[0-9,\\\\.$\\\\%]+')\n",
232 | "CLEAN_NUMBERS_AND_ONE_LETTER = re.compile('([a-z]\\\\d+)|(\\\\d+[a-z])|(\\\\d+[a-z]\\\\d+)')\n",
233 | "CLEAN_REPEATED_PUNCTUATION = re.compile('[!\\\\-\\\\/:-@-`’–{-~\"“”\\\\[\\\\]]+')\n",
234 | "\n",
235 | "def remove_tags_and_lowercase(text): \n",
236 | " # some parts from https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string\n",
237 | " if BeautifulSoup(text, \"html.parser\").find():\n",
238 | " try:\n",
239 |     "            soup = BeautifulSoup(text, \"html.parser\")\n",
240 |     "            text = soup.get_text()\n",
241 |     "        except Exception:\n",
242 | " pass\n",
243 | " cleantext = html.unescape(text).encode('unicode_escape').decode('unicode_escape')\n",
244 | " # you can try this line or other similar things if you want to be more deliberate about cleaning!\n",
245 | " #cleantext = re.sub(CLEAN_NUMBERS_AND_ONE_LETTER, '', cleantext)\n",
246 | " cleantext = re.sub(CLEAN_NUMBERS, '', cleantext)\n",
247 | " cleantext = re.sub(CLEAN_REPEATED_PUNCTUATION, '', cleantext)\n",
248 | " return cleantext.lower()\n",
249 | "\n",
250 | "removal = set(stopwords.words('english')).union(set(string.punctuation))\n",
251 | "\n",
252 | "def tokenize_content(text):\n",
253 | " return [w for w in tokenize.word_tokenize(remove_tags_and_lowercase(text)) if w.lower() not in removal]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "id": "891c0d6c-dc05-4916-aec7-2b67d9588351",
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "content_df['full_text'] = content_df.apply(prepare_content, axis=1)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "id": "15a2b39c-889d-4ed9-a322-a6b86855439f",
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "content_df['cleaned_text'] = content_df['full_text'].map(lambda x: ' '.join(tokenize_content(x)))"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "id": "3561815b-9f3b-4408-ae84-5c97875ab78d",
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "sample = content_df.sample(20)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "id": "45e128c9-ec82-450e-8849-b49c7987b939",
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "sample[[\"full_text\", \"cleaned_text\"]]"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "id": "8fe81494-6edc-411e-a1a4-4fc931d2ac35",
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "content_df.to_csv(\"data/cleaned.csv\")"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "11743eea-ad59-4420-8e9a-641766733b9a",
309 | "metadata": {},
310 | "source": [
311 |     "### Now you can always load the cleaned data this way"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "id": "3cfcdda5-9aa7-4e4e-aa17-7ce74fee0a15",
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "content_df = pd.read_csv(\"data/cleaned.csv\")"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "id": "cfc424f3-8b7d-4a35-80ab-51baa13f76d1",
327 | "metadata": {},
328 | "source": [
329 | "### Dealing with class imbalance\n",
330 | "\n",
331 | "My classes are really lopsided. Yours might be different! If you notice that yours are more even, you can use the orig_X_train as the X_train (and so forth!).\n",
332 | "\n",
333 | "To help with my lopsided classes, I will use [Imbalanced Learn](https://imbalanced-learn.org/)."
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "id": "7d510c49-ea58-4874-b8db-ba0768297d01",
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "oversampler = RandomOverSampler(sampling_strategy=0.15)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "id": "10ed0d75-f9de-4c69-a0b3-77a92031a859",
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "orig_X_train, orig_X_test, orig_y_train, orig_y_test = train_test_split(content_df.cleaned_text, content_df.interesting, \n",
354 | " test_size=0.3, stratify=content_df.interesting)"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "id": "a7326c30-44b2-4f0f-bd5a-d93d88cd2f5f",
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "Counter(orig_y_train), Counter(orig_y_test)"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "id": "48db71c5-80b9-417e-b37e-6c579cf3a120",
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 |     "X_res, y_res = oversampler.fit_resample(content_df[[\"cleaned_text\"]].to_numpy(), content_df.interesting.to_numpy())  # caution: resampling before the split can leak duplicated rows into the test set"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "id": "76509404-152d-46c1-b60f-2fe0b4371b63",
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "Counter(y_res)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "id": "27a9045e-1490-4a8b-bd3e-7594943fb96b",
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "X_train, X_test, y_train, y_test = train_test_split(X_res.flatten(), y_res, test_size=0.3)"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "id": "ec4ee16b-f4da-408f-93c7-3b629d815a8a",
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "Counter(y_train), Counter(y_test)"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "id": "534aedb3-03ba-4ef7-85c3-a397a5013245",
410 | "metadata": {},
411 | "source": [
412 | "### Let's build some NLP pipelines with Scikit-learn!\n",
413 | "\n",
414 | "Scikit-learn is a great library for building machine learning models, especially with smaller personalized datasets, like this one! It has everything you need to get started and a great learning community and documentation.\n",
415 | "\n",
416 | "Want to learn more about scikit-learn and different machine learning models? Check out:\n",
417 | "\n",
418 | "- [Scikit-learn crash course](https://www.youtube.com/watch?v=0B5eIE_1vpU)\n",
419 | "- [Scikit-learn online learning course](https://inria.github.io/scikit-learn-mooc/)\n",
420 | "- [Calmcode](https://calmcode.io)\n",
421 | "- [probabl's YouTube Channel (some advanced topics)](https://www.youtube.com/@probabl_ai)\n",
422 | "\n",
423 | "Hat tip to [Vincent](https://github.com/koaning) for helping me assemble these resources!"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "id": "1dd58cbb-3972-4506-b3e5-b2c5211f0325",
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "svc_pipeline = Pipeline(\n",
434 | " [\n",
435 | " (\"vect\", TfidfVectorizer()),\n",
436 | " (\"clf\", SVC()), # more complex, but maybe not worth it\n",
437 | " ]\n",
438 | ")"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "id": "6eeab15f-a95f-4a86-aa7e-a9c1d506d2fd",
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "bayes_pipeline = Pipeline(\n",
449 | " [\n",
450 | " (\"vect\", TfidfVectorizer()),\n",
451 | " (\"clf\", ComplementNB()), # better at imbalance\n",
452 | " ]\n",
453 | ")"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "id": "af29b2b5-e681-42a6-a34d-7ec4befed5e1",
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "logreg_pipeline = Pipeline(\n",
464 | " [\n",
465 | " (\"vect\", TfidfVectorizer()),\n",
466 | " (\"clf\", LogisticRegression()), # simple, but maybe good enough\n",
467 | " ]\n",
468 | ")"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "id": "7d4a6091-1e7a-49d0-8bb0-360db18603e8",
474 | "metadata": {},
475 | "source": [
476 | "For looking up parameters to test, take a look at the following:\n",
477 | "\n",
478 | "- [TF-IDF Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)\n",
479 | "- [SVC Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n",
480 | "- [Complement Naive Bayes Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html)\n",
481 | "- [LogisticRegression Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "id": "4b95d398-7f4c-47c2-a8d6-a3f271bae949",
488 | "metadata": {},
489 | "outputs": [],
490 | "source": [
491 | "base_parameter_grid = {\n",
492 | " \"vect__max_df\": (0.8, 0.9),\n",
493 | " \"vect__min_df\": (0.01, 0.03),\n",
494 | " \"vect__ngram_range\": ((1, 1), (1, 2)), # unigrams or bigrams\n",
495 | " #\"vect__norm\": (\"l1\", \"l2\"),\n",
496 | "}"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "id": "c814383d-aea9-438b-8795-d97c307cb6f5",
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "svc_parameter_grid = {\n",
507 | " \"clf__C\": (1, 10), # inverse of regularization strength (smaller = more regularization)\n",
508 | " \"clf__kernel\": ('rbf', 'sigmoid', 'poly') \n",
509 | "}\n"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "id": "857aaca3-bdbf-4382-a723-c87fde3238cf",
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "cnb_parameter_grid = {\n",
520 | " \"clf__alpha\": np.logspace(-6, 6, 13), # Additive (Laplace/Lidstone) smoothing parameter \n",
521 | "}"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "id": "3fce5e76-c849-4d51-8687-3aaa5d0f837a",
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "logreg_parameter_grid = {\n",
532 | " \"clf__C\": (1, 10), # inverse of regularization strength (smaller = more regularization)\n",
533 | " \"clf__solver\": (\"lbfgs\", \"liblinear\", \"newton-cholesky\"), \n",
534 | "}"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "id": "5d4859df-26c3-441d-bca4-3d2cc7bb659a",
540 | "metadata": {},
541 | "source": [
542 | "### Start by testing each model separately\n",
543 | "\n",
544 | "You can eventually productionize this with Weights and Biases, or just find the type of model that works best for your data and stick with that, updating only the training dataset over time. \n",
545 | "\n",
546 |     "After you get your first model or two working, you'll likely also decide: oh, I really only want to test SVC, or I like having a fast LR model. Or even: I want to compare these simple models with a deep learning model or a local LLM.\n",
547 | "\n",
548 | "To test each one, change the lines below to reflect your changes:\n",
549 | "\n",
550 | "- use the parameter grid you set up above\n",
551 | "- change the model_name to something you will remember\n",
552 | "- change the estimator to the pipeline that you are evaluating"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "id": "1a6ca6a9-a428-4ed1-9f24-7fc2cc8e49ec",
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "parameter_grid = base_parameter_grid.copy()\n",
563 | "parameter_grid.update(logreg_parameter_grid) #CHANGE HERE: logreg_parameter_grid, cnb_parameter_grid, svc_parameter_grid\n",
564 | "model_name = \"LR\" # CHANGE HERE suggestion: LR, CNB, SVC\n",
565 | "\n",
566 | "random_search = RandomizedSearchCV(\n",
567 | " estimator=logreg_pipeline, # CHANGE HERE: logreg_pipeline, bayes_pipeline, svc_pipeline\n",
568 | " param_distributions=parameter_grid,\n",
569 | " n_iter=20,\n",
570 | " random_state=0,\n",
571 | " n_jobs=4,\n",
572 | " verbose=1,\n",
573 | ")\n",
574 | "\n",
575 | "print(\"Performing grid search...\")\n",
576 | "print(\"Hyperparameters to be evaluated:\")\n",
577 | "pprint(parameter_grid)"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": null,
583 | "id": "36a93f84-bd2c-4ee0-af08-b78a4418c3c7",
584 | "metadata": {},
585 | "outputs": [],
586 | "source": [
587 | "t0 = time()\n",
588 | "random_search.fit(X_train, y_train)\n",
589 | "print(f\"Done in {time() - t0:.3f}s\")"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": null,
595 | "id": "5fe22603-c6d2-4ce6-903a-d07d5f28aa5f",
596 | "metadata": {},
597 | "outputs": [],
598 | "source": [
599 | "print(\"Best parameters combination found:\")\n",
600 | "best_parameters = random_search.best_estimator_.get_params()\n",
601 | "for param_name in sorted(parameter_grid.keys()):\n",
602 | " print(f\"{param_name}: {best_parameters[param_name]}\")"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "id": "a9d892e1-e3e0-4f56-89ee-590f74b39360",
609 | "metadata": {},
610 | "outputs": [],
611 | "source": [
612 | "test_accuracy = random_search.score(X_test, y_test)\n",
613 | "print(f\"Accuracy of the best parameters using CV random search: {random_search.best_score_:.3f}\")\n",
614 | "print(f\"Accuracy on test set: {test_accuracy:.3f}\")"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": null,
620 | "id": "d4eb3062-c050-4a3e-9188-fc489900cdcd",
621 | "metadata": {},
622 | "outputs": [],
623 | "source": [
624 | "y_pred = random_search.predict(X_test)"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": null,
630 | "id": "122f7dce-4564-45af-8607-9e4447222f16",
631 | "metadata": {},
632 | "outputs": [],
633 | "source": [
634 | "human_labels = {0: 'not interesting',\n",
635 | " 1: 'interesting'}"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": null,
641 | "id": "50146191-cdb6-48ae-9af6-1ee33cde685f",
642 | "metadata": {},
643 | "outputs": [],
644 | "source": [
645 | "disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred), display_labels=[human_labels[c] for c in random_search.classes_])\n",
646 | "disp.plot()"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "id": "ffb04625-450f-40f0-9903-8d5412636c1c",
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "experiment_time = datetime.now().strftime(\"%Y%m%d_%H_%M\")\n",
657 | "with open(\"experiments/{}_{}.txt\".format(experiment_time, model_name), 'w') as documentation_file:\n",
658 | " for param_name in sorted(parameter_grid.keys()):\n",
659 |     "        documentation_file.write(f\"{param_name}: {best_parameters[param_name]}\\n\")\n",
660 |     "    documentation_file.write(f\"Accuracy on the random search: {random_search.best_score_:.3f}\\n\")\n",
661 |     "    documentation_file.write(f\"Accuracy on test set: {test_accuracy:.3f}\\n\") "
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": null,
667 | "id": "e9465786-1310-4c06-ad81-db68a871e7a9",
668 | "metadata": {},
669 | "outputs": [],
670 | "source": [
671 | "logreg_pipeline.set_params(**best_parameters) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline"
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": null,
677 | "id": "e7483cb0-871b-4212-934d-212ac7afb6ab",
678 | "metadata": {},
679 | "outputs": [],
680 | "source": [
681 | "joblib.dump(logreg_pipeline, \"experiments/models/{}_{}_pipeline.pkl\".format(experiment_time, model_name)) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline"
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": null,
687 | "id": "3f6eff3d-2c9c-4745-a816-af60b65a695f",
688 | "metadata": {},
689 | "outputs": [],
690 | "source": [
691 | "pipeline = logreg_pipeline # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline"
692 | ]
693 | },
694 | {
695 | "cell_type": "markdown",
696 | "id": "9b00371a-310d-4fe3-b647-e0c0dc178f80",
697 | "metadata": {},
698 | "source": [
699 |     "If you ever want to load a saved pipeline again, you can just:\n"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": null,
705 | "id": "08714b49-dd0b-472f-a306-cb5025c5d7b2",
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "pipeline = joblib.load('experiments/models/20250121_19_46_SVC_pipeline.pkl')"
710 | ]
711 | },
712 | {
713 | "cell_type": "markdown",
714 | "id": "51ff4d4f-6141-4763-909e-ccf416836f33",
715 | "metadata": {},
716 | "source": [
717 | "### Investigating / interpreting your model\n",
718 | "\n",
719 | "So now you have an idea of the accuracy, but will it work for what you want to use it for? \n",
720 | "\n",
721 | "Let's say that it's really good at recognizing exactly your interests based on some silly keywords that you don't think will hold in practice. Or let's say you're also just curious about what keywords might be most interesting to you and want to have a look at the inner workings of your system. Either way, it's a good idea to investigate the model in order to qualitatively compare the models you've trained and determine which model you want to use.\n",
722 | "\n",
723 | "The following parts of the notebook can help you investigate and figure out how you think about the model decisions.\n",
724 | "\n",
725 |     "#### Note: LIME Text Explainer doesn't appear to work for my data with SVC, but that might be different for you! Let me know if it does!"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": null,
731 | "id": "3447d5f4-c393-429b-90a7-c084a67cd742",
732 | "metadata": {},
733 | "outputs": [],
734 | "source": [
735 | "from lime.lime_text import LimeTextExplainer\n",
736 | "\n",
737 | "\n",
738 | "explainer = LimeTextExplainer(class_names=[human_labels[c] for c in pipeline.classes_])"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": null,
744 | "id": "e6f0ffa8-06e2-4523-ad86-335cc1376289",
745 | "metadata": {},
746 | "outputs": [],
747 | "source": [
748 | "sample_df = content_df.groupby(\"interesting\").sample(n=20)"
749 | ]
750 | },
751 | {
752 | "cell_type": "code",
753 | "execution_count": null,
754 | "id": "7c8b4111-1a40-4379-a968-d9e572b9f617",
755 | "metadata": {},
756 | "outputs": [],
757 | "source": [
758 | "pipeline.named_steps"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": null,
764 | "id": "3e6fa799-82d4-4e07-a55c-a0ae249ffada",
765 | "metadata": {},
766 | "outputs": [],
767 | "source": [
768 | "vectorizer = pipeline.named_steps['vect']\n",
769 | "estimator = pipeline.named_steps['clf']"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": null,
775 | "id": "706c5bc4-333f-4042-85b5-49efd602dad0",
776 | "metadata": {},
777 | "outputs": [],
778 | "source": [
779 | "# this is a fix for the SVC problem in LIME (see https://github.com/marcotcr/lime/issues/465)\n",
780 | "def classifier_fn(X):\n",
781 | " vectorized_text_instance = vectorizer.transform(X)\n",
782 | " decision = estimator.decision_function(vectorized_text_instance)\n",
783 | " reshaped_decision = np.array(decision).reshape(-1, 1)\n",
784 | " return reshaped_decision"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": null,
790 | "id": "ce7c92fd-a7e8-41c1-bfc6-637e6ab47684",
791 | "metadata": {},
792 | "outputs": [],
793 | "source": [
794 | "for example in sample_df.cleaned_text: \n",
795 | " try:\n",
796 | " if hasattr(pipeline, 'predict_proba'):\n",
797 | " exp = explainer.explain_instance(example, pipeline.predict_proba, labels=pipeline.classes_) \n",
798 | " elif \"SVC\" in str(estimator): # this is hacky :(\n",
799 | " exp = explainer.explain_instance(text_instance=example, classifier_fn=classifier_fn, labels=(0,))\n",
800 | " exp.show_in_notebook()\n",
801 | " except Exception as e:\n",
802 | " print(e)\n",
803 | " print('problem with this example')"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "id": "2fc8267f-59c0-496c-b8cc-3d6f5c5c383d",
810 | "metadata": {
811 | "scrolled": true
812 | },
813 | "outputs": [],
814 | "source": [
815 | "from sklearn.inspection import permutation_importance\n",
816 | "\n",
817 | "if hasattr(estimator, 'feature_log_prob_'): # bayesian\n",
818 | " neg_class_prob_sorted = estimator.feature_log_prob_[0, :].argsort()[::-1]\n",
819 | " pos_class_prob_sorted = estimator.feature_log_prob_[1, :].argsort()[::-1]\n",
820 | "elif hasattr(estimator, 'coef_'): # logreg\n",
821 | " pos_class_prob_sorted = estimator.coef_[0, :].argsort()[::-1]\n",
822 | " neg_class_prob_sorted = estimator.coef_[0, :].argsort()\n",
823 | "elif hasattr(estimator, 'kernel'): # svm\n",
824 | " X = vectorizer.transform(X_train).toarray() # this is inefficient and it might run out of memory or timeout :(\n",
825 | " # if this happens restart kernel and don't rerun \n",
826 | " perm_importance = permutation_importance(estimator, X, y_train)\n",
827 | " pos_class_prob_sorted = perm_importance.importances_mean.argsort()\n",
828 | " neg_class_prob_sorted = perm_importance.importances_mean.argsort()[::-1]\n",
829 | "\n",
830 | "\n",
831 | "feature_names = vectorizer.get_feature_names_out()\n",
832 | "\n",
833 | "print(np.take(feature_names, neg_class_prob_sorted[:100]))\n",
834 | "print(np.take(feature_names, pos_class_prob_sorted[:100]))\n"
835 | ]
836 | },
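837 | {
838 | "cell_type": "markdown",
839 | "id": "c3d4e5f6-2b3c-4d5e-8f6a-7b8c9d0e1f2a",
840 | "metadata": {},
841 | "source": [
842 | "Optional: the same rankings side by side as a small table, reusing `feature_names` and the two sorted index arrays from the cell above (a convenience sketch, not part of the original workflow)."
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": null,
848 | "id": "d4e5f6a7-3c4d-4e5f-9a7b-8c9d0e1f2a3b",
849 | "metadata": {},
850 | "outputs": [],
851 | "source": [
852 | "import pandas as pd\n",
853 | "\n",
854 | "# top 20 tokens per direction; for the kernel-SVM branch both columns reflect the same unsigned ranking\n",
855 | "pd.DataFrame({\n",
856 | "    \"negative\": np.take(feature_names, neg_class_prob_sorted[:20]),\n",
857 | "    \"positive\": np.take(feature_names, pos_class_prob_sorted[:20]),\n",
858 | "})"
859 | ]
860 | },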
837 | {
838 | "cell_type": "code",
839 | "execution_count": null,
840 | "id": "d35c814d-2212-4987-ae82-a5cf6d5a815c",
841 | "metadata": {},
842 | "outputs": [],
843 | "source": [
844 | "def find_word_rank(query):\n",
845 | " i, = np.where(feature_names == query)\n",
846 | " try:\n",
847 | " pos_i = np.where(pos_class_prob_sorted == i)\n",
848 | " neg_i = np.where(neg_class_prob_sorted == i)\n",
849 | " if pos_i < neg_i:\n",
850 | " print(\"ranked in positive score at position #{} out of {}\".format(pos_i[0][0], pos_class_prob_sorted.shape[0]))\n",
851 | " else:\n",
852 | " print(\"ranked in negative score at position #{} out of {}\".format(neg_i[0][0], neg_class_prob_sorted.shape[0]))\n",
853 | " except ValueError:\n",
854 | " print('token not found')\n"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": null,
860 | "id": "77eea602-ed31-4031-a3cb-11c37961b699",
861 | "metadata": {},
862 | "outputs": [],
863 | "source": [
864 | "find_word_rank(\"crypto\")"
865 | ]
866 | },
867 | {
868 | "cell_type": "code",
869 | "execution_count": null,
870 | "id": "fa3c69a0-60f4-43cd-8de6-cb6a4be4c037",
871 | "metadata": {},
872 | "outputs": [],
873 | "source": [
874 | "find_word_rank(\"cryptography\")"
875 | ]
876 | },
877 | {
878 | "cell_type": "markdown",
879 | "id": "c8ee4a34-3cb1-4de2-8ac4-0d0368ac6d41",
880 | "metadata": {},
881 | "source": [
882 | "### If this is the main one you want to use, store it as pipeline.pkl and upload it to your server :)"
883 | ]
884 | },
885 | {
886 | "cell_type": "code",
887 | "execution_count": null,
888 | "id": "650d779a-04bf-4da5-a41e-e4f4b1dc66a9",
889 | "metadata": {},
890 | "outputs": [],
891 | "source": [
892 | "joblib.dump(pipeline, \"pipeline.pkl\")"
893 | ]
894 | },
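895 | {
896 | "cell_type": "markdown",
897 | "id": "e5f6a7b8-4d5e-4f6a-8b9c-9d0e1f2a3b4c",
898 | "metadata": {},
899 | "source": [
900 | "Optional sanity check, as a sketch: reload the pickle and predict on a few of the sampled rows before uploading. joblib pickles are generally only safe to load with the same scikit-learn version that wrote them, so match versions on your server."
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": null,
906 | "id": "f6a7b8c9-5e6f-4a7b-9c0d-0e1f2a3b4c5d",
907 | "metadata": {},
908 | "outputs": [],
909 | "source": [
910 | "# round-trip the artifact to make sure it still predicts on raw text\n",
911 | "reloaded = joblib.load(\"pipeline.pkl\")\n",
912 | "reloaded.predict(sample_df.cleaned_text[:5])"
913 | ]
914 | },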
895 | {
896 | "cell_type": "code",
897 | "execution_count": null,
898 | "id": "ca277943-8f82-493b-801c-0a2bedc38d4c",
899 | "metadata": {},
900 | "outputs": [],
901 | "source": []
902 | }
903 | ],
904 | "metadata": {
905 | "kernelspec": {
906 | "display_name": "Python 3 (ipykernel)",
907 | "language": "python",
908 | "name": "python3"
909 | },
910 | "language_info": {
911 | "codemirror_mode": {
912 | "name": "ipython",
913 | "version": 3
914 | },
915 | "file_extension": ".py",
916 | "mimetype": "text/x-python",
917 | "name": "python",
918 | "nbconvert_exporter": "python",
919 | "pygments_lexer": "ipython3",
920 | "version": "3.12.4"
921 | }
922 | },
923 | "nbformat": 4,
924 | "nbformat_minor": 5
925 | }
926 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # GNU GENERAL PUBLIC LICENSE
2 |
3 | Version 3, 29 June 2007
4 |
5 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
6 |
7 |
8 | Everyone is permitted to copy and distribute verbatim copies of this
9 | license document, but changing it is not allowed.
10 |
11 | ## Preamble
12 |
13 | The GNU General Public License is a free, copyleft license for
14 | software and other kinds of works.
15 |
16 | The licenses for most software and other practical works are designed
17 | to take away your freedom to share and change the works. By contrast,
18 | the GNU General Public License is intended to guarantee your freedom
19 | to share and change all versions of a program--to make sure it remains
20 | free software for all its users. We, the Free Software Foundation, use
21 | the GNU General Public License for most of our software; it applies
22 | also to any other work released this way by its authors. You can apply
23 | it to your programs, too.
24 |
25 | When we speak of free software, we are referring to freedom, not
26 | price. Our General Public Licenses are designed to make sure that you
27 | have the freedom to distribute copies of free software (and charge for
28 | them if you wish), that you receive source code or can get it if you
29 | want it, that you can change the software or use pieces of it in new
30 | free programs, and that you know you can do these things.
31 |
32 | To protect your rights, we need to prevent others from denying you
33 | these rights or asking you to surrender the rights. Therefore, you
34 | have certain responsibilities if you distribute copies of the
35 | software, or if you modify it: responsibilities to respect the freedom
36 | of others.
37 |
38 | For example, if you distribute copies of such a program, whether
39 | gratis or for a fee, you must pass on to the recipients the same
40 | freedoms that you received. You must make sure that they, too, receive
41 | or can get the source code. And you must show them these terms so they
42 | know their rights.
43 |
44 | Developers that use the GNU GPL protect your rights with two steps:
45 | (1) assert copyright on the software, and (2) offer you this License
46 | giving you legal permission to copy, distribute and/or modify it.
47 |
48 | For the developers' and authors' protection, the GPL clearly explains
49 | that there is no warranty for this free software. For both users' and
50 | authors' sake, the GPL requires that modified versions be marked as
51 | changed, so that their problems will not be attributed erroneously to
52 | authors of previous versions.
53 |
54 | Some devices are designed to deny users access to install or run
55 | modified versions of the software inside them, although the
56 | manufacturer can do so. This is fundamentally incompatible with the
57 | aim of protecting users' freedom to change the software. The
58 | systematic pattern of such abuse occurs in the area of products for
59 | individuals to use, which is precisely where it is most unacceptable.
60 | Therefore, we have designed this version of the GPL to prohibit the
61 | practice for those products. If such problems arise substantially in
62 | other domains, we stand ready to extend this provision to those
63 | domains in future versions of the GPL, as needed to protect the
64 | freedom of users.
65 |
66 | Finally, every program is threatened constantly by software patents.
67 | States should not allow patents to restrict development and use of
68 | software on general-purpose computers, but in those that do, we wish
69 | to avoid the special danger that patents applied to a free program
70 | could make it effectively proprietary. To prevent this, the GPL
71 | assures that patents cannot be used to render the program non-free.
72 |
73 | The precise terms and conditions for copying, distribution and
74 | modification follow.
75 |
76 | ## TERMS AND CONDITIONS
77 |
78 | ### 0. Definitions.
79 |
80 | "This License" refers to version 3 of the GNU General Public License.
81 |
82 | "Copyright" also means copyright-like laws that apply to other kinds
83 | of works, such as semiconductor masks.
84 |
85 | "The Program" refers to any copyrightable work licensed under this
86 | License. Each licensee is addressed as "you". "Licensees" and
87 | "recipients" may be individuals or organizations.
88 |
89 | To "modify" a work means to copy from or adapt all or part of the work
90 | in a fashion requiring copyright permission, other than the making of
91 | an exact copy. The resulting work is called a "modified version" of
92 | the earlier work or a work "based on" the earlier work.
93 |
94 | A "covered work" means either the unmodified Program or a work based
95 | on the Program.
96 |
97 | To "propagate" a work means to do anything with it that, without
98 | permission, would make you directly or secondarily liable for
99 | infringement under applicable copyright law, except executing it on a
100 | computer or modifying a private copy. Propagation includes copying,
101 | distribution (with or without modification), making available to the
102 | public, and in some countries other activities as well.
103 |
104 | To "convey" a work means any kind of propagation that enables other
105 | parties to make or receive copies. Mere interaction with a user
106 | through a computer network, with no transfer of a copy, is not
107 | conveying.
108 |
109 | An interactive user interface displays "Appropriate Legal Notices" to
110 | the extent that it includes a convenient and prominently visible
111 | feature that (1) displays an appropriate copyright notice, and (2)
112 | tells the user that there is no warranty for the work (except to the
113 | extent that warranties are provided), that licensees may convey the
114 | work under this License, and how to view a copy of this License. If
115 | the interface presents a list of user commands or options, such as a
116 | menu, a prominent item in the list meets this criterion.
117 |
118 | ### 1. Source Code.
119 |
120 | The "source code" for a work means the preferred form of the work for
121 | making modifications to it. "Object code" means any non-source form of
122 | a work.
123 |
124 | A "Standard Interface" means an interface that either is an official
125 | standard defined by a recognized standards body, or, in the case of
126 | interfaces specified for a particular programming language, one that
127 | is widely used among developers working in that language.
128 |
129 | The "System Libraries" of an executable work include anything, other
130 | than the work as a whole, that (a) is included in the normal form of
131 | packaging a Major Component, but which is not part of that Major
132 | Component, and (b) serves only to enable use of the work with that
133 | Major Component, or to implement a Standard Interface for which an
134 | implementation is available to the public in source code form. A
135 | "Major Component", in this context, means a major essential component
136 | (kernel, window system, and so on) of the specific operating system
137 | (if any) on which the executable work runs, or a compiler used to
138 | produce the work, or an object code interpreter used to run it.
139 |
140 | The "Corresponding Source" for a work in object code form means all
141 | the source code needed to generate, install, and (for an executable
142 | work) run the object code and to modify the work, including scripts to
143 | control those activities. However, it does not include the work's
144 | System Libraries, or general-purpose tools or generally available free
145 | programs which are used unmodified in performing those activities but
146 | which are not part of the work. For example, Corresponding Source
147 | includes interface definition files associated with source files for
148 | the work, and the source code for shared libraries and dynamically
149 | linked subprograms that the work is specifically designed to require,
150 | such as by intimate data communication or control flow between those
151 | subprograms and other parts of the work.
152 |
153 | The Corresponding Source need not include anything that users can
154 | regenerate automatically from other parts of the Corresponding Source.
155 |
156 | The Corresponding Source for a work in source code form is that same
157 | work.
158 |
159 | ### 2. Basic Permissions.
160 |
161 | All rights granted under this License are granted for the term of
162 | copyright on the Program, and are irrevocable provided the stated
163 | conditions are met. This License explicitly affirms your unlimited
164 | permission to run the unmodified Program. The output from running a
165 | covered work is covered by this License only if the output, given its
166 | content, constitutes a covered work. This License acknowledges your
167 | rights of fair use or other equivalent, as provided by copyright law.
168 |
169 | You may make, run and propagate covered works that you do not convey,
170 | without conditions so long as your license otherwise remains in force.
171 | You may convey covered works to others for the sole purpose of having
172 | them make modifications exclusively for you, or provide you with
173 | facilities for running those works, provided that you comply with the
174 | terms of this License in conveying all material for which you do not
175 | control copyright. Those thus making or running the covered works for
176 | you must do so exclusively on your behalf, under your direction and
177 | control, on terms that prohibit them from making any copies of your
178 | copyrighted material outside their relationship with you.
179 |
180 | Conveying under any other circumstances is permitted solely under the
181 | conditions stated below. Sublicensing is not allowed; section 10 makes
182 | it unnecessary.
183 |
184 | ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
185 |
186 | No covered work shall be deemed part of an effective technological
187 | measure under any applicable law fulfilling obligations under article
188 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
189 | similar laws prohibiting or restricting circumvention of such
190 | measures.
191 |
192 | When you convey a covered work, you waive any legal power to forbid
193 | circumvention of technological measures to the extent such
194 | circumvention is effected by exercising rights under this License with
195 | respect to the covered work, and you disclaim any intention to limit
196 | operation or modification of the work as a means of enforcing, against
197 | the work's users, your or third parties' legal rights to forbid
198 | circumvention of technological measures.
199 |
200 | ### 4. Conveying Verbatim Copies.
201 |
202 | You may convey verbatim copies of the Program's source code as you
203 | receive it, in any medium, provided that you conspicuously and
204 | appropriately publish on each copy an appropriate copyright notice;
205 | keep intact all notices stating that this License and any
206 | non-permissive terms added in accord with section 7 apply to the code;
207 | keep intact all notices of the absence of any warranty; and give all
208 | recipients a copy of this License along with the Program.
209 |
210 | You may charge any price or no price for each copy that you convey,
211 | and you may offer support or warranty protection for a fee.
212 |
213 | ### 5. Conveying Modified Source Versions.
214 |
215 | You may convey a work based on the Program, or the modifications to
216 | produce it from the Program, in the form of source code under the
217 | terms of section 4, provided that you also meet all of these
218 | conditions:
219 |
220 | - a) The work must carry prominent notices stating that you modified
221 | it, and giving a relevant date.
222 | - b) The work must carry prominent notices stating that it is
223 | released under this License and any conditions added under
224 | section 7. This requirement modifies the requirement in section 4
225 | to "keep intact all notices".
226 | - c) You must license the entire work, as a whole, under this
227 | License to anyone who comes into possession of a copy. This
228 | License will therefore apply, along with any applicable section 7
229 | additional terms, to the whole of the work, and all its parts,
230 | regardless of how they are packaged. This License gives no
231 | permission to license the work in any other way, but it does not
232 | invalidate such permission if you have separately received it.
233 | - d) If the work has interactive user interfaces, each must display
234 | Appropriate Legal Notices; however, if the Program has interactive
235 | interfaces that do not display Appropriate Legal Notices, your
236 | work need not make them do so.
237 |
238 | A compilation of a covered work with other separate and independent
239 | works, which are not by their nature extensions of the covered work,
240 | and which are not combined with it such as to form a larger program,
241 | in or on a volume of a storage or distribution medium, is called an
242 | "aggregate" if the compilation and its resulting copyright are not
243 | used to limit the access or legal rights of the compilation's users
244 | beyond what the individual works permit. Inclusion of a covered work
245 | in an aggregate does not cause this License to apply to the other
246 | parts of the aggregate.
247 |
248 | ### 6. Conveying Non-Source Forms.
249 |
250 | You may convey a covered work in object code form under the terms of
251 | sections 4 and 5, provided that you also convey the machine-readable
252 | Corresponding Source under the terms of this License, in one of these
253 | ways:
254 |
255 | - a) Convey the object code in, or embodied in, a physical product
256 | (including a physical distribution medium), accompanied by the
257 | Corresponding Source fixed on a durable physical medium
258 | customarily used for software interchange.
259 | - b) Convey the object code in, or embodied in, a physical product
260 | (including a physical distribution medium), accompanied by a
261 | written offer, valid for at least three years and valid for as
262 | long as you offer spare parts or customer support for that product
263 | model, to give anyone who possesses the object code either (1) a
264 | copy of the Corresponding Source for all the software in the
265 | product that is covered by this License, on a durable physical
266 | medium customarily used for software interchange, for a price no
267 | more than your reasonable cost of physically performing this
268 | conveying of source, or (2) access to copy the Corresponding
269 | Source from a network server at no charge.
270 | - c) Convey individual copies of the object code with a copy of the
271 | written offer to provide the Corresponding Source. This
272 | alternative is allowed only occasionally and noncommercially, and
273 | only if you received the object code with such an offer, in accord
274 | with subsection 6b.
275 | - d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 | - e) Convey the object code using peer-to-peer transmission,
288 | provided you inform other peers where the object code and
289 | Corresponding Source of the work are being offered to the general
290 | public at no charge under subsection 6d.
291 |
292 | A separable portion of the object code, whose source code is excluded
293 | from the Corresponding Source as a System Library, need not be
294 | included in conveying the object code work.
295 |
296 | A "User Product" is either (1) a "consumer product", which means any
297 | tangible personal property which is normally used for personal,
298 | family, or household purposes, or (2) anything designed or sold for
299 | incorporation into a dwelling. In determining whether a product is a
300 | consumer product, doubtful cases shall be resolved in favor of
301 | coverage. For a particular product received by a particular user,
302 | "normally used" refers to a typical or common use of that class of
303 | product, regardless of the status of the particular user or of the way
304 | in which the particular user actually uses, or expects or is expected
305 | to use, the product. A product is a consumer product regardless of
306 | whether the product has substantial commercial, industrial or
307 | non-consumer uses, unless such uses represent the only significant
308 | mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to
312 | install and execute modified versions of a covered work in that User
313 | Product from a modified version of its Corresponding Source. The
314 | information must suffice to ensure that the continued functioning of
315 | the modified object code is in no case prevented or interfered with
316 | solely because modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or
331 | updates for a work that has been modified or installed by the
332 | recipient, or for the User Product in which it has been modified or
333 | installed. Access to a network may be denied when the modification
334 | itself materially and adversely affects the operation of the network
335 | or violates the rules and protocols for communication across the
336 | network.
337 |
338 | Corresponding Source conveyed, and Installation Information provided,
339 | in accord with this section must be in a format that is publicly
340 | documented (and with an implementation available to the public in
341 | source code form), and must require no special password or key for
342 | unpacking, reading or copying.
343 |
344 | ### 7. Additional Terms.
345 |
346 | "Additional permissions" are terms that supplement the terms of this
347 | License by making exceptions from one or more of its conditions.
348 | Additional permissions that are applicable to the entire Program shall
349 | be treated as though they were included in this License, to the extent
350 | that they are valid under applicable law. If additional permissions
351 | apply only to part of the Program, that part may be used separately
352 | under those permissions, but the entire Program remains governed by
353 | this License without regard to the additional permissions.
354 |
355 | When you convey a copy of a covered work, you may at your option
356 | remove any additional permissions from that copy, or from any part of
357 | it. (Additional permissions may be written to require their own
358 | removal in certain cases when you modify the work.) You may place
359 | additional permissions on material, added by you to a covered work,
360 | for which you have or can give appropriate copyright permission.
361 |
362 | Notwithstanding any other provision of this License, for material you
363 | add to a covered work, you may (if authorized by the copyright holders
364 | of that material) supplement the terms of this License with terms:
365 |
366 | - a) Disclaiming warranty or limiting liability differently from the
367 | terms of sections 15 and 16 of this License; or
368 | - b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 | - c) Prohibiting misrepresentation of the origin of that material,
372 | or requiring that modified versions of such material be marked in
373 | reasonable ways as different from the original version; or
374 | - d) Limiting the use for publicity purposes of names of licensors
375 | or authors of the material; or
376 | - e) Declining to grant rights under trademark law for use of some
377 | trade names, trademarks, or service marks; or
378 | - f) Requiring indemnification of licensors and authors of that
379 | material by anyone who conveys the material (or modified versions
380 | of it) with contractual assumptions of liability to the recipient,
381 | for any liability that these contractual assumptions directly
382 | impose on those licensors and authors.
383 |
384 | All other non-permissive additional terms are considered "further
385 | restrictions" within the meaning of section 10. If the Program as you
386 | received it, or any part of it, contains a notice stating that it is
387 | governed by this License along with a term that is a further
388 | restriction, you may remove that term. If a license document contains
389 | a further restriction but permits relicensing or conveying under this
390 | License, you may add to a covered work material governed by the terms
391 | of that license document, provided that the further restriction does
392 | not survive such relicensing or conveying.
393 |
394 | If you add terms to a covered work in accord with this section, you
395 | must place, in the relevant source files, a statement of the
396 | additional terms that apply to those files, or a notice indicating
397 | where to find the applicable terms.
398 |
399 | Additional terms, permissive or non-permissive, may be stated in the
400 | form of a separately written license, or stated as exceptions; the
401 | above requirements apply either way.
402 |
403 | ### 8. Termination.
404 |
405 | You may not propagate or modify a covered work except as expressly
406 | provided under this License. Any attempt otherwise to propagate or
407 | modify it is void, and will automatically terminate your rights under
408 | this License (including any patent licenses granted under the third
409 | paragraph of section 11).
410 |
411 | However, if you cease all violation of this License, then your license
412 | from a particular copyright holder is reinstated (a) provisionally,
413 | unless and until the copyright holder explicitly and finally
414 | terminates your license, and (b) permanently, if the copyright holder
415 | fails to notify you of the violation by some reasonable means prior to
416 | 60 days after the cessation.
417 |
418 | Moreover, your license from a particular copyright holder is
419 | reinstated permanently if the copyright holder notifies you of the
420 | violation by some reasonable means, this is the first time you have
421 | received notice of violation of this License (for any work) from that
422 | copyright holder, and you cure the violation prior to 30 days after
423 | your receipt of the notice.
424 |
425 | Termination of your rights under this section does not terminate the
426 | licenses of parties who have received copies or rights from you under
427 | this License. If your rights have been terminated and not permanently
428 | reinstated, you do not qualify to receive new licenses for the same
429 | material under section 10.
430 |
431 | ### 9. Acceptance Not Required for Having Copies.
432 |
433 | You are not required to accept this License in order to receive or run
434 | a copy of the Program. Ancillary propagation of a covered work
435 | occurring solely as a consequence of using peer-to-peer transmission
436 | to receive a copy likewise does not require acceptance. However,
437 | nothing other than this License grants you permission to propagate or
438 | modify any covered work. These actions infringe copyright if you do
439 | not accept this License. Therefore, by modifying or propagating a
440 | covered work, you indicate your acceptance of this License to do so.
441 |
442 | ### 10. Automatic Licensing of Downstream Recipients.
443 |
444 | Each time you convey a covered work, the recipient automatically
445 | receives a license from the original licensors, to run, modify and
446 | propagate that work, subject to this License. You are not responsible
447 | for enforcing compliance by third parties with this License.
448 |
449 | An "entity transaction" is a transaction transferring control of an
450 | organization, or substantially all assets of one, or subdividing an
451 | organization, or merging organizations. If propagation of a covered
452 | work results from an entity transaction, each party to that
453 | transaction who receives a copy of the work also receives whatever
454 | licenses to the work the party's predecessor in interest had or could
455 | give under the previous paragraph, plus a right to possession of the
456 | Corresponding Source of the work from the predecessor in interest, if
457 | the predecessor has it or can get it with reasonable efforts.
458 |
459 | You may not impose any further restrictions on the exercise of the
460 | rights granted or affirmed under this License. For example, you may
461 | not impose a license fee, royalty, or other charge for exercise of
462 | rights granted under this License, and you may not initiate litigation
463 | (including a cross-claim or counterclaim in a lawsuit) alleging that
464 | any patent claim is infringed by making, using, selling, offering for
465 | sale, or importing the Program or any portion of it.
466 |
467 | ### 11. Patents.
468 |
469 | A "contributor" is a copyright holder who authorizes use under this
470 | License of the Program or a work on which the Program is based. The
471 | work thus licensed is called the contributor's "contributor version".
472 |
473 | A contributor's "essential patent claims" are all patent claims owned
474 | or controlled by the contributor, whether already acquired or
475 | hereafter acquired, that would be infringed by some manner, permitted
476 | by this License, of making, using, or selling its contributor version,
477 | but do not include claims that would be infringed only as a
478 | consequence of further modification of the contributor version. For
479 | purposes of this definition, "control" includes the right to grant
480 | patent sublicenses in a manner consistent with the requirements of
481 | this License.
482 |
483 | Each contributor grants you a non-exclusive, worldwide, royalty-free
484 | patent license under the contributor's essential patent claims, to
485 | make, use, sell, offer for sale, import and otherwise run, modify and
486 | propagate the contents of its contributor version.
487 |
488 | In the following three paragraphs, a "patent license" is any express
489 | agreement or commitment, however denominated, not to enforce a patent
490 | (such as an express permission to practice a patent or covenant not to
491 | sue for patent infringement). To "grant" such a patent license to a
492 | party means to make such an agreement or commitment not to enforce a
493 | patent against the party.
494 |
495 | If you convey a covered work, knowingly relying on a patent license,
496 | and the Corresponding Source of the work is not available for anyone
497 | to copy, free of charge and under the terms of this License, through a
498 | publicly available network server or other readily accessible means,
499 | then you must either (1) cause the Corresponding Source to be so
500 | available, or (2) arrange to deprive yourself of the benefit of the
501 | patent license for this particular work, or (3) arrange, in a manner
502 | consistent with the requirements of this License, to extend the patent
503 | license to downstream recipients. "Knowingly relying" means you have
504 | actual knowledge that, but for the patent license, your conveying the
505 | covered work in a country, or your recipient's use of the covered work
506 | in a country, would infringe one or more identifiable patents in that
507 | country that you have reason to believe are valid.
508 |
509 | If, pursuant to or in connection with a single transaction or
510 | arrangement, you convey, or propagate by procuring conveyance of, a
511 | covered work, and grant a patent license to some of the parties
512 | receiving the covered work authorizing them to use, propagate, modify
513 | or convey a specific copy of the covered work, then the patent license
514 | you grant is automatically extended to all recipients of the covered
515 | work and works based on it.
516 |
517 | A patent license is "discriminatory" if it does not include within the
518 | scope of its coverage, prohibits the exercise of, or is conditioned on
519 | the non-exercise of one or more of the rights that are specifically
520 | granted under this License. You may not convey a covered work if you
521 | are a party to an arrangement with a third party that is in the
522 | business of distributing software, under which you make payment to the
523 | third party based on the extent of your activity of conveying the
524 | work, and under which the third party grants, to any of the parties
525 | who would receive the covered work from you, a discriminatory patent
526 | license (a) in connection with copies of the covered work conveyed by
527 | you (or copies made from those copies), or (b) primarily for and in
528 | connection with specific products or compilations that contain the
529 | covered work, unless you entered into that arrangement, or that patent
530 | license was granted, prior to 28 March 2007.
531 |
532 | Nothing in this License shall be construed as excluding or limiting
533 | any implied license or other defenses to infringement that may
534 | otherwise be available to you under applicable patent law.
535 |
536 | ### 12. No Surrender of Others' Freedom.
537 |
538 | If conditions are imposed on you (whether by court order, agreement or
539 | otherwise) that contradict the conditions of this License, they do not
540 | excuse you from the conditions of this License. If you cannot convey a
541 | covered work so as to satisfy simultaneously your obligations under
542 | this License and any other pertinent obligations, then as a
543 | consequence you may not convey it at all. For example, if you agree to
544 | terms that obligate you to collect a royalty for further conveying
545 | from those to whom you convey the Program, the only way you could
546 | satisfy both those terms and this License would be to refrain entirely
547 | from conveying the Program.
548 |
549 | ### 13. Use with the GNU Affero General Public License.
550 |
551 | Notwithstanding any other provision of this License, you have
552 | permission to link or combine any covered work with a work licensed
553 | under version 3 of the GNU Affero General Public License into a single
554 | combined work, and to convey the resulting work. The terms of this
555 | License will continue to apply to the part which is the covered work,
556 | but the special requirements of the GNU Affero General Public License,
557 | section 13, concerning interaction through a network will apply to the
558 | combination as such.
559 |
560 | ### 14. Revised Versions of this License.
561 |
562 | The Free Software Foundation may publish revised and/or new versions
563 | of the GNU General Public License from time to time. Such new versions
564 | will be similar in spirit to the present version, but may differ in
565 | detail to address new problems or concerns.
566 |
567 | Each version is given a distinguishing version number. If the Program
568 | specifies that a certain numbered version of the GNU General Public
569 | License "or any later version" applies to it, you have the option of
570 | following the terms and conditions either of that numbered version or
571 | of any later version published by the Free Software Foundation. If the
572 | Program does not specify a version number of the GNU General Public
573 | License, you may choose any version ever published by the Free
574 | Software Foundation.
575 |
576 | If the Program specifies that a proxy can decide which future versions
577 | of the GNU General Public License can be used, that proxy's public
578 | statement of acceptance of a version permanently authorizes you to
579 | choose that version for the Program.
580 |
581 | Later license versions may give you additional or different
582 | permissions. However, no additional obligations are imposed on any
583 | author or copyright holder as a result of your choosing to follow a
584 | later version.
585 |
586 | ### 15. Disclaimer of Warranty.
587 |
588 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
589 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
590 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT
591 | WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT
592 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
593 | A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND
594 | PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
595 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
596 | CORRECTION.
597 |
598 | ### 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR
602 | CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
603 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES
604 | ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT
605 | NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR
606 | LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM
607 | TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER
608 | PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
609 |
610 | ### 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | ## How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these
626 | terms.
627 |
628 | To do so, attach the following notices to the program. It is safest to
629 | attach them to the start of each source file to most effectively state
630 | the exclusion of warranty; and each file should have at least the
631 | "copyright" line and a pointer to where the full notice is found.
632 |
633 | <one line to give the program's name and a brief idea of what it does.>
634 | Copyright (C) <year>  <name of author>
635 |
636 | This program is free software: you can redistribute it and/or modify
637 | it under the terms of the GNU General Public License as published by
638 | the Free Software Foundation, either version 3 of the License, or
639 | (at your option) any later version.
640 |
641 | This program is distributed in the hope that it will be useful,
642 | but WITHOUT ANY WARRANTY; without even the implied warranty of
643 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
644 | GNU General Public License for more details.
645 |
646 | You should have received a copy of the GNU General Public License
647 | along with this program. If not, see <https://www.gnu.org/licenses/>.
648 |
649 | Also add information on how to contact you by electronic and paper
650 | mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands \`show w' and \`show c' should show the
661 | appropriate parts of the General Public License. Of course, your
662 | program's commands might be different; for a GUI interface, you would
663 | use an "about box".
664 |
665 | You should also get your employer (if you work as a programmer) or
666 | school, if any, to sign a "copyright disclaimer" for the program, if
667 | necessary. For more information on this, and how to apply and follow
668 | the GNU GPL, see <https://www.gnu.org/licenses/>.
669 |
670 | The GNU General Public License does not permit incorporating your
671 | program into proprietary programs. If your program is a subroutine
672 | library, you may consider it more useful to permit linking proprietary
673 | applications with the library. If this is what you want to do, use the
674 | GNU Lesser General Public License instead of this License. But first,
675 | please read <https://www.gnu.org/licenses/why-not-lgpl.html>.
676 |
--------------------------------------------------------------------------------