├── feeds ├── __init__.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── parse_all.py │ │ └── rate_all.py ├── apps.py ├── admin.py ├── models.py ├── management_commands.py ├── views.py └── tests.py ├── sites ├── __init__.py ├── admin.py ├── models.py ├── views.py └── management_commands.py ├── tweets ├── __init__.py ├── admin.py ├── models.py ├── views.py └── management_commands.py ├── priveedly ├── __init__.py ├── asgi.py ├── wsgi.py ├── urls.py ├── settings.py └── prod_settings.py ├── notebooks ├── .gitignore └── Training and Testing Simple Recommendation Classifiers.ipynb ├── deployment ├── emperor.ini ├── priveedly.ini ├── backup.sh ├── nginx.conf ├── priveedly_playbook.yml └── example_initial_ansible.yml ├── .gitignore ├── templates ├── registration │ └── login.html ├── base.html ├── entry_detail.html └── entry_list.html ├── manage.py ├── example_env ├── base.py ├── static ├── custom.js ├── js.cookie.min.js └── simple.css ├── requirements.txt ├── README.md └── LICENSE.md /feeds/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sites/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tweets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /priveedly/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /feeds/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/feeds/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | experiments/*.txt 3 | experiments/models/*.pkl 4 | *.pkl 5 | -------------------------------------------------------------------------------- /deployment/emperor.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | emperor = /etc/uwsgi-emperor/vassals 3 | uid = www-data 4 | gid = www-data 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | db.sqlite3 3 | migrations/ 4 | .DS_Store 5 | social_search.py 6 | .env 7 | .prod_env 8 | *.sql 9 | .ipynb_* 10 | .my_pgpass 11 | .pg_service.conf 12 | *.pkl 13 | -------------------------------------------------------------------------------- /feeds/apps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.apps import AppConfig 5 | 6 | 7 | class FeedsConfig(AppConfig): 8 | name = 'feeds' 9 | -------------------------------------------------------------------------------- /sites/admin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.contrib import admin 5 | from sites.models import Subreddit 6 | 7 | admin.site.register(Subreddit) 8 | -------------------------------------------------------------------------------- /tweets/admin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import 
unicode_literals 3 | 4 | from django.contrib import admin 5 | from tweets.models import TwitterList 6 | 7 | admin.site.register(TwitterList) 8 | -------------------------------------------------------------------------------- /templates/registration/login.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |

Log In

5 |
6 | {% csrf_token %} 7 | {{ form }} 8 | 9 |
10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /feeds/admin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.contrib import admin 5 | from feeds.models import FeedCategory, Feed, FeedEntry 6 | 7 | class FeedAdmin(admin.ModelAdmin): 8 | list_filter = ['is_alive'] 9 | 10 | admin.site.register(FeedCategory) 11 | admin.site.register(Feed, FeedAdmin) 12 | admin.site.register(FeedEntry) 13 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | {% block content %} 12 | {% endblock %} 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /priveedly/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for priveedly project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'priveedly.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /priveedly/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for priveedly project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'priveedly.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /deployment/priveedly.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | gevent=100 3 | project = priveedly 4 | base = /var/www 5 | 6 | chdir = %(base)/%(project) 7 | virtualenv = %(base)/venv/%(project) 8 | module = %(project).wsgi:application 9 | plugins = python3,logfile 10 | uid = www-data 11 | gid = www-data 12 | 13 | master = true 14 | processes = 1 15 | enable-threads = true 16 | 17 | http-socket = :9090 18 | chown-socket = %(uid):www-data 19 | chmod-socket = 664 20 | vacuum = true 21 | 22 | logto = /var/log/uwsgi/emperor_priveedly.log 23 | -------------------------------------------------------------------------------- /deployment/backup.sh: -------------------------------------------------------------------------------- 1 | while [[ $# -gt 1 ]] 2 | do 3 | key="$1" 4 | 5 | case $key in 6 | -t|--type) 7 | BACKUP_TYPE="$2" 8 | shift # past argument 9 | ;; 10 | esac 11 | shift # past argument or value 12 | done 13 | 14 | source /home/YOUR_USER/.profile 15 | 16 | if [[ -n $BACKUP_TYPE ]] 17 | then 18 | PGPASSWORD=CHANGE_TO_YOUR_PASSWORD pg_dump -U CHANGE_TO_YOUR_USER -p 5433 DB_NAME > /tmp/priveedly.sql 19 | /usr/bin/aws s3 cp /tmp/priveedly.sql s3://YOUR_BACKUP_LOCATION/"$BACKUP_TYPE"/ 20 | fi 21 | -------------------------------------------------------------------------------- /deployment/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | listen 443 ssl; 4 | listen [::]:443 ssl; 5 | server_name YOUR_DNS_NAME; 6 | 7 | ssl_certificate 
/etc/letsencrypt/live/YOUR_DNS_NAME/fullchain.pem; 8 | ssl_certificate_key /etc/letsencrypt/live/YOUR_DNS_NAME/privkey.pem; 9 | 10 | location /static { 11 | alias /var/www/priveedly/static; 12 | } 13 | 14 | location / { 15 | 16 | proxy_set_header Host $http_host; 17 | proxy_pass http://127.0.0.1:9090; 18 | 19 | } 20 | 21 | location ~ /.well-known { 22 | allow all; 23 | } 24 | 25 | location ~ /.git { 26 | deny all; 27 | } 28 | 29 | 30 | } 31 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'priveedly.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /tweets/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # NOTE: no longer tested or maintained due to change in Twitter/X API 3 | 4 | from __future__ import unicode_literals 5 | 6 | from django.db import models 7 | from base import Entry 8 | 9 | class TwitterList(models.Model): 10 | name = models.CharField(max_length=100) 11 | list_id = models.BigIntegerField() 12 | since_id = models.BigIntegerField(null=True, default=None) 13 | 14 | def __str__(self): 15 | return u'Twitter List: {}'.format(self.name) 16 | 17 | class Tweet(Entry): 18 | username = models.CharField(max_length=200) 19 | twitter_list = models.ForeignKey(TwitterList, 20 | on_delete=models.SET_NULL, 21 | null=True) 22 | 23 | @property 24 | def entry_type(self): 25 | return "tweets" 26 | 27 | @property 28 | def source(self): 29 | return self.twitter_list.name 30 | -------------------------------------------------------------------------------- /templates/entry_detail.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |

{{object.title}}

5 | [{{object.get_object_category_display}}] {{object.source}} 6 | {{object.published}} 7 | {% if object.image_url %} 8 | 9 | {% endif %} 10 |

{{object.description|safe}}

11 | {% if not object.read_later %} 12 | 13 | {% else %} 14 | 15 | {% endif %} 16 | 17 | {% if not object.interesting %} 18 | 19 | {% endif %} 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /sites/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models 5 | from base import Entry 6 | 7 | class Subreddit(models.Model): 8 | name = models.CharField(max_length=100) 9 | 10 | def __str__(self): 11 | return u'Subreddit: {}'.format(self.name) 12 | 13 | class RedditPost(Entry): 14 | subreddit = models.ForeignKey(Subreddit, 15 | on_delete=models.SET_NULL, 16 | null=True) 17 | 18 | @property 19 | def entry_type(self): 20 | return "reddit" 21 | 22 | @property 23 | def source(self): 24 | if self.subreddit: 25 | return self.subreddit.name 26 | return 'deleted' 27 | 28 | class SitePost(Entry): 29 | site_name = models.CharField(max_length=200) 30 | 31 | @property 32 | def entry_type(self): 33 | return "sites" 34 | 35 | @property 36 | def source(self): 37 | return self.site_name 38 | -------------------------------------------------------------------------------- /templates/entry_list.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block content %} 3 | 30 out of {{total_unread}} 4 | 5 | {% for e in entry_list %} 6 |

{{e.title}}

7 | [{{e.get_entry_category_display}}] {{e.source}} 8 | {{e.published}} 9 | {{e.recommended}} 10 | {% if e.image_url %} 11 | 12 | {% endif %} 13 |

{{e.safe_text|truncatechars:300}}

14 | Read More 15 | {% if e.read_later %} 16 | {% else %} 17 | {% endif %} 18 | {% endfor %} 19 | 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /tweets/views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # NOTE: no longer tested or maintained due to change in Twitter/X API 3 | 4 | from __future__ import unicode_literals 5 | 6 | from django.http import Http404, JsonResponse 7 | from django.views.generic.detail import DetailView 8 | from django.views.generic.list import ListView 9 | from django_filters.views import FilterView 10 | 11 | from tweets.models import Tweet 12 | 13 | from datetime import datetime 14 | import django_filters 15 | 16 | 17 | class TweetFilter(django_filters.FilterSet): 18 | 19 | class Meta: 20 | model = Tweet 21 | fields = ['read', 'read_later', 'published', 'twitter_list__name'] 22 | 23 | 24 | class TweetList(FilterView): 25 | template_name = "entry_list.html" 26 | paginate_by = 30 27 | model = Tweet 28 | context_object_name = 'entry_list' 29 | filterset_class = TweetFilter 30 | 31 | 32 | class TweetDetailView(DetailView): 33 | model = Tweet 34 | template_name = "entry_detail.html" 35 | -------------------------------------------------------------------------------- /feeds/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models 5 | from base import Entry 6 | 7 | class FeedCategory(models.Model): 8 | name = models.CharField(max_length=100) 9 | 10 | def __str__(self): 11 | return u'Category: {}'.format(self.name) 12 | 13 | class Feed(models.Model): 14 | title = models.CharField(max_length=255) 15 | url = models.URLField() 16 | is_alive = models.BooleanField(default=True) 17 | category = models.ForeignKey(FeedCategory, 18 | on_delete=models.SET_NULL, 19 | null=True) 20 | 
updated = models.DateTimeField(null=True, default=None) 21 | last_entry = models.DateTimeField(null=True, default=None) 22 | 23 | def __str__(self): 24 | return u'Feed: {}'.format(self.title) 25 | 26 | class Meta: 27 | ordering = ('-last_entry',) 28 | 29 | class FeedEntry(Entry): 30 | feed = models.ForeignKey(Feed, on_delete=models.CASCADE) 31 | 32 | @property 33 | def entry_type(self): 34 | return "feeds" 35 | 36 | @property 37 | def source(self): 38 | return self.feed.title 39 | -------------------------------------------------------------------------------- /example_env: -------------------------------------------------------------------------------- 1 | DEBUG=True # set to false for production 2 | 3 | SECRET_KEY=_pleaseputalongrandomstringhere_ 4 | ALLOWED_HOSTS=127.0.0.1,YOUR_SERVER_IP_OR_DNS_HERE 5 | DB_USERNAME=changethisplease 6 | DB_PASSWORD=changethisplease 7 | DB_NAME=changethisplease 8 | DB_CONNSTR=enter_your_db_connection_string 9 | 10 | # note that these are different because of some setup redundancies I wanted for my setup, but you can make them the same again in the settings 11 | LOCAL_DB_USERNAME=changethisplease 12 | LOCAL_DB_PASSWORD=changethisplease 13 | LOCAL_DB_NAME=changethisplease 14 | LOCAL_DB_CONNSTR=enter_your_db_connection_string 15 | 16 | 17 | #for the following, you'll need a reddit account and to set yourself up with the reddit API 18 | REDDIT_USERNAME=____ 19 | REDDIT_CLIENT_ID=____ 20 | REDDIT_CLIENT_SECRET=____ 21 | REDDIT_PASSWORD=____ 22 | REDDIT_USER_AGENT="my version of priveedly via praw" 23 | 24 | PIPELINE_FILE=___enter path to your pipeline file if you trained your own via scikitlearn___ 25 | 26 | 27 | # I don't know if this works anymore :( 28 | TWITTER_CONSUMER_KEY=____ 29 | TWITTER_CONSUMER_SECRET=____ 30 | TWITTER_ACCESS_TOKEN=____ 31 | TWITTER_ACCESS_TOKEN_SECRET=____ 32 | -------------------------------------------------------------------------------- /priveedly/urls.py: 
class Command(BaseCommand):
    """Run every content parser (RSS feeds, reddit, Hacker News, lobste.rs).

    Each parser runs in its own try/except so one failing source does not
    stop the others; failures are logged with a full traceback.
    """

    help = 'Parse all configured content sources.'

    def handle(self, *args, **kwargs):
        start = datetime.now()
        # (label, callable) pairs replace four copy-pasted try blocks.
        parsers = (
            ('feed', parse_all_feeds),
            ('reddit', parse_all_subreddits),
            ('hackernews', parse_hackernews),
            ('lobsters', parse_lobsters),
        )
        for label, parser in parsers:
            try:
                parser()
            except Exception:
                # logging.exception keeps the traceback, which the original
                # '{} error'.format(e) message discarded.
                logging.exception('%s error', label)
        logging.info('finished parsing in {}'.format(datetime.now() - start))
-------------------------------------------------------------------------------- /sites/views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.http import Http404, JsonResponse 5 | from django.views.generic.detail import DetailView 6 | from django.views.generic.list import ListView 7 | from django_filters.views import FilterView 8 | 9 | from sites.models import SitePost, RedditPost 10 | 11 | from datetime import datetime 12 | import django_filters 13 | 14 | 15 | class SitesFilter(django_filters.FilterSet): 16 | 17 | class Meta: 18 | model = SitePost 19 | fields = ['read', 'read_later', 'published', 'site_name', 'recommended'] 20 | 21 | class RedditFilter(django_filters.FilterSet): 22 | 23 | class Meta: 24 | model = RedditPost 25 | fields = ['read', 'read_later', 'published', 'subreddit__name', 'recommended'] 26 | 27 | 28 | class SiteList(FilterView): 29 | template_name = "entry_list.html" 30 | paginate_by = 30 31 | model = SitePost 32 | context_object_name = 'entry_list' 33 | filterset_class = SitesFilter 34 | 35 | class SiteDetailView(DetailView): 36 | model = SitePost 37 | template_name = "entry_detail.html" 38 | 39 | class RedditList(FilterView): 40 | template_name = "entry_list.html" 41 | paginate_by = 30 42 | model = RedditPost 43 | context_object_name = 'entry_list' 44 | filterset_class = RedditFilter 45 | 46 | 47 | class RedditDetailView(DetailView): 48 | model = RedditPost 49 | template_name = "entry_detail.html" 50 | -------------------------------------------------------------------------------- /static/custom.js: -------------------------------------------------------------------------------- 1 | function mark_read_later(entry_id, entry_type) { 2 | $.ajaxSetup({ 3 | headers: { 4 | 'X-CSRFToken': Cookies.get('csrftoken'), 5 | } 6 | }); 7 | 8 | $.post("/feeds/mark-read-later/", 9 | {'entry_id': entry_id, 'entry_type': 
/* Install the X-CSRFToken header on subsequent jQuery POSTs.
   Django rejects unsafe requests without it.  Shared by all the
   mutation helpers below instead of being copy-pasted into each. */
function setupCsrf() {
    $.ajaxSetup({
        headers: {
            'X-CSRFToken': Cookies.get('csrftoken'),
        }
    });
}

/* Clear the read-later flag on one entry. */
function unmark_read_later(entry_id, entry_type) {
    setupCsrf();
    $.post("/feeds/unmark-read-later/",
           {'entry_id': entry_id, 'entry_type': entry_type});
}

/* Flag one entry as interesting (training signal for the recommender). */
function mark_interesting(entry_id, entry_type) {
    setupCsrf();
    $.post("/feeds/mark-interesting/",
           {'entry_id': entry_id, 'entry_type': entry_type});
}

/* Mark every entry shown on the page as read, then scroll to the top
   and reload.  Entry ids and types are harvested from the <h3> tags
   rendered by entry_list.html. */
function mark_read() {
    var ids = [];
    var entryTypes = [];
    $("h3").each(function () {
        ids.push(this.id);
        entryTypes.push($(this).attr("entry_type"));
    });
    setupCsrf();
    $.post("/feeds/mark-read/",
           {"id_list": ids.join(","),
            "entry_types": entryTypes.join(",")})
        .done(function () {
            document.body.scrollTop = document.documentElement.scrollTop = 0;
            location.reload();
        });
}
js-cookie v3.0.5 | MIT */ 2 | !function(e,t){"object"==typeof exports&&"undefined"!=typeof module?module.exports=t():"function"==typeof define&&define.amd?define(t):(e="undefined"!=typeof globalThis?globalThis:e||self,function(){var n=e.Cookies,o=e.Cookies=t();o.noConflict=function(){return e.Cookies=n,o}}())}(this,(function(){"use strict";function e(e){for(var t=1;t twlist.since_id: 69 | twlist.last_entry = e.published 70 | twlist.since_id = tweet.id 71 | 72 | twlist.updated = pytz.utc.localize(datetime.utcnow()) 73 | twlist.save() 74 | logging.info("Parsed Twitter list: {} and found {} new items".format( 75 | twlist.name, 76 | entry_count)) 77 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anyio==4.4.0 2 | appnope==0.1.4 3 | argon2-cffi==23.1.0 4 | argon2-cffi-bindings==21.2.0 5 | arrow==1.3.0 6 | asgiref==3.6.0 7 | asttokens==2.2.1 8 | async-lru==2.0.4 9 | attrs==24.2.0 10 | babel==2.16.0 11 | backcall==0.2.0 12 | beautifulsoup4==4.12.2 13 | bleach==6.1.0 14 | certifi==2022.12.7 15 | cffi==1.17.1 16 | charset-normalizer==3.1.0 17 | click==8.1.7 18 | comm==0.2.2 19 | contourpy==1.3.0 20 | cycler==0.12.1 21 | debugpy==1.8.5 22 | decorator==5.1.1 23 | defusedxml==0.7.1 24 | Django==4.2 25 | django-filter==23.1 26 | django-login-required-middleware==0.9.0 27 | django-querysetsequence==0.16 28 | executing==1.2.0 29 | fastjsonschema==2.20.0 30 | feedparser==6.0.10 31 | fonttools==4.53.1 32 | fqdn==1.5.1 33 | gevent==24.10.3 34 | greenlet==3.1.1 35 | h11==0.14.0 36 | HackerNews==2.0.0 37 | hackernews-python==0.3.2 38 | httpcore==1.0.5 39 | httpx==0.27.2 40 | idna==3.4 41 | imageio==2.35.1 42 | imbalanced-learn==0.12.3 43 | ipykernel==6.29.5 44 | ipython==8.13.1 45 | ipywidgets==8.1.5 46 | isoduration==20.11.0 47 | jedi==0.18.2 48 | Jinja2==3.1.4 49 | joblib==1.4.2 50 | json5==0.9.25 51 | jsonpointer==3.0.0 52 | jsonschema==4.23.0 
53 | jsonschema-specifications==2023.12.1 54 | jupyter==1.1.1 55 | kiwisolver==1.4.7 56 | lazy_loader==0.4 57 | lime==0.2.0.1 58 | MarkupSafe==2.1.5 59 | matplotlib==3.9.2 60 | matplotlib-inline==0.1.6 61 | mistune==3.0.2 62 | nbclient==0.10.0 63 | nbconvert==7.16.4 64 | nbformat==5.10.4 65 | nest-asyncio==1.6.0 66 | networkx==3.2 67 | nltk==3.9.1 68 | notebook==7.2.2 69 | notebook_shim==0.2.4 70 | numpy==2.0.0 71 | oauthlib==3.2.2 72 | overrides==7.7.0 73 | packaging==24.1 74 | pandas==2.2.2 75 | pandocfilters==1.5.1 76 | parso==0.8.3 77 | pexpect==4.8.0 78 | pickleshare==0.7.5 79 | pillow==10.4.0 80 | platformdirs==4.3.2 81 | praw==7.7.0 82 | prawcore==2.3.0 83 | prometheus_client==0.20.0 84 | prompt-toolkit==3.0.38 85 | psutil==6.0.0 86 | ptyprocess==0.7.0 87 | pure-eval==0.2.2 88 | pycparser==2.22 89 | Pygments==2.15.1 90 | pyparsing==3.1.4 91 | python-dateutil==2.8.2 92 | python-dotenv==1.0.0 93 | python-json-logger==2.0.7 94 | pytz==2023.3 95 | PyYAML==6.0.2 96 | pyzmq==26.2.0 97 | referencing==0.35.1 98 | regex==2024.7.24 99 | rfc3339-validator==0.1.4 100 | rfc3986-validator==0.1.1 101 | rpds-py==0.20.0 102 | scikit-image==0.24.0 103 | scikit-learn==1.5.1 104 | scipy==1.13.1 105 | seaborn==0.13.2 106 | Send2Trash==1.8.3 107 | sentry-sdk==2.17.0 108 | setuptools==72.1.0 109 | sgmllib3k==1.0.0 110 | six==1.16.0 111 | sniffio==1.3.1 112 | soupsieve==2.4.1 113 | SQLAlchemy==2.0.34 114 | sqlparse==0.4.4 115 | stack-data==0.6.2 116 | terminado==0.18.1 117 | threadpoolctl==3.5.0 118 | tifffile==2024.8.30 119 | tinycss2==1.3.0 120 | tornado==6.4.1 121 | tqdm==4.66.5 122 | traitlets==5.9.0 123 | types-python-dateutil==2.9.0.20240906 124 | typing_extensions==4.12.2 125 | tzdata==2024.1 126 | update-checker==0.18.0 127 | uri-template==1.3.0 128 | urllib3==2.0.2 129 | wcwidth==0.2.6 130 | webcolors==24.8.0 131 | webencodings==0.5.1 132 | wheel==0.43.0 133 | widgetsnbextension==4.0.13 134 | zope.event==5.0 135 | zope.interface==7.1.1 136 | 
def test_feeds_for_zombies():
    """Mark still-alive feeds with no entry in the last 90 days as dead.

    Dead ("zombie") feeds are skipped by parse_all_feeds, so this keeps
    the parsing run from polling abandoned feeds forever.
    """
    all_feeds = Feed.objects.filter(is_alive=True)
    zombies = 0
    # Cutoff is loop-invariant; compute it once.
    cutoff = pytz.utc.localize(datetime.utcnow()) - timedelta(days=90)
    for feed in all_feeds:
        # last_entry is nullable (a feed may never have produced an
        # entry); the original comparison raised TypeError on None.
        # Such feeds are left alone rather than declared dead.
        if feed.last_entry is not None and feed.last_entry < cutoff:
            feed.is_alive = False
            feed.save()
            zombies += 1
    logging.info('Found {} zombies.'.format(zombies))
def parse_feed(feed):
    """Fetch one RSS/Atom feed and store any new entries.

    Creates a FeedEntry for every item newer than a 120-day window that
    is not already stored, updates the feed's ``last_entry``/``updated``
    timestamps, and logs a one-line summary.
    """
    entry_count = 0
    parsed = feedparser.parse(feed.url)

    # Loop-invariant cutoff, hoisted out of the per-entry loop.
    # NOTE: the original used datetime.now().replace(tzinfo=timezone.utc),
    # which mislabels *local* wall time as UTC on non-UTC servers; this
    # produces a genuinely UTC-aware timestamp.
    filter_date = datetime.now(timezone.utc) - timedelta(days=120)

    for entry in parsed.entries:
        pub_date = get_pub_date(entry)
        if pub_date < filter_date:
            continue

        # Restricting the duplicate check to the recent window keeps the
        # query fast once the table is large; .exists() avoids fetching
        # matching rows just to test truthiness.
        if FeedEntry.objects.filter(published__gte=filter_date,
                                    url=entry.link).exists():
            continue

        desc = get_description(entry)

        e = FeedEntry(
            feed=feed,
            title=get_title(entry),
            url=entry.link,
            description=desc,
            entry_category='RS',
            published=pub_date,
        )

        img = get_image(desc)
        if img:
            e.image_url = img

        e.save()
        entry_count += 1
        if not feed.last_entry or e.published > feed.last_entry:
            feed.last_entry = e.published

    feed.updated = datetime.now(timezone.utc)
    feed.save()
    logging.info("Parsed feed: {} and found {} new items".format(
        feed.title,
        entry_count))
import ListView 7 | from django.shortcuts import render 8 | from django_filters.views import FilterView 9 | from queryset_sequence import QuerySetSequence 10 | 11 | from feeds.models import FeedCategory, Feed, FeedEntry 12 | from sites.models import SitePost, RedditPost 13 | 14 | from datetime import datetime 15 | import django_filters 16 | 17 | 18 | class EntryFilter(django_filters.FilterSet): 19 | 20 | class Meta: 21 | model = FeedEntry 22 | fields = ['read', 'read_later', 'feed__title', 'published', 'recommended'] 23 | 24 | 25 | class EntryList(FilterView): 26 | template_name = "entry_list.html" 27 | paginate_by = 30 28 | model = FeedEntry 29 | context_object_name = 'entry_list' 30 | filterset_class = EntryFilter 31 | 32 | 33 | class EntryDetailView(DetailView): 34 | model = FeedEntry 35 | template_name = "entry_detail.html" 36 | 37 | 38 | def main_feed(request): 39 | group_qs = QuerySetSequence( 40 | FeedEntry.objects.filter(read=False), 41 | SitePost.objects.filter(read=False), 42 | RedditPost.objects.filter(read=False)).order_by('published') 43 | 44 | return render(request, 45 | 'entry_list.html', { 46 | 'entry_list': group_qs[:30], 47 | 'total_unread': len(group_qs)}) 48 | 49 | 50 | def read_later_feed(request): 51 | group_qs = QuerySetSequence( 52 | FeedEntry.objects.filter(read_later=True), 53 | SitePost.objects.filter(read_later=True), 54 | RedditPost.objects.filter(read_later=True)).order_by('published')[:20] 55 | return render(request, 56 | 'entry_list.html', {'entry_list': group_qs}) 57 | 58 | 59 | def recommended_feed(request): 60 | group_qs = QuerySetSequence( 61 | FeedEntry.objects.filter(recommended__gte=0.5, read=False), 62 | SitePost.objects.filter(recommended__gte=0.5, read=False), 63 | RedditPost.objects.filter(recommended__gte=0.5, read=False)).order_by('published')[:20] 64 | return render(request, 65 | 'entry_list.html', {'entry_list': group_qs}) 66 | 67 | def mark_read(request): 68 | if request.method == 'POST': 69 | entry_ids = 
request.POST.get('id_list').split(',') 70 | entry_types = request.POST.get('entry_types').split(',') 71 | for etype, ein in zip(entry_types, entry_ids): 72 | if etype == 'sites': 73 | e = SitePost.objects.get(id=ein) 74 | elif etype == 'feeds': 75 | e = FeedEntry.objects.get(id=ein) 76 | elif etype == 'reddit': 77 | e = RedditPost.objects.get(id=ein) 78 | e.read = True 79 | e.save() 80 | return JsonResponse({'success': True}) 81 | return JsonResponse({'success': False, 82 | 'error': 'Please send a list of ids'}) 83 | 84 | def mark_read_later(request): 85 | if request.method == 'POST': 86 | entry_id = request.POST.get('entry_id') 87 | entry_type = request.POST.get('entry_type') 88 | if entry_type == 'sites': 89 | e = SitePost.objects.get(id=entry_id) 90 | elif entry_type == 'feeds': 91 | e = FeedEntry.objects.get(id=entry_id) 92 | elif entry_type == 'reddit': 93 | e = RedditPost.objects.get(id=entry_id) 94 | e.read_later = True 95 | e.save() 96 | return JsonResponse({'success': True}) 97 | return JsonResponse({'success': False, 98 | 'error': 'Please send an entry id'}) 99 | 100 | def unmark_read_later(request): 101 | if request.method == 'POST': 102 | entry_id = request.POST.get('entry_id') 103 | entry_type = request.POST.get('entry_type') 104 | if entry_type == 'sites': 105 | e = SitePost.objects.get(id=entry_id) 106 | elif entry_type == 'feeds': 107 | e = FeedEntry.objects.get(id=entry_id) 108 | elif entry_type == 'reddit': 109 | e = RedditPost.objects.get(id=entry_id) 110 | e.read_later = False 111 | e.save() 112 | return JsonResponse({'success': True}) 113 | return JsonResponse({'success': False, 114 | 'error': 'Please send an entry id'}) 115 | 116 | def mark_interesting(request): 117 | if request.method == 'POST': 118 | entry_id = request.POST.get('entry_id') 119 | entry_type = request.POST.get('entry_type') 120 | if entry_type == 'sites': 121 | e = SitePost.objects.get(id=entry_id) 122 | elif entry_type == 'feeds': 123 | e = FeedEntry.objects.get(id=entry_id) 
124 | elif entry_type == 'reddit': 125 | e = RedditPost.objects.get(id=entry_id) 126 | e.read_later = False 127 | e.interesting = True 128 | e.save() 129 | return JsonResponse({'success': True}) 130 | return JsonResponse({'success': False, 131 | 'error': 'Please send an entry id'}) 132 | -------------------------------------------------------------------------------- /sites/management_commands.py: -------------------------------------------------------------------------------- 1 | from sites.models import Subreddit, RedditPost, SitePost 2 | from django.conf import settings 3 | 4 | import logging 5 | import praw 6 | import pytz 7 | import random 8 | import requests 9 | 10 | from hackernews import HackerNews 11 | from datetime import datetime, timedelta, timezone 12 | 13 | logging.basicConfig(filename='/var/log/priveedly/parse.log', 14 | encoding='utf-8', 15 | level=logging.INFO, 16 | datefmt='%Y-%m-%d %H:%M:%S') 17 | 18 | def parse_all_subreddits(): 19 | all_subreddits = Subreddit.objects.all() 20 | for subreddit in all_subreddits: 21 | try: 22 | parse_reddit(subreddit) 23 | except Exception as e: 24 | logging.error(e) 25 | logging.error('Error parsing subreddit {}'.format(subreddit)) 26 | logging.info('Finished parsing {} subreddits.'.format( 27 | len(all_subreddits))) 28 | 29 | 30 | def get_lobster_posts(url="https://lobste.rs/hottest.json"): 31 | return [r for r in requests.get(url).json()] 32 | 33 | def parse_lobsters(): 34 | entry_count = 0 35 | posts = get_lobsters_posts() 36 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120) 37 | for post in posts: 38 | if not SitePost.objects.filter(published__gte=filter_date, url=post.get('url')): 39 | e = SitePost( 40 | entry_category='LS', 41 | site_name='lobsters', 42 | title=post.get('title')[:354], 43 | url=post.get('url'), 44 | description=post.get('description') + ' Tags: {}'.format( 45 | ' '.join(post.get('tags'))), 46 | published=post.get('created_at'), 47 | ) 48 | e.save() 49 | 
entry_count += 1 50 | logging.info("Parsed lobsters and found {} new items".format(entry_count)) 51 | 52 | 53 | def get_text(hn_item): 54 | if hasattr(hn_item, 'text'): 55 | return hn_item.text 56 | return '' 57 | 58 | 59 | def parse_hackernews(): 60 | entry_count = 0 61 | hn = HackerNews() 62 | stories = [hn.item(x) for x in hn.top_stories()] 63 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120) 64 | 65 | for post in stories: 66 | if not hasattr(post, 'url'): 67 | continue 68 | if not SitePost.objects.filter(published__gte=filter_date, url=post.url): 69 | text = post.title[:354] 70 | if hasattr(post, 'text'): 71 | text = post.text 72 | elif hasattr(post, 'kids'): 73 | sample_size = (lambda y: len(y) if len(y) < 4 else 4)(post.kids) 74 | text = '\n '.join( 75 | [get_text(hn.item(x)) for x in 76 | random.sample(post.kids, sample_size)]) 77 | e = SitePost( 78 | entry_category='HN', 79 | site_name='hackernews', 80 | title=post.title, 81 | url=post.url, 82 | description=text, 83 | published=pytz.utc.localize(post.time) 84 | ) 85 | e.save() 86 | entry_count += 1 87 | logging.info("Parsed hackernews and found {} new items".format(entry_count)) 88 | 89 | 90 | def get_praw(): 91 | return praw.Reddit( 92 | client_id=settings.REDDIT_CLIENT_ID, 93 | client_secret=settings.REDDIT_CLIENT_SECRET, 94 | password=settings.REDDIT_PASSWORD, 95 | user_agent=settings.REDDIT_USER_AGENT, 96 | username=settings.REDDIT_USERNAME) 97 | 98 | def parse_reddit(subreddit): 99 | entry_count = 0 100 | api = get_praw() 101 | posts = api.subreddit(subreddit.name).new(limit=500) 102 | filter_date = datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=120) 103 | 104 | for post in posts: 105 | if not RedditPost.objects.filter(published__gte=filter_date, url=post.url): 106 | pub_date = pytz.utc.localize(datetime.utcfromtimestamp(post.created_utc)) 107 | if pub_date <= filter_date: 108 | continue 109 | text = post.selftext 110 | if not text: 111 | sample_size = 
(lambda y: len(y) if 112 | len(y) < 4 else 4)(list(post.comments)) 113 | text = '\n '.join( 114 | [comment.body for comment in 115 | random.sample(list(post.comments), sample_size)]) 116 | 117 | e = RedditPost( 118 | entry_category='RD', 119 | subreddit=subreddit, 120 | title=post.title[:354], 121 | url=post.url, 122 | description=text, 123 | published=pub_date 124 | ) 125 | e.save() 126 | entry_count += 1 127 | 128 | subreddit.updated = pytz.utc.localize(datetime.utcnow()) 129 | subreddit.save() 130 | logging.info("Parsed subreddit: {} and found {} new items".format( 131 | subreddit.name, 132 | entry_count)) 133 | -------------------------------------------------------------------------------- /priveedly/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for priveedly project. 3 | 4 | Generated by 'django-admin startproject' using Django 4.2. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/4.2/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | from dotenv import load_dotenv 15 | import os 16 | 17 | 18 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 19 | BASE_DIR = Path(__file__).resolve().parent.parent 20 | 21 | load_dotenv() 22 | 23 | # Quick-start development settings - unsuitable for production 24 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ 25 | 26 | # SECURITY WARNING: keep the secret key used in production secret! 27 | SECRET_KEY = os.environ.get('SECRET_KEY') 28 | 29 | # SECURITY WARNING: don't run with debug turned on in production! 
30 | DEBUG = (os.getenv('DEBUG') == 'True') 31 | 32 | ALLOWED_HOSTS = [x for x in os.environ.get('ALLOWED_HOSTS').split(',')] 33 | SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') 34 | CSRF_TRUSTED_ORIGINS = ['https://{}'.format(x) for x in os.environ.get('ALLOWED_HOSTS').split(',') if '.com' in x] 35 | 36 | # Application definition 37 | 38 | INSTALLED_APPS = [ 39 | 'django.contrib.admin', 40 | 'django.contrib.auth', 41 | 'django.contrib.contenttypes', 42 | 'django.contrib.sessions', 43 | 'django.contrib.messages', 44 | 'django.contrib.staticfiles', 45 | 'django_filters', 46 | 'feeds', 47 | 'sites', 48 | ] 49 | 50 | MIDDLEWARE = [ 51 | 'django.middleware.security.SecurityMiddleware', 52 | 'django.contrib.sessions.middleware.SessionMiddleware', 53 | 'django.middleware.common.CommonMiddleware', 54 | 'django.middleware.csrf.CsrfViewMiddleware', 55 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 56 | 'login_required.middleware.LoginRequiredMiddleware', 57 | 'django.contrib.messages.middleware.MessageMiddleware', 58 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 59 | ] 60 | 61 | ROOT_URLCONF = 'priveedly.urls' 62 | 63 | TEMPLATES = [ 64 | { 65 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 66 | 'DIRS': [os.path.join(BASE_DIR, 'templates')], 67 | 'APP_DIRS': True, 68 | 'OPTIONS': { 69 | 'context_processors': [ 70 | 'django.template.context_processors.debug', 71 | 'django.template.context_processors.request', 72 | 'django.contrib.auth.context_processors.auth', 73 | 'django.contrib.messages.context_processors.messages', 74 | ], 75 | }, 76 | }, 77 | ] 78 | 79 | WSGI_APPLICATION = 'priveedly.wsgi.application' 80 | 81 | 82 | LOGIN_REQUIRED_IGNORE_PATHS = [ 83 | r'/admin', 84 | r'/accounts/login/$', 85 | r'/accounts/logout/$', 86 | ] 87 | 88 | LOGIN_REDIRECT_URL = "/" 89 | 90 | # Database 91 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases 92 | 93 | DATABASES = { 94 | 95 | # 'default': { 96 | # 'ENGINE': 
'django.db.backends.sqlite3', 97 | # 'NAME': BASE_DIR / 'db.sqlite3', 98 | # 99 | # }, 100 | #} 101 | 102 | 103 | "default": { 104 | "ENGINE": "django.db.backends.postgresql", 105 | "NAME": os.environ.get('LOCAL_DB_NAME'), 106 | "USER": os.environ.get('LOCAL_DB_USERNAME'), 107 | "PASSWORD": os.environ.get('LOCAL_DB_PASSWORD'), 108 | "HOST": "localhost", 109 | "PORT": "", 110 | 111 | } 112 | 113 | } 114 | 115 | 116 | # Password validation 117 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators 118 | 119 | AUTH_PASSWORD_VALIDATORS = [ 120 | { 121 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 122 | }, 123 | { 124 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 125 | }, 126 | { 127 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 128 | }, 129 | { 130 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 131 | }, 132 | ] 133 | 134 | 135 | # Internationalization 136 | # https://docs.djangoproject.com/en/4.2/topics/i18n/ 137 | 138 | LANGUAGE_CODE = 'en-us' 139 | 140 | TIME_ZONE = 'UTC' 141 | 142 | USE_I18N = True 143 | 144 | USE_TZ = True 145 | 146 | 147 | # Static files (CSS, JavaScript, Images) 148 | # https://docs.djangoproject.com/en/4.2/howto/static-files/ 149 | 150 | STATIC_ROOT = os.path.join(BASE_DIR, 'static') 151 | STATIC_URL = 'static/' 152 | 153 | # Default primary key field type 154 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field 155 | 156 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 157 | 158 | 159 | TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY') 160 | TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET') 161 | TWITTER_ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN') 162 | TWITTER_ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET') 163 | REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID') 164 | REDDIT_CLIENT_SECRET = 
os.environ.get('REDDIT_CLIENT_SECRET') 165 | REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD') 166 | REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT') 167 | REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME') 168 | -------------------------------------------------------------------------------- /feeds/management/commands/rate_all.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | 3 | import pandas as pd 4 | from sqlalchemy import create_engine 5 | from urllib.parse import urlparse 6 | from nltk.corpus import stopwords 7 | from nltk import tokenize 8 | import re 9 | import string 10 | import html 11 | import logging 12 | import joblib 13 | 14 | from sites.models import RedditPost, SitePost 15 | from feeds.models import FeedEntry 16 | from bs4 import BeautifulSoup 17 | 18 | import os 19 | from dotenv import load_dotenv, dotenv_values 20 | load_dotenv() 21 | 22 | import nltk 23 | nltk.download('stopwords') 24 | nltk.download('punkt_tab') 25 | 26 | logging.basicConfig(filename='/var/log/priveedly/rate.log', 27 | encoding='utf-8', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S') 30 | 31 | CLEAN_HTML = re.compile('<.*?>') 32 | CLEAN_NUMBERS = re.compile('[0-9,\\.$\\%]+') 33 | CLEAN_NUMBERS_AND_ONE_LETTER = re.compile('([a-z]\\d+)|(\\d+[a-z])|(\\d+[a-z]\\d+)') 34 | CLEAN_REPEATED_PUNCTUATION = re.compile('[!\\-\\/:-@-`’–{-~"“”\\[\\]]+') 35 | 36 | def tokenize_url(url_str): 37 | parsed_url = urlparse(url_str) 38 | return parsed_url.netloc, ' '.join(parsed_url.path.split('/')).replace('-', ' '), parsed_url.query.replace('?', ' ').replace('=', ' ') 39 | 40 | def prepare_content(pandas_row): 41 | netloc, path, query = tokenize_url(pandas_row.url) 42 | return ' '.join([pandas_row.title, pandas_row.description, pandas_row.site_name, netloc, path, query]) 43 | 44 | # Update this if you change preprocessing! 
45 | def remove_tags_and_lowercase(text): 46 | # some parts from https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string 47 | if BeautifulSoup(text, "html.parser").find(): 48 | try: 49 | soup = BeautifulSoup(text) 50 | text = soup.get_text() 51 | except: 52 | pass 53 | cleantext = html.unescape(text).encode('unicode_escape').decode('unicode_escape') 54 | # you can try this line or other similar things if you want to be more deliberate about cleaning! 55 | #cleantext = re.sub(CLEAN_NUMBERS_AND_ONE_LETTER, '', cleantext) 56 | cleantext = re.sub(CLEAN_NUMBERS, '', cleantext) 57 | cleantext = re.sub(CLEAN_REPEATED_PUNCTUATION, '', cleantext) 58 | return cleantext.lower() 59 | 60 | # Update this if you change preprocessing! 61 | def tokenize_content(text): 62 | removal = set(stopwords.words('english')).union(set(string.punctuation)) 63 | return [w for w in tokenize.word_tokenize(remove_tags_and_lowercase(text)) 64 | if w.lower() not in removal] 65 | 66 | 67 | def get_engine(): 68 | db_str = "postgresql://{}:{}@localhost:5432/{}".format( 69 | os.environ.get('DB_USERNAME'), 70 | os.environ.get('DB_PASSWORD'), 71 | os.environ.get('DB_NAME')) 72 | return create_engine(db_str) 73 | 74 | # Update this if you change preprocessing! 
75 | def create_content_df(engine): 76 | sites_df = pd.read_sql( 77 | "select id, title, url, description, site_name from sites_sitepost WHERE read is False and interesting is False", 78 | con=engine) 79 | sites_df['type'] = 'sites' 80 | feeds_df = pd.read_sql( 81 | "select feeds_feedentry.id as id, feeds_feedentry.title as title, feeds_feedentry.url as url, feeds_feedentry.description as description, feeds_feed.title as site_name from feeds_feedentry JOIN feeds_feed ON feeds_feed.id = feed_id WHERE read is False and interesting is False", 82 | con=engine) 83 | feeds_df['type'] = 'feeds' 84 | reddit_df = pd.read_sql( 85 | "select sites_redditpost.id as id, sites_redditpost.title as title, sites_redditpost.url as url, sites_redditpost.description as description, sites_subreddit.name as site_name from sites_redditpost JOIN sites_subreddit ON sites_redditpost.id = sites_subreddit.id WHERE read is False and interesting is False", 86 | con=engine) 87 | reddit_df['type'] = 'reddit' 88 | return pd.concat([reddit_df, sites_df, feeds_df]) 89 | 90 | 91 | def update_score(pandas_row): 92 | if pandas_row.type == 'sites': 93 | obj = SitePost.objects.get(pk=pandas_row.id) 94 | elif pandas_row.type == 'feeds': 95 | obj = FeedEntry.objects.get(pk=pandas_row.id) 96 | else: 97 | obj = RedditPost.objects.get(pk=pandas_row.id) 98 | obj.recommended = pandas_row.y 99 | obj.save() 100 | 101 | 102 | class Command(BaseCommand): 103 | 104 | def handle(self, *args, **kwargs): 105 | try: 106 | # Update this if you change preprocessing! 
107 | engine = get_engine() 108 | content_df = create_content_df(engine) 109 | logging.info('about to rate {} items'.format(content_df.shape[0])) 110 | content_df['full_text'] = content_df.apply(prepare_content, axis=1) 111 | content_df['cleaned_text'] = content_df['full_text'].map(lambda x: ' '.join(tokenize_content(x))) 112 | pipeline = joblib.load(os.getenv('PIPELINE_FILE')) 113 | if hasattr(pipeline, 'predict_proba'): 114 | proba = pipeline.predict_proba(content_df['cleaned_text']) 115 | # take only positive class 116 | y = proba[:, 1] 117 | else: 118 | y = pipeline.predict(content_df['cleaned_text']) 119 | content_df['y'] = y 120 | content_df.apply(update_score, axis=1) 121 | except Exception as e: 122 | logging.exception(e) 123 | logging.debug('failed to rate incoming content') 124 | -------------------------------------------------------------------------------- /priveedly/prod_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for priveedly project. 3 | 4 | Generated by 'django-admin startproject' using Django 4.2. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/4.2/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | from dotenv import load_dotenv 15 | import os 16 | 17 | import sentry_sdk 18 | from sentry_sdk.integrations.django import DjangoIntegration 19 | 20 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 21 | BASE_DIR = Path(__file__).resolve().parent.parent 22 | 23 | load_dotenv() 24 | 25 | # Quick-start development settings - unsuitable for production 26 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ 27 | 28 | # SECURITY WARNING: keep the secret key used in production secret! 
29 | SECRET_KEY = os.environ.get('SECRET_KEY') 30 | 31 | # SECURITY WARNING: don't run with debug turned on in production! 32 | DEBUG = (os.getenv('DEBUG', 'False') == 'True') 33 | 34 | ALLOWED_HOSTS = [x for x in os.environ.get('ALLOWED_HOSTS').split(',')] 35 | SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') 36 | CSRF_TRUSTED_ORIGINS = ['https://{}'.format(x) for x in os.environ.get('ALLOWED_HOSTS').split(',') if '.com' in x] 37 | 38 | # Application definition 39 | 40 | INSTALLED_APPS = [ 41 | 'django.contrib.admin', 42 | 'django.contrib.auth', 43 | 'django.contrib.contenttypes', 44 | 'django.contrib.sessions', 45 | 'django.contrib.messages', 46 | 'django.contrib.staticfiles', 47 | 'django_filters', 48 | 'feeds', 49 | 'sites', 50 | ] 51 | 52 | MIDDLEWARE = [ 53 | 'django.middleware.security.SecurityMiddleware', 54 | 'django.contrib.sessions.middleware.SessionMiddleware', 55 | 'django.middleware.common.CommonMiddleware', 56 | 'django.middleware.csrf.CsrfViewMiddleware', 57 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 58 | 'login_required.middleware.LoginRequiredMiddleware', 59 | 'django.contrib.messages.middleware.MessageMiddleware', 60 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 61 | ] 62 | 63 | ROOT_URLCONF = 'priveedly.urls' 64 | 65 | TEMPLATES = [ 66 | { 67 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 68 | 'DIRS': [os.path.join(BASE_DIR, 'templates')], 69 | 'APP_DIRS': True, 70 | 'OPTIONS': { 71 | 'context_processors': [ 72 | 'django.template.context_processors.debug', 73 | 'django.template.context_processors.request', 74 | 'django.contrib.auth.context_processors.auth', 75 | 'django.contrib.messages.context_processors.messages', 76 | ], 77 | }, 78 | }, 79 | ] 80 | 81 | WSGI_APPLICATION = 'priveedly.wsgi.application' 82 | 83 | 84 | LOGIN_REQUIRED_IGNORE_PATHS = [ 85 | r'/admin', 86 | r'/accounts/login/$', 87 | r'/accounts/logout/$', 88 | ] 89 | 90 | LOGIN_REDIRECT_URL = "/" 91 | 92 | # Database 93 
| # https://docs.djangoproject.com/en/4.2/ref/settings/#databases 94 | 95 | DATABASES = { 96 | 97 | # 'default': { 98 | # 'ENGINE': 'django.db.backends.sqlite3', 99 | # 'NAME': BASE_DIR / 'db.sqlite3', 100 | # 101 | # }, 102 | #} 103 | 104 | 105 | 106 | 107 | "default": { 108 | "ENGINE": "django.db.backends.postgresql", 109 | "NAME": os.environ.get('DB_NAME'), 110 | "USER": os.environ.get('DB_USERNAME'), 111 | "PASSWORD": os.environ.get('DB_PASSWORD'), 112 | "HOST": "localhost", 113 | "PORT": "", 114 | 115 | } 116 | 117 | } 118 | 119 | 120 | # Password validation 121 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators 122 | 123 | AUTH_PASSWORD_VALIDATORS = [ 124 | { 125 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 126 | }, 127 | { 128 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 129 | }, 130 | { 131 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 132 | }, 133 | { 134 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 135 | }, 136 | ] 137 | 138 | 139 | # Internationalization 140 | # https://docs.djangoproject.com/en/4.2/topics/i18n/ 141 | 142 | LANGUAGE_CODE = 'en-us' 143 | 144 | TIME_ZONE = 'UTC' 145 | 146 | USE_I18N = True 147 | 148 | USE_TZ = True 149 | 150 | 151 | # Static files (CSS, JavaScript, Images) 152 | # https://docs.djangoproject.com/en/4.2/howto/static-files/ 153 | 154 | STATIC_ROOT = os.path.join(BASE_DIR, 'static') 155 | STATIC_URL = 'static/' 156 | 157 | # Default primary key field type 158 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field 159 | 160 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 161 | 162 | sentry_sdk.init( 163 | dsn=os.environ.get('SENTRY_DSN'), 164 | integrations=[ 165 | DjangoIntegration(), 166 | ], 167 | 168 | # Set traces_sample_rate to 1.0 to capture 100% 169 | # of transactions for performance monitoring. 
170 | # We recommend adjusting this value in production. 171 | traces_sample_rate=0.1, 172 | 173 | # If you wish to associate users to errors (assuming you are using 174 | # django.contrib.auth) you may enable sending PII data. 175 | send_default_pii=False 176 | ) 177 | 178 | TWITTER_CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY') 179 | TWITTER_CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET') 180 | TWITTER_ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN') 181 | TWITTER_ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET') 182 | REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID') 183 | REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET') 184 | REDDIT_PASSWORD = os.environ.get('REDDIT_PASSWORD') 185 | REDDIT_USER_AGENT = os.environ.get('REDDIT_USER_AGENT') 186 | REDDIT_USERNAME = os.environ.get('REDDIT_USERNAME') 187 | -------------------------------------------------------------------------------- /deployment/example_initial_ansible.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: YOUR_HOST_NAME 3 | become: yes 4 | 5 | tasks: 6 | 7 | - name: update 8 | apt: update_cache=yes 9 | 10 | - name: install fail2ban 11 | apt: pkg=fail2ban state=present 12 | 13 | - name: install git 14 | apt: name=git state=present 15 | 16 | - name: build essential 17 | apt: name=build-essential state=present 18 | 19 | - name: nginx 20 | apt: name=nginx state=present 21 | 22 | - name: install py3 23 | apt: name=python3 state=present 24 | 25 | - name: install pip 26 | apt: name=python3-pip state=present 27 | 28 | - name: install uwsgi 29 | apt: name=uwsgi state=present 30 | 31 | - name: install emperor 32 | apt: name=uwsgi-emperor state=present 33 | 34 | - name: install uwsgi py3 35 | apt: name=uwsgi-plugin-python3 state=present 36 | 37 | - name: install certbot 38 | apt: name=python3-certbot-nginx state=present 39 | 40 | - name: install postgres 41 | apt: name=postgresql state=present 42 | 
43 | - name: install aws 44 | pip: name=awscli state=present 45 | 46 | - name: mk aws dir 47 | become_user: {{your_user}} 48 | file: path=/home/{{your_user}}/.aws mode=0700 recurse=yes state=directory 49 | 50 | - name: copy aws creds 51 | become_user: {{your_user}} 52 | copy: src=/home/{{your_user}}/.aws/credentials dest=/home/{{your_user}}/.aws/credentials 53 | 54 | - name: copy aws config 55 | become_user: {{your_user}} 56 | copy: src=/home/{{your_user}}/.aws/config dest=/home/{{your_user}}/.aws/config 57 | 58 | - name: install psycopg2 59 | apt: name=python3-psycopg2 state=present 60 | 61 | - name: install certbot 62 | apt: name=certbot state=present install_recommends=yes 63 | 64 | - name: install letsencrypt 65 | apt: name=letsencrypt state=present install_recommends=yes 66 | 67 | - name: change pr home perms to fetch 68 | file: path=/var/www/priveedly mode=0777 state=directory recurse=yes 69 | ignore_errors: yes 70 | 71 | - name: run certbot 72 | shell: certbot certonly --nginx -w /var/www/priveedly -d {{your_dns}} -n -m {{your_email}} --keep-until-expiring --agree-tos 73 | ignore_errors: yes 74 | 75 | - name: fetch python application 76 | become_user: {{your_user}} 77 | git: repo=git@github.com:YOUR_GITHUB/priveedly.git dest=/var/www/priveedly key_file=~/.ssh/id_ecdsa accept_hostkey=yes force=yes 78 | 79 | - name: change home perms to fetch 80 | file: path=/var/www/venv mode=0777 state=directory recurse=yes 81 | ignore_errors: yes 82 | 83 | - name: install virtualenv 84 | pip: executable=pip3 name=virtualenv state=present 85 | 86 | - name: install py requirements 87 | pip: requirements=/var/www/priveedly/requirements.txt virtualenv=/var/www/venv/priveedly virtualenv_python=python3 88 | 89 | - name: copy .venv 90 | copy: src=.prod_env dest=/var/www/priveedly/priveedly/.env 91 | 92 | - name: mk uwsgi dir 93 | file: path=/etc/uwsgi/vassals state=directory recurse=yes mode=0644 94 | 95 | - name: copy uwsgi 96 | copy: src=priveedly.ini 
dest=/etc/uwsgi-emperor/vassals/priveedly.ini 97 | 98 | - name: mk scripts dir 99 | file: path=/home/{{your_user}}/scripts state=directory recurse=yes mode=0777 100 | 101 | - name: copy cron 102 | copy: src=backup.sh dest=/home/{{your_user}}/scripts/priveedly_backup.sh 103 | 104 | - name: copy emperor config 105 | copy: src=emperor.ini dest=/etc/uwsgi/emperor.ini 106 | 107 | - name: copy systemd emperor config 108 | copy: src=templates/emperor.uwsgi.service dest=/etc/systemd/system/emperor.uwsgi.service 109 | 110 | - name: Create database 111 | become: yes 112 | become_user: postgres 113 | become_method: sudo 114 | postgresql_db: name=priveedly encoding='UTF-8' lc_collate='en_US.UTF-8' lc_ctype='en_US.UTF-8' state=present 115 | 116 | - name: copy database 117 | copy: src=templates/priveedly/dump.sql dest=/tmp/backup.sql 118 | 119 | - name: Importing data 120 | become_user: postgres 121 | shell: psql priveedly < /tmp/backup.sql 122 | 123 | - name: Create role for database 124 | become_user: postgres 125 | postgresql_user: db=priveedly user={{your_db_user}} password={{your_db_password}} priv=ALL state=present 126 | 127 | - name: Grant sequence permissions 128 | become_user: postgres 129 | postgresql_privs: database=priveedly state=present privs=ALL type=database roles=priveedly grant_option=no objs=priveedly 130 | 131 | #- name: copy pgpass 132 | #become_user: {{your_user}} 133 | #copy: src=templates/pg_pgpass.txt dest=/home/{{your_user}}/.pgpass mode=0600 134 | 135 | #- name: alter pg_hba 136 | #become_user: postgres 137 | #copy: src=templates/pg_hba.conf dest=/etc/postgresql/9.6/main/pg_hba.conf 138 | #notify: 139 | # - restart postgres 140 | 141 | - name: backup daily cron for sql 142 | cron: name="backup daily cron sql" user="{{your_user}}" minute="0" hour="4" job="bash /home/{{your_user}}/scripts/priveedly_backup.sh -t daily" 143 | 144 | - name: backup weekly cron for sql 145 | cron: name="backup weekly cron sql" user="{{your_user}}" minute="0" hour="4" 
"""Tests for the feeds app: login flow, main reading page actions and RSS parsing."""
import unittest
from django.contrib.auth.models import User
from django.test import TestCase, Client
from feeds.models import FeedCategory, Feed, FeedEntry
from datetime import datetime, timezone, timedelta

from feeds.management_commands import get_pub_date, get_title, get_description, get_image, parse_feed
import feedparser


class LogInTest(TestCase):
    """Verify the site requires login and that valid credentials redirect home."""

    def setUp(self):
        self.credentials = {
            'username': 'testuser',
            'password': 'secret'}
        user = User.objects.create_user(**self.credentials)
        user.save()

    def test_login(self):
        # An anonymous request to the home page should redirect to the login form.
        response = self.client.get('/', follow=True)
        self.assertEqual(response.status_code, 200)
        last_url, status_code = response.redirect_chain[-1]

        self.assertEqual(last_url, '/accounts/login/?next=/')

        # Posting valid credentials should land back on the home page...
        response = self.client.post('/accounts/login/?next=/', self.credentials, follow=True)

        # ...as an active, authenticated user.
        last_url, status_code = response.redirect_chain[-1]
        self.assertEqual(last_url, "/")
        self.assertTrue(response.context['user'].is_active)


class MainPageTesting(TestCase):
    """Exercise the main reading flow: listing, mark-read, read-later, interesting."""

    def setUp(self):
        # Every test needs a client.
        self.client = Client()

        # user account
        self.credentials = {
            'username': 'testuser',
            'password': 'secret'}
        User.objects.create_user(**self.credentials)

        # feeds: one category, one feed, three unread posts
        feed_cat = FeedCategory(name="testing")
        feed_cat.save()
        feed = Feed(title='test feed', url="https://example.com", category=feed_cat)
        feed.save()
        self.posts = []
        for idx in range(3):
            feed_post = FeedEntry(feed=feed,
                                  title="Here is a post #{}".format(idx),
                                  url="https://localhost:8000/post-{}".format(idx),
                                  # Timezone-aware "now"; datetime.utcnow() is
                                  # deprecated and produced a naive timestamp
                                  # that then needed a manual tzinfo patch.
                                  published=datetime.now(timezone.utc))
            feed_post.save()
            self.posts.append(feed_post)

    def test_main_page(self):
        response = self.client.login(**self.credentials)
        response = self.client.get('/', follow=True)
        self.assertEqual(response.status_code, 200)

        # All three unread posts should be listed on the main page.
        for post in self.posts:
            self.assertIn(post.title, str(response.content))
            self.assertIn(post.url, str(response.content))


    def test_mark_read(self):
        response = self.client.login(**self.credentials)
        id_list = '{},{}'.format(self.posts[0].id, self.posts[1].id)
        entry_types = '{},{}'.format(self.posts[0].entry_type, self.posts[1].entry_type)
        response = self.client.post('/feeds/mark-read/',
                                    {'id_list': id_list,
                                     'entry_types': entry_types
                                     })
        self.assertEqual(response.status_code, 200)
        self.assertJSONEqual(
            str(response.content, encoding='utf8'),
            {'success': True}
        )

        # The marked entries are persisted as read...
        self.assertTrue(FeedEntry.objects.get(pk=self.posts[0].id).read)
        self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).read)

        response = self.client.get('/')

        # ...so only the still-unread post remains visible.
        self.assertIn(self.posts[2].title, str(response.content))
        self.assertIn(self.posts[2].url, str(response.content))

        self.assertNotIn(self.posts[1].title, str(response.content))
        self.assertNotIn(self.posts[1].url, str(response.content))

        self.assertNotIn(self.posts[0].title, str(response.content))
        self.assertNotIn(self.posts[0].url, str(response.content))


    def test_mark_read_later(self):
        response = self.client.login(**self.credentials)
        response = self.client.post('/feeds/mark-read-later/',
                                    {'entry_id': self.posts[1].id,
                                     'entry_type': self.posts[1].entry_type
                                     })
        self.assertEqual(response.status_code, 200)
        self.assertJSONEqual(
            str(response.content, encoding='utf8'),
            {'success': True}
        )

        self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).read_later)


        # The read-later page shows only the saved entry.
        response = self.client.get('/read-later/')

        self.assertIn(self.posts[1].title, str(response.content))
        self.assertIn(self.posts[1].url, str(response.content))

        self.assertNotIn(self.posts[2].title, str(response.content))
        self.assertNotIn(self.posts[2].url, str(response.content))

        self.assertNotIn(self.posts[0].title, str(response.content))
        self.assertNotIn(self.posts[0].url, str(response.content))


        # Unmarking removes it from the read-later page again.
        response = self.client.post('/feeds/unmark-read-later/',
                                    {'entry_id': self.posts[1].id,
                                     'entry_type': self.posts[1].entry_type
                                     })
        self.assertEqual(response.status_code, 200)
        self.assertJSONEqual(
            str(response.content, encoding='utf8'),
            {'success': True}
        )

        self.assertFalse(FeedEntry.objects.get(pk=self.posts[1].id).read_later)

        response = self.client.get('/read-later/')

        self.assertNotIn(self.posts[1].title, str(response.content))
        self.assertNotIn(self.posts[1].url, str(response.content))

    def test_mark_interesting(self):
        response = self.client.login(**self.credentials)
        response = self.client.post('/feeds/mark-interesting/',
                                    {'entry_id': self.posts[1].id,
                                     'entry_type': self.posts[1].entry_type
                                     })
        self.assertEqual(response.status_code, 200)
        self.assertJSONEqual(
            str(response.content, encoding='utf8'),
            {'success': True}
        )

        # "Interesting" is independent of "read later".
        self.assertTrue(FeedEntry.objects.get(pk=self.posts[1].id).interesting)
        self.assertFalse(FeedEntry.objects.get(pk=self.posts[1].id).read_later)


class ParseFeedTests(TestCase):
    """Unit- and integration-level checks for the RSS parsing helpers."""

    def setUp(self):
        # Every test needs a client.
        self.client = Client()

        # user account
        self.credentials = {
            'username': 'testuser',
            'password': 'secret'}
        User.objects.create_user(**self.credentials)
        # Fixture feeds checked into the repository.
        self.feed_files = [
            'feeds/tests/iapp.rss',
            'feeds/tests/nomnom.rss',
            'feeds/tests/jvns_ca.xml',
        ]

    def test_feed_parsing_units(self):
        # NOTE: the original opened each fixture with open().read() into an
        # unused variable, leaking the file handle; feedparser.parse() reads
        # the file itself, so that line was removed.
        for feed_file in self.feed_files:
            parser = feedparser.parse(feed_file)
            for entry in parser.entries:

                # test pub date
                pub_date = get_pub_date(entry)
                self.assertTrue(pub_date.year >= 2023)
                self.assertTrue(isinstance(pub_date, datetime))

                # test get_title
                title = get_title(entry)
                self.assertTrue(isinstance(title, str))
                self.assertTrue(len(title)<=355)
                if hasattr(entry, 'title'):
                    self.assertEqual(title, entry.title)

                # test get_description
                description = get_description(entry)
                self.assertTrue(isinstance(description, str))
                if hasattr(entry, 'content'):
                    self.assertIn(description,
                                  ''.join([ce.get('value') for ce in entry.content]))
                elif hasattr(entry, 'description'):
                    self.assertEqual(description, entry.description)

                # test get_image
                image = get_image(description)
                if hasattr(entry, 'content'):
                    if image:
                        self.assertTrue(isinstance(image, str))
                        self.assertIn(image,
                                      ''.join([ce.get('value') for ce in entry.content]))
                    else:
                        self.assertEqual(image, None)

    def test_feed_reader(self):
        response = self.client.login(**self.credentials)
        initial_response = self.client.get('/')
        self.assertEqual(initial_response.status_code, 200)

        feed = Feed(title='test', url=self.feed_files[1])  # WARNING: this has to be updated
                                                           # for timely parsing (i.e. update RSS feed)
        feed.save()
        parse_feed(feed)

        # Parsing the fixture feed should add entries, so the home page grows.
        response = self.client.get('/')

        self.assertEqual(response.status_code, 200)

        self.assertNotEqual(initial_response.content, response.content)
        self.assertTrue(len(response.content) > len(initial_response.content))
It is currently only useful for you if you know Python and can navigate setting up your own server. 17 | 18 | > A short video [introduction to Priveedly is on YouTube](https://youtu.be/_aHZlSUO8Qs) 19 | 20 | If this is too advanced for you, stay tuned! I plan to find some one-click install setups for non-dev/tech folks. 🙂 21 | 22 | ### Why run your own content and feed-reader? 23 | 24 | - 🎯 **Autonomy**: Decide what types of content you want to read and update yourself without an algorithm. 25 | - 🔐 **Privacy**: It's a private service, for you and by you. Unless you give someone your login, they won't read your feed. 26 | - 💸 **No ads**: Because why do you want ads in the middle of your reading? 27 | - 🤓 **Self-study**: Because training ML models for yourself and by yourself can be a fun way to safely do data science and ML without contributing your data to a large-scale content platform. 28 | 29 | You can read more [about my experience and motivation on my blog](https://blog.kjamistan.com/priveedly-your-private-and-personal-content-reader-and-recommender.html). 30 | 31 | ## Table of Contents 32 | - [Installation](#installation) 33 | - [Usage](#usage) 34 | - [Contributing](#contributing) 35 | - [License](#license) 36 | 37 | ## Installation 38 | 39 | This repository doesn't yet have an easy one-click launch, but I am hoping someone might contribute that! 40 | 41 | For those who already know Python, the basic setup is as follows: 42 | 43 | 1. Clone the repository: 44 | ```bash 45 | git clone https://github.com/kjam/priveedly.git 46 | ``` 47 | 48 | 2. Create a virtual or Conda environment. 49 | ```bash 50 | conda create -n priveedly 51 | ``` 52 | 53 | 3. Install dependencies: 54 | ```bash 55 | conda install pip 56 | pip install -r requirements.txt 57 | ``` 58 | 59 | 4. Create database that can be used with Django, like postgresql (or your favorite database here). 60 | 61 | 5. Add environment file with necessary variables, see [example environment file](example_env). 
This needs to then be saved as '.env' in the main directory. 62 | 63 | 6. Migrate the database. 64 | ```bash 65 | python manage.py makemigrations 66 | python manage.py migrate 67 | ``` 68 | 69 | 7. Create a super user. 70 | ```bash 71 | python manage.py createsuperuser 72 | ``` 73 | 74 | 8. Run the server and navigate to /admin to log in. 75 | ```bash 76 | python manage.py runserver 77 | ``` 78 | 79 | The project is currently only tested on Python 3.9, but appears to work for other versions. Due to the dependencies, you might need to hold back your Python version if you plan on using things like Jupyter, scikit-learn, etc to train your models. 80 | 81 | ## Usage 82 | 83 | ### Local use 84 | 85 | First, get everything installed above and working. Then, you can enter a few feeds and subreddits you might like to read. To do so, add the feeds directly in the admin website (navigate there after running the runserver command). 86 | 87 | Then, you can test whether the parsing is working properly by opening a new terminal in your virtual environment and then running the following command. 88 | 89 | ```bash 90 | python manage.py parse_all 91 | ``` 92 | 93 | This command parses your saved feeds, Subreddits along with HackerNews and Lobste.rs top stories. If you'd like to change the parsing, please update the /feeds/commands/parse_all.py file. 94 | 95 | You can then navigate to the homepage when logged in (http://127.0.0.1:8000) and see the parsed feeds. The general reading flow is as follows: 96 | 97 | - ⏳ **Progress Bar**: On the top you have a bar telling you how far you are along in your reading backlog. It's very minimal and if you want to change it or redesign it, feel free! 98 | 99 | - 📚 **Articles**: Each page loads an oldest-first of all of your different feeds, reddits and other stories from HN and lobsters. If you click on the titles of any article, it will navigate to that article (use CNTRL+click if you want to open in a new tab). 
100 | 101 | - ✅️ **Read Later**: If you see something you want to read later, click read later and it will be saved for later access. To see your read-later, navigate to http://127.0.0.1:8000/read-later 102 | 103 | - 📊 To train your **own recommendation model**, you'll need to save things to read-later, and then mark the content you like as interesting. If you want to use a different workflow, you could also change the main view to expose the interesting button on the main page. 104 | 105 | - 🗂️ **Recommended**: Once you have your own recommendation model running, you can visit http://127.0.0.1:8000/recommended to skip to the articles you might like the most. 106 | 107 | I wouldn't recommend running locally if you want to use it regularly, because it has the ability to parse and run in the background if you get it setup on a server. (see below) 108 | 109 | For the first few thousand entries, I wouldn't bother trying to train or use the machine learning parts because it won't be enough data for a useful model. Once you have many thousands of posts, it's worth using the machine learning example. If you are new to building language classification models, I recommend starting by watching [my video](https://youtu.be/AMy3K3NbrLw) and then trying it for yourself by running Jupyter in the notebooks folder and following along. 110 | 111 | To get Reddit working, you'll need to [sign up to get an API key](https://praw.readthedocs.io/en/stable/getting_started/configuration.html#configuration) and then store that in your environment file. To see how to do that, please see the [example environment file](example_env). That should then be saved as '.env' in the main directory. 112 | 113 | ### Personal server use 114 | 115 | Ideally, you have access to a server and can get Priveedly set up on that server. 
If you are familiar with [Ansible](https://docs.ansible.com/ansible/latest/index.html), you can see several reference scripts for your use in the deployment folder but they probably require updates or modifications based on your operating system and cloud provider. 116 | 117 | If you are willing to contribute a Dockerfile to ease deployment for those unfamiliar with Ansible, I would greatly appreciate any contributions. 118 | 119 | Important to note that when using a server, you'll want: 120 | 121 | - Enough storage to store all of your favorite articles and run the parser 122 | - Eventually enough RAM to run ML classification tasks 123 | - Good connectivity/throughput for parsing 124 | 125 | I recommend running the parse_all command every 2-3 hours if you plan on using the app relatively frequently. 126 | 127 | Once you have enough data to train your own model, you'll want to do that locally and then deploy it to the server. 128 | 129 | ### Training your own model 130 | 131 | If machine learning is new to you, you can get started by watching [my YouTube video](https://youtu.be/AMy3K3NbrLw) and then trying it for yourself by running Jupyter in the notebooks folder and following along. 132 | 133 | > Note: When/if you modify the data preparation steps, you must also modify the rate_all.py script in the feeds/management/commands folder. The data going into the model on the server must match how the model was trained. 134 | 135 | Once your model is trained, I recommend running the rate_all about once a day or every 6 hours if you have an especially busy feedreader. 136 | 137 | I'm happy to also host more Notebook and training contributions if you find a different model type works well for you, if you have another notebook that works better for a different set of languages or a more production-ready setup. You can also post your own notebook and explanations on your own site/repo for others to learn from! 138 | 139 | ### Some additional notes 140 | 141 | 1. 
So long tweets: Twitter changed their API and moved to paid only access after I had already been using this reader for a year or so. 😩 Therefore, I am not sure if the /tweets section still works anymore or not. If you are an active X user and want to test it out and let me know, I'd appreciate the feedback! 142 | 143 | 2. One-click deploy: I'd be really happy if someone wants to figure out an easy way for people to one-click deploy this. If you offer a service like this, please let me know and I'll see if I can get the repository in a shape to get it working! 144 | 145 | 3. Monkeypatching with Django: I originally started monkeypatching some of the sites parsing to add tests, only to find that django testing and monkeypatching are a bit of a pain when used together. If you have experience making these play nice, I'd love some help. 146 | 147 | 4. Supporting other languages and classifiers with beginner-friendly instructions: Because I'd like this to be useful for people of all ML-levels and also folks who like reading non-English texts, it'd be awesome to have more Jupyter contributions and accompanying posts/videos to help folks test out different types of one-person-use recommenders. Contributions are very welcome! 148 | 149 | ## Contributing 150 | 151 | I heartily welcome contributions that would benefit others. First and foremost, please use the project yourself before making significant contributions. 152 | 153 | I also suggest looking through the Issues for open asks from myself and other users. 154 | 155 | In general, please use the following workflow: 156 | 157 | 1. Fork the repository. 158 | 2. Create a new branch: `git checkout -b feature-name`. 159 | 3. Make your changes. Please include tests if providing significant new functionality. 160 | 4. Push your branch: `git push origin feature-name`. 161 | 5. Create a pull request. 162 | 163 | ## License 164 | 165 | The project is shared under the [GNU Public License](LICENSE.md). 
166 | -------------------------------------------------------------------------------- /static/simple.css: -------------------------------------------------------------------------------- 1 | /* Global variables. */ 2 | :root, 3 | ::backdrop { 4 | /* Set sans-serif & mono fonts */ 5 | --sans-font: -apple-system, BlinkMacSystemFont, "Avenir Next", Avenir, 6 | "Nimbus Sans L", Roboto, "Noto Sans", "Segoe UI", Arial, Helvetica, 7 | "Helvetica Neue", sans-serif; 8 | --mono-font: Consolas, Menlo, Monaco, "Andale Mono", "Ubuntu Mono", monospace; 9 | --standard-border-radius: 5px; 10 | 11 | /* Default (light) theme */ 12 | --bg: #fff; 13 | --accent-bg: #f5f7ff; 14 | --text: #212121; 15 | --text-light: #585858; 16 | --border: #898EA4; 17 | --accent: #C15CF6; 18 | --code: #d81b60; 19 | --preformatted: #444; 20 | --marked: #ffdd33; 21 | --disabled: #efefef; 22 | } 23 | 24 | /* Dark theme */ 25 | @media (prefers-color-scheme: dark) { 26 | :root, 27 | ::backdrop { 28 | color-scheme: dark; 29 | --bg: #212121; 30 | --accent-bg: #2b2b2b; 31 | --text: #dcdcdc; 32 | --text-light: #ababab; 33 | --accent: #C15CF6; 34 | --code: #f06292; 35 | --preformatted: #ccc; 36 | --disabled: #111; 37 | } 38 | /* Add a bit of transparency so light media isn't so glaring in dark mode */ 39 | img, 40 | video { 41 | opacity: 0.8; 42 | } 43 | } 44 | 45 | /* Reset box-sizing */ 46 | *, *::before, *::after { 47 | box-sizing: border-box; 48 | } 49 | 50 | /* Reset default appearance */ 51 | textarea, 52 | select, 53 | input, 54 | progress { 55 | appearance: none; 56 | -webkit-appearance: none; 57 | -moz-appearance: none; 58 | } 59 | 60 | html { 61 | /* Set the font globally */ 62 | font-family: var(--sans-font); 63 | scroll-behavior: smooth; 64 | } 65 | 66 | /* Make the body a nice central block */ 67 | body { 68 | color: var(--text); 69 | background-color: var(--bg); 70 | font-size: 1.15rem; 71 | line-height: 1.5; 72 | display: grid; 73 | grid-template-columns: 1fr min(45rem, 90%) 1fr; 74 | margin: 0; 75 
| } 76 | body > * { 77 | grid-column: 2; 78 | } 79 | 80 | /* Make the header bg full width, but the content inline with body */ 81 | body > header { 82 | background-color: var(--accent-bg); 83 | border-bottom: 1px solid var(--border); 84 | text-align: center; 85 | padding: 0 0.5rem 2rem 0.5rem; 86 | grid-column: 1 / -1; 87 | } 88 | 89 | body > header h1 { 90 | max-width: 1200px; 91 | margin: 1rem auto; 92 | } 93 | 94 | body > header p { 95 | max-width: 40rem; 96 | margin: 1rem auto; 97 | } 98 | 99 | /* Add a little padding to ensure spacing is correct between content and header > nav */ 100 | main { 101 | padding-top: 1.5rem; 102 | } 103 | 104 | body > footer { 105 | margin-top: 4rem; 106 | padding: 2rem 1rem 1.5rem 1rem; 107 | color: var(--text-light); 108 | font-size: 0.9rem; 109 | text-align: center; 110 | border-top: 1px solid var(--border); 111 | } 112 | 113 | /* Format headers */ 114 | h1 { 115 | font-size: 3rem; 116 | } 117 | 118 | h2 { 119 | font-size: 2.6rem; 120 | margin-top: 3rem; 121 | } 122 | 123 | h3 { 124 | font-size: 2rem; 125 | margin-top: 3rem; 126 | } 127 | 128 | h4 { 129 | font-size: 1.44rem; 130 | } 131 | 132 | h5 { 133 | font-size: 1.15rem; 134 | } 135 | 136 | h6 { 137 | font-size: 0.96rem; 138 | } 139 | 140 | /* Prevent long strings from overflowing container */ 141 | p, h1, h2, h3, h4, h5, h6 { 142 | overflow-wrap: break-word; 143 | } 144 | 145 | /* Fix line height when title wraps */ 146 | h1, 147 | h2, 148 | h3 { 149 | line-height: 1.1; 150 | } 151 | 152 | /* Reduce header size on mobile */ 153 | @media only screen and (max-width: 720px) { 154 | h1 { 155 | font-size: 2.5rem; 156 | } 157 | 158 | h2 { 159 | font-size: 2.1rem; 160 | } 161 | 162 | h3 { 163 | font-size: 1.75rem; 164 | } 165 | 166 | h4 { 167 | font-size: 1.25rem; 168 | } 169 | } 170 | 171 | /* Format links & buttons */ 172 | a, 173 | a:visited { 174 | color: var(--accent); 175 | } 176 | 177 | a:hover { 178 | text-decoration: none; 179 | } 180 | 181 | button, 182 | 
[role="button"], 183 | input[type="submit"], 184 | input[type="reset"], 185 | input[type="button"], 186 | label[type="button"] { 187 | border: none; 188 | border-radius: var(--standard-border-radius); 189 | background-color: var(--accent); 190 | font-size: 1rem; 191 | color: var(--bg); 192 | padding: 0.7rem 0.9rem; 193 | margin: 0.5rem 0; 194 | } 195 | 196 | button[disabled], 197 | [role="button"][aria-disabled="true"], 198 | input[type="submit"][disabled], 199 | input[type="reset"][disabled], 200 | input[type="button"][disabled], 201 | input[type="checkbox"][disabled], 202 | input[type="radio"][disabled], 203 | select[disabled] { 204 | cursor: not-allowed; 205 | } 206 | 207 | input:disabled, 208 | textarea:disabled, 209 | select:disabled, 210 | button[disabled] { 211 | cursor: not-allowed; 212 | background-color: var(--disabled); 213 | color: var(--text-light) 214 | } 215 | 216 | input[type="range"] { 217 | padding: 0; 218 | } 219 | 220 | /* Set the cursor to '?' on an abbreviation and style the abbreviation to show that there is more information underneath */ 221 | abbr[title] { 222 | cursor: help; 223 | text-decoration-line: underline; 224 | text-decoration-style: dotted; 225 | } 226 | 227 | button:enabled:hover, 228 | [role="button"]:not([aria-disabled="true"]):hover, 229 | input[type="submit"]:enabled:hover, 230 | input[type="reset"]:enabled:hover, 231 | input[type="button"]:enabled:hover, 232 | label[type="button"]:hover { 233 | filter: brightness(1.4); 234 | cursor: pointer; 235 | } 236 | 237 | button:focus-visible:where(:enabled, [role="button"]:not([aria-disabled="true"])), 238 | input:enabled:focus-visible:where( 239 | [type="submit"], 240 | [type="reset"], 241 | [type="button"] 242 | ) { 243 | outline: 2px solid var(--accent); 244 | outline-offset: 1px; 245 | } 246 | 247 | /* Format navigation */ 248 | header > nav { 249 | font-size: 1rem; 250 | line-height: 2; 251 | padding: 1rem 0 0 0; 252 | } 253 | 254 | /* Use flexbox to allow items to wrap, as 
needed */ 255 | header > nav ul, 256 | header > nav ol { 257 | align-content: space-around; 258 | align-items: center; 259 | display: flex; 260 | flex-direction: row; 261 | flex-wrap: wrap; 262 | justify-content: center; 263 | list-style-type: none; 264 | margin: 0; 265 | padding: 0; 266 | } 267 | 268 | /* List items are inline elements, make them behave more like blocks */ 269 | header > nav ul li, 270 | header > nav ol li { 271 | display: inline-block; 272 | } 273 | 274 | header > nav a, 275 | header > nav a:visited { 276 | margin: 0 0.5rem 1rem 0.5rem; 277 | border: 1px solid var(--border); 278 | border-radius: var(--standard-border-radius); 279 | color: var(--text); 280 | display: inline-block; 281 | padding: 0.1rem 1rem; 282 | text-decoration: none; 283 | } 284 | 285 | header > nav a:hover { 286 | border-color: var(--accent); 287 | color: var(--accent); 288 | cursor: pointer; 289 | } 290 | 291 | /* Reduce nav side on mobile */ 292 | @media only screen and (max-width: 720px) { 293 | header > nav a { 294 | border: none; 295 | padding: 0; 296 | text-decoration: underline; 297 | line-height: 1; 298 | } 299 | } 300 | 301 | /* Consolidate box styling */ 302 | aside, details, pre, progress { 303 | background-color: var(--accent-bg); 304 | border: 1px solid var(--border); 305 | border-radius: var(--standard-border-radius); 306 | margin-bottom: 1rem; 307 | } 308 | 309 | aside { 310 | font-size: 1rem; 311 | width: 30%; 312 | padding: 0 15px; 313 | margin-left: 15px; 314 | float: right; 315 | } 316 | 317 | /* Make aside full-width on mobile */ 318 | @media only screen and (max-width: 720px) { 319 | aside { 320 | width: 100%; 321 | float: none; 322 | margin-left: 0; 323 | } 324 | } 325 | 326 | article, fieldset, dialog { 327 | border: 1px solid var(--border); 328 | padding: 1rem; 329 | border-radius: var(--standard-border-radius); 330 | margin-bottom: 1rem; 331 | } 332 | 333 | article h2:first-child, 334 | section h2:first-child { 335 | margin-top: 1rem; 336 | } 337 | 338 
| section { 339 | border-top: 1px solid var(--border); 340 | border-bottom: 1px solid var(--border); 341 | padding: 2rem 1rem; 342 | margin: 3rem 0; 343 | } 344 | 345 | /* Don't double separators when chaining sections */ 346 | section + section, 347 | section:first-child { 348 | border-top: 0; 349 | padding-top: 0; 350 | } 351 | 352 | section:last-child { 353 | border-bottom: 0; 354 | padding-bottom: 0; 355 | } 356 | 357 | details { 358 | padding: 0.7rem 1rem; 359 | } 360 | 361 | summary { 362 | cursor: pointer; 363 | font-weight: bold; 364 | padding: 0.7rem 1rem; 365 | margin: -0.7rem -1rem; 366 | word-break: break-all; 367 | } 368 | 369 | details[open] > summary + * { 370 | margin-top: 0; 371 | } 372 | 373 | details[open] > summary { 374 | margin-bottom: 0.5rem; 375 | } 376 | 377 | details[open] > :last-child { 378 | margin-bottom: 0; 379 | } 380 | 381 | /* Format tables */ 382 | table { 383 | border-collapse: collapse; 384 | margin: 1.5rem 0; 385 | } 386 | 387 | td, 388 | th { 389 | border: 1px solid var(--border); 390 | text-align: left; 391 | padding: 0.5rem; 392 | } 393 | 394 | th { 395 | background-color: var(--accent-bg); 396 | font-weight: bold; 397 | } 398 | 399 | tr:nth-child(even) { 400 | /* Set every other cell slightly darker. Improves readability. 
*/ 401 | background-color: var(--accent-bg); 402 | } 403 | 404 | table caption { 405 | font-weight: bold; 406 | margin-bottom: 0.5rem; 407 | } 408 | 409 | /* Format forms */ 410 | textarea, 411 | select, 412 | input { 413 | font-size: inherit; 414 | font-family: inherit; 415 | padding: 0.5rem; 416 | margin-bottom: 0.5rem; 417 | color: var(--text); 418 | background-color: var(--bg); 419 | border: 1px solid var(--border); 420 | border-radius: var(--standard-border-radius); 421 | box-shadow: none; 422 | max-width: 100%; 423 | display: inline-block; 424 | } 425 | label { 426 | display: block; 427 | } 428 | textarea:not([cols]) { 429 | width: 100%; 430 | } 431 | 432 | /* Add arrow to drop-down */ 433 | select:not([multiple]) { 434 | background-image: linear-gradient(45deg, transparent 49%, var(--text) 51%), 435 | linear-gradient(135deg, var(--text) 51%, transparent 49%); 436 | background-position: calc(100% - 15px), calc(100% - 10px); 437 | background-size: 5px 5px, 5px 5px; 438 | background-repeat: no-repeat; 439 | padding-right: 25px; 440 | } 441 | 442 | /* checkbox and radio button style */ 443 | input[type="checkbox"], 444 | input[type="radio"] { 445 | vertical-align: middle; 446 | position: relative; 447 | width: min-content; 448 | } 449 | 450 | input[type="checkbox"] + label, 451 | input[type="radio"] + label { 452 | display: inline-block; 453 | } 454 | 455 | input[type="radio"] { 456 | border-radius: 100%; 457 | } 458 | 459 | input[type="checkbox"]:checked, 460 | input[type="radio"]:checked { 461 | background-color: var(--accent); 462 | } 463 | 464 | input[type="checkbox"]:checked::after { 465 | /* Creates a rectangle with colored right and bottom borders which is rotated to look like a check mark */ 466 | content: " "; 467 | width: 0.18em; 468 | height: 0.32em; 469 | border-radius: 0; 470 | position: absolute; 471 | top: 0.05em; 472 | left: 0.17em; 473 | background-color: transparent; 474 | border-right: solid var(--bg) 0.08em; 475 | border-bottom: solid 
var(--bg) 0.08em; 476 | font-size: 1.8em; 477 | transform: rotate(45deg); 478 | } 479 | input[type="radio"]:checked::after { 480 | /* creates a colored circle for the checked radio button */ 481 | content: " "; 482 | width: 0.25em; 483 | height: 0.25em; 484 | border-radius: 100%; 485 | position: absolute; 486 | top: 0.125em; 487 | background-color: var(--bg); 488 | left: 0.125em; 489 | font-size: 32px; 490 | } 491 | 492 | /* Makes input fields wider on smaller screens */ 493 | @media only screen and (max-width: 720px) { 494 | textarea, 495 | select, 496 | input { 497 | width: 100%; 498 | } 499 | } 500 | 501 | /* Set a height for color input */ 502 | input[type="color"] { 503 | height: 2.5rem; 504 | padding: 0.2rem; 505 | } 506 | 507 | /* do not show border around file selector button */ 508 | input[type="file"] { 509 | border: 0; 510 | } 511 | 512 | /* Misc body elements */ 513 | hr { 514 | border: none; 515 | height: 1px; 516 | background: var(--border); 517 | margin: 1rem auto; 518 | } 519 | 520 | mark { 521 | padding: 2px 5px; 522 | border-radius: var(--standard-border-radius); 523 | background-color: var(--marked); 524 | color: black; 525 | } 526 | 527 | img, 528 | video { 529 | max-width: 100%; 530 | height: auto; 531 | border-radius: var(--standard-border-radius); 532 | } 533 | 534 | figure { 535 | margin: 0; 536 | display: block; 537 | overflow-x: auto; 538 | } 539 | 540 | figcaption { 541 | text-align: center; 542 | font-size: 0.9rem; 543 | color: var(--text-light); 544 | margin-bottom: 1rem; 545 | } 546 | 547 | blockquote { 548 | margin: 2rem 0 2rem 2rem; 549 | padding: 0.4rem 0.8rem; 550 | border-left: 0.35rem solid var(--accent); 551 | color: var(--text-light); 552 | font-style: italic; 553 | } 554 | 555 | cite { 556 | font-size: 0.9rem; 557 | color: var(--text-light); 558 | font-style: normal; 559 | } 560 | 561 | dt { 562 | color: var(--text-light); 563 | } 564 | 565 | /* Use mono font for code elements */ 566 | code, 567 | pre, 568 | pre span, 569 | 
kbd, 570 | samp { 571 | font-family: var(--mono-font); 572 | color: var(--code); 573 | } 574 | 575 | kbd { 576 | color: var(--preformatted); 577 | border: 1px solid var(--preformatted); 578 | border-bottom: 3px solid var(--preformatted); 579 | border-radius: var(--standard-border-radius); 580 | padding: 0.1rem 0.4rem; 581 | } 582 | 583 | pre { 584 | padding: 1rem 1.4rem; 585 | max-width: 100%; 586 | overflow: auto; 587 | color: var(--preformatted); 588 | } 589 | 590 | /* Fix embedded code within pre */ 591 | pre code { 592 | color: var(--preformatted); 593 | background: none; 594 | margin: 0; 595 | padding: 0; 596 | } 597 | 598 | /* Progress bars */ 599 | /* Declarations are repeated because you */ 600 | /* cannot combine vendor-specific selectors */ 601 | progress { 602 | width: 100%; 603 | } 604 | 605 | progress:indeterminate { 606 | background-color: var(--accent-bg); 607 | } 608 | 609 | progress::-webkit-progress-bar { 610 | border-radius: var(--standard-border-radius); 611 | background-color: var(--accent-bg); 612 | } 613 | 614 | progress::-webkit-progress-value { 615 | border-radius: var(--standard-border-radius); 616 | background-color: var(--accent); 617 | } 618 | 619 | progress::-moz-progress-bar { 620 | border-radius: var(--standard-border-radius); 621 | background-color: var(--accent); 622 | transition-property: width; 623 | transition-duration: 0.3s; 624 | } 625 | 626 | progress:indeterminate::-moz-progress-bar { 627 | background-color: var(--accent-bg); 628 | } 629 | 630 | dialog { 631 | max-width: 40rem; 632 | margin: auto; 633 | } 634 | 635 | dialog::backdrop { 636 | background-color: var(--bg); 637 | opacity: 0.8; 638 | } 639 | 640 | @media only screen and (max-width: 720px) { 641 | dialog { 642 | max-width: 100%; 643 | margin: auto 1em; 644 | } 645 | } 646 | 647 | /* Classes for buttons and notices */ 648 | .button, 649 | .button:visited { 650 | display: inline-block; 651 | text-decoration: none; 652 | border: none; 653 | border-radius: 5px; 654 | 
background: var(--accent); 655 | font-size: 1rem; 656 | color: var(--bg); 657 | padding: 0.7rem 0.9rem; 658 | margin: 0.5rem 0; 659 | } 660 | 661 | .button:hover, 662 | .button:focus { 663 | filter: brightness(1.4); 664 | cursor: pointer; 665 | } 666 | 667 | .notice { 668 | background: var(--accent-bg); 669 | border: 2px solid var(--border); 670 | border-radius: 5px; 671 | padding: 1.5rem; 672 | margin: 2rem 0; 673 | } 674 | -------------------------------------------------------------------------------- /notebooks/Training and Testing Simple Recommendation Classifiers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "dd4c3a25-bf29-4236-b89e-f1926867c11f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from sqlalchemy import create_engine\n", 13 | "from urllib.parse import urlparse\n", 14 | "from nltk.corpus import stopwords\n", 15 | "from nltk import tokenize\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from collections import Counter\n", 18 | "from imblearn.over_sampling import RandomOverSampler\n", 19 | "from bs4 import BeautifulSoup\n", 20 | "\n", 21 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 22 | "from sklearn.naive_bayes import ComplementNB\n", 23 | "from sklearn.linear_model import LogisticRegression\n", 24 | "from sklearn.model_selection import RandomizedSearchCV\n", 25 | "from sklearn.pipeline import Pipeline\n", 26 | "from sklearn.svm import SVC\n", 27 | "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", 28 | "\n", 29 | "import joblib\n", 30 | "import os\n", 31 | "import re\n", 32 | "import string\n", 33 | "import html\n", 34 | "\n", 35 | "from pprint import pprint\n", 36 | "from time import time\n", 37 | "from datetime import datetime" 38 | ] 39 | }, 40 | { 41 | "cell_type": 
"code", 42 | "execution_count": null, 43 | "id": "76fca79d-1422-4b82-b973-4aefd64167f4", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import nltk\n", 48 | "nltk.download('punkt_tab')\n", 49 | "nltk.download('stopwords')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "85f91db7-9646-4975-bac6-42f91be92251", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import os\n", 60 | "from dotenv import load_dotenv, dotenv_values \n", 61 | "load_dotenv() " 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "4474b034-a67a-457d-9204-e798e2672469", 67 | "metadata": {}, 68 | "source": [ 69 | "## Priveedly: Training a Simple Content Recommender (Classifier) for Personal Use\n", 70 | "\n", 71 | "This notebook is originally for use with [Priveedly](https://blog.kjamistan.com/priveedly-your-private-and-personal-content-reader-and-recommender.html), a personal use content aggregator system available on [GitHub](https://github.com/kjam/priveedly).\n", 72 | "\n", 73 | "- There is a YouTube video to walk you through the notebook at a high level, in case it is helpful! 
\n", 74 | "- There are some links below to learn more about how to use scikit-learn.\n", 75 | "- I welcome feedback and contributions via GitHub!\n", 76 | "- Most importantly: HAVE FUN playing with ML concepts!\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "b9b5a9fd-64eb-4038-a10e-24af5695aae1", 82 | "metadata": {}, 83 | "source": [ 84 | "# Getting text from Postgres into Pandas" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "64328592-1b29-4cca-a4e3-90dd4c3b0320", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "if os.path.isfile('data/cleaned.csv'):\n", 95 | " print (\"SKIP TO LOADING CLEANED DF!!!\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "df266446-4771-4440-9c4b-8463703c509e", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "engine = create_engine(os.getenv('LOCAL_DB_CONNSTR'))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "268bea0d-9996-4bf4-9ad7-040e15e2dd63", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "sites_df = pd.read_sql(\n", 116 | " \"select title, url, description, site_name, interesting from sites_sitepost WHERE published::date >= '2023-01-01'\", \n", 117 | " con=engine)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "6c072a4f-b809-44c6-b8e8-0ced8610b9fb", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "feeds_df = pd.read_sql(\n", 128 | " \"select feeds_feedentry.title as title, feeds_feedentry.url as url, feeds_feedentry.description as description, feeds_feed.title as site_name, interesting from feeds_feedentry JOIN feeds_feed ON feeds_feed.id = feed_id WHERE published::date >= '2023-01-01'\", \n", 129 | " con=engine)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "71050239-d0f4-49a6-bae0-e7e9fd943dd5", 136 | 
"metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "reddit_df = pd.read_sql(\n", 140 | " \"select sites_redditpost.title as title, sites_redditpost.url as url, sites_redditpost.description as description, sites_subreddit.name as site_name, interesting from sites_redditpost JOIN sites_subreddit ON sites_redditpost.id = sites_subreddit.id WHERE published::date >= '2023-01-01'\", \n", 141 | " con=engine)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "e2ee417e-18b6-4f3a-a31b-64544a81e8cb", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "content_df = pd.concat([reddit_df, sites_df, feeds_df])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "d9729773-70ae-4951-b543-6a050f9d3615", 157 | "metadata": {}, 158 | "source": [ 159 | "# Evaluating target" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "92887ef8-85d8-4ba3-a446-a20e6072ce98", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "content_df.interesting = content_df.interesting.astype(int)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "2a77a6b1-6b09-4b04-8791-de63d7b8678e", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "content_df.interesting.value_counts()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "53a4276c-049e-4c58-bffc-8aaa2e94658b", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "content_df.interesting.value_counts().iloc[0] / content_df.shape[0]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "9965b795-cd98-4d8f-a57e-e44e641a37f8", 196 | "metadata": { 197 | "scrolled": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "content_df.interesting.value_counts().plot.bar()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": 
"088f5ad6-ac5c-49c4-be23-294a466f4e48", 207 | "metadata": {}, 208 | "source": [ 209 | "# Preparing the text data\n", 210 | "\n", 211 | "You'll need to take this code and put it into the priveedly rate_all.py script (see management_commands/rate_all.py) once you are running your pipeline in production. \n", 212 | "\n", 213 | "If you are using non-English languages, you probably want to play around and adjust this preparation to fit what works for you. I would love if you want to contribute any interesting additional notebooks to the repo! :)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "5e72236a-7432-46a6-ad96-f23d12ef071c", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def tokenize_url(url_str):\n", 224 | " parsed_url = urlparse(url_str)\n", 225 | " return parsed_url.netloc, ' '.join(parsed_url.path.split('/')).replace('-', ' '), parsed_url.query.replace('?', ' ').replace('=', ' ')\n", 226 | "\n", 227 | "def prepare_content(pandas_row):\n", 228 | " netloc, path, query = tokenize_url(pandas_row.url)\n", 229 | " return ' '.join([pandas_row.title, pandas_row.description, pandas_row.site_name])\n", 230 | "\n", 231 | "CLEAN_NUMBERS = re.compile('[0-9,\\\\.$\\\\%]+')\n", 232 | "CLEAN_NUMBERS_AND_ONE_LETTER = re.compile('([a-z]\\\\d+)|(\\\\d+[a-z])|(\\\\d+[a-z]\\\\d+)')\n", 233 | "CLEAN_REPEATED_PUNCTUATION = re.compile('[!\\\\-\\\\/:-@-`’–{-~\"“”\\\\[\\\\]]+')\n", 234 | "\n", 235 | "def remove_tags_and_lowercase(text): \n", 236 | " # some parts from https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string\n", 237 | " if BeautifulSoup(text, \"html.parser\").find():\n", 238 | " try:\n", 239 | " soup = BeautifulSoup(text)\n", 240 | " text = soup.get_text()\n", 241 | " except:\n", 242 | " pass\n", 243 | " cleantext = html.unescape(text).encode('unicode_escape').decode('unicode_escape')\n", 244 | " # you can try this line or other similar things if you want to be more 
deliberate about cleaning!\n", 245 | " #cleantext = re.sub(CLEAN_NUMBERS_AND_ONE_LETTER, '', cleantext)\n", 246 | " cleantext = re.sub(CLEAN_NUMBERS, '', cleantext)\n", 247 | " cleantext = re.sub(CLEAN_REPEATED_PUNCTUATION, '', cleantext)\n", 248 | " return cleantext.lower()\n", 249 | "\n", 250 | "removal = set(stopwords.words('english')).union(set(string.punctuation))\n", 251 | "\n", 252 | "def tokenize_content(text):\n", 253 | " return [w for w in tokenize.word_tokenize(remove_tags_and_lowercase(text)) if w.lower() not in removal]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "891c0d6c-dc05-4916-aec7-2b67d9588351", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "content_df['full_text'] = content_df.apply(prepare_content, axis=1)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "15a2b39c-889d-4ed9-a322-a6b86855439f", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "content_df['cleaned_text'] = content_df['full_text'].map(lambda x: ' '.join(tokenize_content(x)))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "3561815b-9f3b-4408-ae84-5c97875ab78d", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "sample = content_df.sample(20)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "45e128c9-ec82-450e-8849-b49c7987b939", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "sample[[\"full_text\", \"cleaned_text\"]]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "8fe81494-6edc-411e-a1a4-4fc931d2ac35", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "content_df.to_csv(\"data/cleaned.csv\")" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "11743eea-ad59-4420-8e9a-641766733b9a", 309 | "metadata": {}, 310 | "source": [ 311 | "### 
Now you can always load this way" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "3cfcdda5-9aa7-4e4e-aa17-7ce74fee0a15", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "content_df = pd.read_csv(\"data/cleaned.csv\")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "id": "cfc424f3-8b7d-4a35-80ab-51baa13f76d1", 327 | "metadata": {}, 328 | "source": [ 329 | "### Dealing with class imbalance\n", 330 | "\n", 331 | "My classes are really lopsided. Yours might be different! If you notice that yours are more even, you can use the orig_X_train as the X_train (and so forth!).\n", 332 | "\n", 333 | "To help with my lopsided classes, I will use [Imbalanced Learn](https://imbalanced-learn.org/)." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "id": "7d510c49-ea58-4874-b8db-ba0768297d01", 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "oversampler = RandomOverSampler(sampling_strategy=0.15)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "10ed0d75-f9de-4c69-a0b3-77a92031a859", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "orig_X_train, orig_X_test, orig_y_train, orig_y_test = train_test_split(content_df.cleaned_text, content_df.interesting, \n", 354 | " test_size=0.3, stratify=content_df.interesting)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "a7326c30-44b2-4f0f-bd5a-d93d88cd2f5f", 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "Counter(orig_y_train), Counter(orig_y_test)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "48db71c5-80b9-417e-b37e-6c579cf3a120", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "X_res, y_res = oversampler.fit_resample(content_df[[\"cleaned_text\"]].to_numpy(), content_df.interesting.to_numpy())" 375 | ] 
376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "76509404-152d-46c1-b60f-2fe0b4371b63", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "Counter(y_res)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "id": "27a9045e-1490-4a8b-bd3e-7594943fb96b", 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "X_train, X_test, y_train, y_test = train_test_split(X_res.flatten(), y_res, test_size=0.3)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "id": "ec4ee16b-f4da-408f-93c7-3b629d815a8a", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "Counter(y_train), Counter(y_test)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "id": "534aedb3-03ba-4ef7-85c3-a397a5013245", 410 | "metadata": {}, 411 | "source": [ 412 | "### Let's build some NLP pipelines with Scikit-learn!\n", 413 | "\n", 414 | "Scikit-learn is a great library for building machine learning models, especially with smaller personalized datasets, like this one! It has everything you need to get started and a great learning community and documentation.\n", 415 | "\n", 416 | "Want to learn more about scikit-learn and different machine learning models? Check out:\n", 417 | "\n", 418 | "- [Scikit-learn crash course](https://www.youtube.com/watch?v=0B5eIE_1vpU)\n", 419 | "- [Scikit-learn online learning course](https://inria.github.io/scikit-learn-mooc/)\n", 420 | "- [Calmcode](https://calmcode.io)\n", 421 | "- [probabl's YouTube Channel (some advanced topics)](https://www.youtube.com/@probabl_ai)\n", 422 | "\n", 423 | "Hat tip to [Vincent](https://github.com/koaning) for helping me assemble these resources!" 
424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "id": "1dd58cbb-3972-4506-b3e5-b2c5211f0325", 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "svc_pipeline = Pipeline(\n", 434 | " [\n", 435 | " (\"vect\", TfidfVectorizer()),\n", 436 | " (\"clf\", SVC()), # more complex, but maybe not worth it\n", 437 | " ]\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "id": "6eeab15f-a95f-4a86-aa7e-a9c1d506d2fd", 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "bayes_pipeline = Pipeline(\n", 449 | " [\n", 450 | " (\"vect\", TfidfVectorizer()),\n", 451 | " (\"clf\", ComplementNB()), # better at imbalance\n", 452 | " ]\n", 453 | ")" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "af29b2b5-e681-42a6-a34d-7ec4befed5e1", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "logreg_pipeline = Pipeline(\n", 464 | " [\n", 465 | " (\"vect\", TfidfVectorizer()),\n", 466 | " (\"clf\", LogisticRegression()), # simple, but maybe good enough\n", 467 | " ]\n", 468 | ")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "7d4a6091-1e7a-49d0-8bb0-360db18603e8", 474 | "metadata": {}, 475 | "source": [ 476 | "For looking up parameters to test, take a look at the following:\n", 477 | "\n", 478 | "- [TF-IDF Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)\n", 479 | "- [SVC Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)\n", 480 | "- [Complement Naive Bayes Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html)\n", 481 | "- [LogisticRegression Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 
487 | "id": "4b95d398-7f4c-47c2-a8d6-a3f271bae949", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "base_parameter_grid = {\n", 492 | " \"vect__max_df\": (0.8, 0.9),\n", 493 | " \"vect__min_df\": (0.01, 0.03),\n", 494 | " \"vect__ngram_range\": ((1, 1), (1, 2)), # unigrams or bigrams\n", 495 | " #\"vect__norm\": (\"l1\", \"l2\"),\n", 496 | "}" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "id": "c814383d-aea9-438b-8795-d97c307cb6f5", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "svc_parameter_grid = {\n", 507 | " \"clf__C\": (1, 10), # inverse of regularization strength (smaller = more regularization)\n", 508 | " \"clf__kernel\": ('rbf', 'sigmoid', 'poly') \n", 509 | "}\n" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "id": "857aaca3-bdbf-4382-a723-c87fde3238cf", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "cnb_parameter_grid = {\n", 520 | " \"clf__alpha\": np.logspace(-6, 6, 13), # Additive (Laplace/Lidstone) smoothing parameter \n", 521 | "}" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "id": "3fce5e76-c849-4d51-8687-3aaa5d0f837a", 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "logreg_parameter_grid = {\n", 532 | " \"clf__C\": (1, 10), # inverse of regularization strength (smaller = more regularization)\n", 533 | " \"clf__solver\": (\"lbfgs\", \"liblinear\", \"newton-cholesky\"), \n", 534 | "}" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "id": "5d4859df-26c3-441d-bca4-3d2cc7bb659a", 540 | "metadata": {}, 541 | "source": [ 542 | "### Start by testing each model separately\n", 543 | "\n", 544 | "You can eventually productionize this with Weights and Biases, or just find the type of model that works best for your data and stick with that, updating only the training dataset over time. 
\n", 545 | "\n", 546 | "After you get your first model or two working, you likely also decide: oh I really only want to test SVC or I like having a fast LR model. Or even, I want to compare these simple models with a deep learning model or a local LLM.\n", 547 | "\n", 548 | "To test each one, change the lines below to reflect your changes:\n", 549 | "\n", 550 | "- use the parameter grid you set up above\n", 551 | "- change the model_name to something you will remember\n", 552 | "- change the estimator to the pipeline that you are evaluating" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "id": "1a6ca6a9-a428-4ed1-9f24-7fc2cc8e49ec", 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "parameter_grid = base_parameter_grid.copy()\n", 563 | "parameter_grid.update(logreg_parameter_grid) #CHANGE HERE: logreg_parameter_grid, cnb_parameter_grid, svc_parameter_grid\n", 564 | "model_name = \"LR\" # CHANGE HERE suggestion: LR, CNB, SVC\n", 565 | "\n", 566 | "random_search = RandomizedSearchCV(\n", 567 | " estimator=logreg_pipeline, # CHANGE HERE: logreg_pipeline, bayes_pipeline, svc_pipeline\n", 568 | " param_distributions=parameter_grid,\n", 569 | " n_iter=20,\n", 570 | " random_state=0,\n", 571 | " n_jobs=4,\n", 572 | " verbose=1,\n", 573 | ")\n", 574 | "\n", 575 | "print(\"Performing grid search...\")\n", 576 | "print(\"Hyperparameters to be evaluated:\")\n", 577 | "pprint(parameter_grid)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "id": "36a93f84-bd2c-4ee0-af08-b78a4418c3c7", 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "t0 = time()\n", 588 | "random_search.fit(X_train, y_train)\n", 589 | "print(f\"Done in {time() - t0:.3f}s\")" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "id": "5fe22603-c6d2-4ce6-903a-d07d5f28aa5f", 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "print(\"Best 
parameters combination found:\")\n", 600 | "best_parameters = random_search.best_estimator_.get_params()\n", 601 | "for param_name in sorted(parameter_grid.keys()):\n", 602 | " print(f\"{param_name}: {best_parameters[param_name]}\")" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "id": "a9d892e1-e3e0-4f56-89ee-590f74b39360", 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "test_accuracy = random_search.score(X_test, y_test)\n", 613 | "print(f\"Accuracy of the best parameters using CV random search: {random_search.best_score_:.3f}\")\n", 614 | "print(f\"Accuracy on test set: {test_accuracy:.3f}\")" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "id": "d4eb3062-c050-4a3e-9188-fc489900cdcd", 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "y_pred = random_search.predict(X_test)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "id": "122f7dce-4564-45af-8607-9e4447222f16", 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "human_labels = {0: 'not interesting',\n", 635 | " 1: 'interesting'}" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "id": "50146191-cdb6-48ae-9af6-1ee33cde685f", 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred), display_labels=[human_labels[c] for c in random_search.classes_])\n", 646 | "disp.plot()" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "id": "ffb04625-450f-40f0-9903-8d5412636c1c", 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "experiment_time = datetime.now().strftime(\"%Y%m%d_%H_%M\")\n", 657 | "with open(\"experiments/{}_{}.txt\".format(experiment_time, model_name), 'w') as documentation_file:\n", 658 | " for param_name in sorted(parameter_grid.keys()):\n", 659 
| " documentation_file.write(f\"{param_name}: {best_parameters[param_name]}\")\n", 660 | " documentation_file.write(f\"Accuracy on the random search: {random_search.best_score_:.3f}\")\n", 661 | " documentation_file.write(f\"Accuracy on test set: {test_accuracy:.3f}\") " 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "id": "e9465786-1310-4c06-ad81-db68a871e7a9", 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "logreg_pipeline.set_params(**best_parameters) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "id": "e7483cb0-871b-4212-934d-212ac7afb6ab", 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "joblib.dump(logreg_pipeline, \"experiments/models/{}_{}_pipeline.pkl\".format(experiment_time, model_name)) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "id": "3f6eff3d-2c9c-4745-a816-af60b65a695f", 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "pipeline = logreg_pipeline # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "id": "9b00371a-310d-4fe3-b647-e0c0dc178f80", 697 | "metadata": {}, 698 | "source": [ 699 | "If you ever want to load again, you can just:\n" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "08714b49-dd0b-472f-a306-cb5025c5d7b2", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "pipeline = joblib.load('experiments/models/20250121_19_46_SVC_pipeline.pkl')" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "id": "51ff4d4f-6141-4763-909e-ccf416836f33", 715 | "metadata": {}, 716 | "source": [ 717 | "### Investigating / interpreting your model\n", 718 | "\n", 719 | "So now you have an idea of the accuracy, but 
will it work for what you want to use it for? \n", 720 | "\n", 721 | "Let's say that it's really good at recognizing exactly your interests based on some silly keywords that you don't think will hold in practice. Or let's say you're also just curious about what keywords might be most interesting to you and want to have a look at the inner workings of your system. Either way, it's a good idea to investigate the model in order to qualitatively compare the models you've trained and determine which model you want to use.\n", 722 | "\n", 723 | "The following parts of the notebook can help you investigate and figure out how you think about the model decisions.\n", 724 | "\n", 725 | "#### Note: LIME Text Explainer doesn't appear to work for my data with SVC; but that might be different for you ! Let me know if it does!" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "id": "3447d5f4-c393-429b-90a7-c084a67cd742", 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "from lime.lime_text import LimeTextExplainer\n", 736 | "\n", 737 | "\n", 738 | "explainer = LimeTextExplainer(class_names=[human_labels[c] for c in pipeline.classes_])" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "id": "e6f0ffa8-06e2-4523-ad86-335cc1376289", 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [ 748 | "sample_df = content_df.groupby(\"interesting\").sample(n=20)" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "id": "7c8b4111-1a40-4379-a968-d9e572b9f617", 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "pipeline.named_steps" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "id": "3e6fa799-82d4-4e07-a55c-a0ae249ffada", 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "vectorizer = pipeline.named_steps['vect']\n", 769 | "estimator = pipeline.named_steps['clf']" 770 | ] 771 | 
}, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "706c5bc4-333f-4042-85b5-49efd602dad0", 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "# this is a fix for the SVC problem in LIME (see https://github.com/marcotcr/lime/issues/465)\n", 780 | "def classifier_fn(X):\n", 781 | " vectorized_text_instance = vectorizer.transform(X)\n", 782 | " decision = estimator.decision_function(vectorized_text_instance)\n", 783 | " reshaped_decision = np.array(decision).reshape(-1, 1)\n", 784 | " return reshaped_decision" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "id": "ce7c92fd-a7e8-41c1-bfc6-637e6ab47684", 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "for example in sample_df.cleaned_text: \n", 795 | " try:\n", 796 | " if hasattr(pipeline, 'predict_proba'):\n", 797 | " exp = explainer.explain_instance(example, pipeline.predict_proba, labels=pipeline.classes_) \n", 798 | " elif \"SVC\" in str(estimator): # this is hacky :(\n", 799 | " exp = explainer.explain_instance(text_instance=example, classifier_fn=classifier_fn, labels=(0,))\n", 800 | " exp.show_in_notebook()\n", 801 | " except Exception as e:\n", 802 | " print(e)\n", 803 | " print('problem with this example')" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "id": "2fc8267f-59c0-496c-b8cc-3d6f5c5c383d", 810 | "metadata": { 811 | "scrolled": true 812 | }, 813 | "outputs": [], 814 | "source": [ 815 | "from sklearn.inspection import permutation_importance\n", 816 | "\n", 817 | "if hasattr(estimator, 'feature_log_prob_'): # bayesian\n", 818 | " neg_class_prob_sorted = estimator.feature_log_prob_[0, :].argsort()[::-1]\n", 819 | " pos_class_prob_sorted = estimator.feature_log_prob_[1, :].argsort()[::-1]\n", 820 | "elif hasattr(estimator, 'coef_'): # logreg\n", 821 | " pos_class_prob_sorted = estimator.coef_[0, :].argsort()[::-1]\n", 822 | " neg_class_prob_sorted = 
estimator.coef_[0, :].argsort()\n", 823 | "elif hasattr(estimator, 'kernel'): # svm\n", 824 | " X = vectorizer.transform(X_train).toarray() # this is inefficient and it might run out of memory or timeout :(\n", 825 | " # if this happens restart kernel and don't rerun \n", 826 | " perm_importance = permutation_importance(estimator, X, y_train)\n", 827 | " pos_class_prob_sorted = perm_importance.importances_mean.argsort()\n", 828 | " neg_class_prob_sorted = perm_importance.importances_mean.argsort()[::-1]\n", 829 | "\n", 830 | "\n", 831 | "feature_names = vectorizer.get_feature_names_out()\n", 832 | "\n", 833 | "print(np.take(feature_names, neg_class_prob_sorted[:100]))\n", 834 | "print(np.take(feature_names, pos_class_prob_sorted[:100]))\n" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": null, 840 | "id": "d35c814d-2212-4987-ae82-a5cf6d5a815c", 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "def find_word_rank(query):\n", 845 | " i, = np.where(feature_names == query)\n", 846 | " try:\n", 847 | " pos_i = np.where(pos_class_prob_sorted == i)\n", 848 | " neg_i = np.where(neg_class_prob_sorted == i)\n", 849 | " if pos_i < neg_i:\n", 850 | " print(\"ranked in positive score at position #{} out of {}\".format(pos_i[0][0], pos_class_prob_sorted.shape[0]))\n", 851 | " else:\n", 852 | " print(\"ranked in negative score at position #{} out of {}\".format(neg_i[0][0], neg_class_prob_sorted.shape[0]))\n", 853 | " except ValueError:\n", 854 | " print('token not found')\n" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "77eea602-ed31-4031-a3cb-11c37961b699", 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "find_word_rank(\"crypto\")" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "id": "fa3c69a0-60f4-43cd-8de6-cb6a4be4c037", 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "find_word_rank(\"cryptography\")" 
875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "id": "c8ee4a34-3cb1-4de2-8ac4-0d0368ac6d41", 880 | "metadata": {}, 881 | "source": [ 882 | "### If this is the main one you want to use, store it as pipeline.pkl and upload it to your server :)" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": null, 888 | "id": "650d779a-04bf-4da5-a41e-e4f4b1dc66a9", 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "joblib.dump(pipeline, \"pipeline.pkl\")" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": null, 898 | "id": "ca277943-8f82-493b-801c-0a2bedc38d4c", 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [] 902 | } 903 | ], 904 | "metadata": { 905 | "kernelspec": { 906 | "display_name": "Python 3 (ipykernel)", 907 | "language": "python", 908 | "name": "python3" 909 | }, 910 | "language_info": { 911 | "codemirror_mode": { 912 | "name": "ipython", 913 | "version": 3 914 | }, 915 | "file_extension": ".py", 916 | "mimetype": "text/x-python", 917 | "name": "python", 918 | "nbconvert_exporter": "python", 919 | "pygments_lexer": "ipython3", 920 | "version": "3.12.4" 921 | } 922 | }, 923 | "nbformat": 4, 924 | "nbformat_minor": 5 925 | } 926 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # GNU GENERAL PUBLIC LICENSE 2 | 3 | Version 3, 29 June 2007 4 | 5 | Copyright (C) 2007 Free Software Foundation, Inc. 6 | 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | ## Preamble 12 | 13 | The GNU General Public License is a free, copyleft license for 14 | software and other kinds of works. 15 | 16 | The licenses for most software and other practical works are designed 17 | to take away your freedom to share and change the works. 
By contrast, 18 | the GNU General Public License is intended to guarantee your freedom 19 | to share and change all versions of a program--to make sure it remains 20 | free software for all its users. We, the Free Software Foundation, use 21 | the GNU General Public License for most of our software; it applies 22 | also to any other work released this way by its authors. You can apply 23 | it to your programs, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | them if you wish), that you receive source code or can get it if you 29 | want it, that you can change the software or use pieces of it in new 30 | free programs, and that you know you can do these things. 31 | 32 | To protect your rights, we need to prevent others from denying you 33 | these rights or asking you to surrender the rights. Therefore, you 34 | have certain responsibilities if you distribute copies of the 35 | software, or if you modify it: responsibilities to respect the freedom 36 | of others. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must pass on to the recipients the same 40 | freedoms that you received. You must make sure that they, too, receive 41 | or can get the source code. And you must show them these terms so they 42 | know their rights. 43 | 44 | Developers that use the GNU GPL protect your rights with two steps: 45 | (1) assert copyright on the software, and (2) offer you this License 46 | giving you legal permission to copy, distribute and/or modify it. 47 | 48 | For the developers' and authors' protection, the GPL clearly explains 49 | that there is no warranty for this free software. 
For both users' and 50 | authors' sake, the GPL requires that modified versions be marked as 51 | changed, so that their problems will not be attributed erroneously to 52 | authors of previous versions. 53 | 54 | Some devices are designed to deny users access to install or run 55 | modified versions of the software inside them, although the 56 | manufacturer can do so. This is fundamentally incompatible with the 57 | aim of protecting users' freedom to change the software. The 58 | systematic pattern of such abuse occurs in the area of products for 59 | individuals to use, which is precisely where it is most unacceptable. 60 | Therefore, we have designed this version of the GPL to prohibit the 61 | practice for those products. If such problems arise substantially in 62 | other domains, we stand ready to extend this provision to those 63 | domains in future versions of the GPL, as needed to protect the 64 | freedom of users. 65 | 66 | Finally, every program is threatened constantly by software patents. 67 | States should not allow patents to restrict development and use of 68 | software on general-purpose computers, but in those that do, we wish 69 | to avoid the special danger that patents applied to a free program 70 | could make it effectively proprietary. To prevent this, the GPL 71 | assures that patents cannot be used to render the program non-free. 72 | 73 | The precise terms and conditions for copying, distribution and 74 | modification follow. 75 | 76 | ## TERMS AND CONDITIONS 77 | 78 | ### 0. Definitions. 79 | 80 | "This License" refers to version 3 of the GNU General Public License. 81 | 82 | "Copyright" also means copyright-like laws that apply to other kinds 83 | of works, such as semiconductor masks. 84 | 85 | "The Program" refers to any copyrightable work licensed under this 86 | License. Each licensee is addressed as "you". "Licensees" and 87 | "recipients" may be individuals or organizations. 
88 | 89 | To "modify" a work means to copy from or adapt all or part of the work 90 | in a fashion requiring copyright permission, other than the making of 91 | an exact copy. The resulting work is called a "modified version" of 92 | the earlier work or a work "based on" the earlier work. 93 | 94 | A "covered work" means either the unmodified Program or a work based 95 | on the Program. 96 | 97 | To "propagate" a work means to do anything with it that, without 98 | permission, would make you directly or secondarily liable for 99 | infringement under applicable copyright law, except executing it on a 100 | computer or modifying a private copy. Propagation includes copying, 101 | distribution (with or without modification), making available to the 102 | public, and in some countries other activities as well. 103 | 104 | To "convey" a work means any kind of propagation that enables other 105 | parties to make or receive copies. Mere interaction with a user 106 | through a computer network, with no transfer of a copy, is not 107 | conveying. 108 | 109 | An interactive user interface displays "Appropriate Legal Notices" to 110 | the extent that it includes a convenient and prominently visible 111 | feature that (1) displays an appropriate copyright notice, and (2) 112 | tells the user that there is no warranty for the work (except to the 113 | extent that warranties are provided), that licensees may convey the 114 | work under this License, and how to view a copy of this License. If 115 | the interface presents a list of user commands or options, such as a 116 | menu, a prominent item in the list meets this criterion. 117 | 118 | ### 1. Source Code. 119 | 120 | The "source code" for a work means the preferred form of the work for 121 | making modifications to it. "Object code" means any non-source form of 122 | a work. 
123 | 124 | A "Standard Interface" means an interface that either is an official 125 | standard defined by a recognized standards body, or, in the case of 126 | interfaces specified for a particular programming language, one that 127 | is widely used among developers working in that language. 128 | 129 | The "System Libraries" of an executable work include anything, other 130 | than the work as a whole, that (a) is included in the normal form of 131 | packaging a Major Component, but which is not part of that Major 132 | Component, and (b) serves only to enable use of the work with that 133 | Major Component, or to implement a Standard Interface for which an 134 | implementation is available to the public in source code form. A 135 | "Major Component", in this context, means a major essential component 136 | (kernel, window system, and so on) of the specific operating system 137 | (if any) on which the executable work runs, or a compiler used to 138 | produce the work, or an object code interpreter used to run it. 139 | 140 | The "Corresponding Source" for a work in object code form means all 141 | the source code needed to generate, install, and (for an executable 142 | work) run the object code and to modify the work, including scripts to 143 | control those activities. However, it does not include the work's 144 | System Libraries, or general-purpose tools or generally available free 145 | programs which are used unmodified in performing those activities but 146 | which are not part of the work. For example, Corresponding Source 147 | includes interface definition files associated with source files for 148 | the work, and the source code for shared libraries and dynamically 149 | linked subprograms that the work is specifically designed to require, 150 | such as by intimate data communication or control flow between those 151 | subprograms and other parts of the work. 
152 | 153 | The Corresponding Source need not include anything that users can 154 | regenerate automatically from other parts of the Corresponding Source. 155 | 156 | The Corresponding Source for a work in source code form is that same 157 | work. 158 | 159 | ### 2. Basic Permissions. 160 | 161 | All rights granted under this License are granted for the term of 162 | copyright on the Program, and are irrevocable provided the stated 163 | conditions are met. This License explicitly affirms your unlimited 164 | permission to run the unmodified Program. The output from running a 165 | covered work is covered by this License only if the output, given its 166 | content, constitutes a covered work. This License acknowledges your 167 | rights of fair use or other equivalent, as provided by copyright law. 168 | 169 | You may make, run and propagate covered works that you do not convey, 170 | without conditions so long as your license otherwise remains in force. 171 | You may convey covered works to others for the sole purpose of having 172 | them make modifications exclusively for you, or provide you with 173 | facilities for running those works, provided that you comply with the 174 | terms of this License in conveying all material for which you do not 175 | control copyright. Those thus making or running the covered works for 176 | you must do so exclusively on your behalf, under your direction and 177 | control, on terms that prohibit them from making any copies of your 178 | copyrighted material outside their relationship with you. 179 | 180 | Conveying under any other circumstances is permitted solely under the 181 | conditions stated below. Sublicensing is not allowed; section 10 makes 182 | it unnecessary. 183 | 184 | ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
185 | 186 | No covered work shall be deemed part of an effective technological 187 | measure under any applicable law fulfilling obligations under article 188 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 189 | similar laws prohibiting or restricting circumvention of such 190 | measures. 191 | 192 | When you convey a covered work, you waive any legal power to forbid 193 | circumvention of technological measures to the extent such 194 | circumvention is effected by exercising rights under this License with 195 | respect to the covered work, and you disclaim any intention to limit 196 | operation or modification of the work as a means of enforcing, against 197 | the work's users, your or third parties' legal rights to forbid 198 | circumvention of technological measures. 199 | 200 | ### 4. Conveying Verbatim Copies. 201 | 202 | You may convey verbatim copies of the Program's source code as you 203 | receive it, in any medium, provided that you conspicuously and 204 | appropriately publish on each copy an appropriate copyright notice; 205 | keep intact all notices stating that this License and any 206 | non-permissive terms added in accord with section 7 apply to the code; 207 | keep intact all notices of the absence of any warranty; and give all 208 | recipients a copy of this License along with the Program. 209 | 210 | You may charge any price or no price for each copy that you convey, 211 | and you may offer support or warranty protection for a fee. 212 | 213 | ### 5. Conveying Modified Source Versions. 214 | 215 | You may convey a work based on the Program, or the modifications to 216 | produce it from the Program, in the form of source code under the 217 | terms of section 4, provided that you also meet all of these 218 | conditions: 219 | 220 | - a) The work must carry prominent notices stating that you modified 221 | it, and giving a relevant date. 
222 | - b) The work must carry prominent notices stating that it is 223 | released under this License and any conditions added under 224 | section 7. This requirement modifies the requirement in section 4 225 | to "keep intact all notices". 226 | - c) You must license the entire work, as a whole, under this 227 | License to anyone who comes into possession of a copy. This 228 | License will therefore apply, along with any applicable section 7 229 | additional terms, to the whole of the work, and all its parts, 230 | regardless of how they are packaged. This License gives no 231 | permission to license the work in any other way, but it does not 232 | invalidate such permission if you have separately received it. 233 | - d) If the work has interactive user interfaces, each must display 234 | Appropriate Legal Notices; however, if the Program has interactive 235 | interfaces that do not display Appropriate Legal Notices, your 236 | work need not make them do so. 237 | 238 | A compilation of a covered work with other separate and independent 239 | works, which are not by their nature extensions of the covered work, 240 | and which are not combined with it such as to form a larger program, 241 | in or on a volume of a storage or distribution medium, is called an 242 | "aggregate" if the compilation and its resulting copyright are not 243 | used to limit the access or legal rights of the compilation's users 244 | beyond what the individual works permit. Inclusion of a covered work 245 | in an aggregate does not cause this License to apply to the other 246 | parts of the aggregate. 247 | 248 | ### 6. Conveying Non-Source Forms. 
249 | 250 | You may convey a covered work in object code form under the terms of 251 | sections 4 and 5, provided that you also convey the machine-readable 252 | Corresponding Source under the terms of this License, in one of these 253 | ways: 254 | 255 | - a) Convey the object code in, or embodied in, a physical product 256 | (including a physical distribution medium), accompanied by the 257 | Corresponding Source fixed on a durable physical medium 258 | customarily used for software interchange. 259 | - b) Convey the object code in, or embodied in, a physical product 260 | (including a physical distribution medium), accompanied by a 261 | written offer, valid for at least three years and valid for as 262 | long as you offer spare parts or customer support for that product 263 | model, to give anyone who possesses the object code either (1) a 264 | copy of the Corresponding Source for all the software in the 265 | product that is covered by this License, on a durable physical 266 | medium customarily used for software interchange, for a price no 267 | more than your reasonable cost of physically performing this 268 | conveying of source, or (2) access to copy the Corresponding 269 | Source from a network server at no charge. 270 | - c) Convey individual copies of the object code with a copy of the 271 | written offer to provide the Corresponding Source. This 272 | alternative is allowed only occasionally and noncommercially, and 273 | only if you received the object code with such an offer, in accord 274 | with subsection 6b. 275 | - d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | - e) Convey the object code using peer-to-peer transmission, 288 | provided you inform other peers where the object code and 289 | Corresponding Source of the work are being offered to the general 290 | public at no charge under subsection 6d. 291 | 292 | A separable portion of the object code, whose source code is excluded 293 | from the Corresponding Source as a System Library, need not be 294 | included in conveying the object code work. 295 | 296 | A "User Product" is either (1) a "consumer product", which means any 297 | tangible personal property which is normally used for personal, 298 | family, or household purposes, or (2) anything designed or sold for 299 | incorporation into a dwelling. In determining whether a product is a 300 | consumer product, doubtful cases shall be resolved in favor of 301 | coverage. For a particular product received by a particular user, 302 | "normally used" refers to a typical or common use of that class of 303 | product, regardless of the status of the particular user or of the way 304 | in which the particular user actually uses, or expects or is expected 305 | to use, the product. A product is a consumer product regardless of 306 | whether the product has substantial commercial, industrial or 307 | non-consumer uses, unless such uses represent the only significant 308 | mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to 312 | install and execute modified versions of a covered work in that User 313 | Product from a modified version of its Corresponding Source. The 314 | information must suffice to ensure that the continued functioning of 315 | the modified object code is in no case prevented or interfered with 316 | solely because modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or 331 | updates for a work that has been modified or installed by the 332 | recipient, or for the User Product in which it has been modified or 333 | installed. Access to a network may be denied when the modification 334 | itself materially and adversely affects the operation of the network 335 | or violates the rules and protocols for communication across the 336 | network. 
337 | 338 | Corresponding Source conveyed, and Installation Information provided, 339 | in accord with this section must be in a format that is publicly 340 | documented (and with an implementation available to the public in 341 | source code form), and must require no special password or key for 342 | unpacking, reading or copying. 343 | 344 | ### 7. Additional Terms. 345 | 346 | "Additional permissions" are terms that supplement the terms of this 347 | License by making exceptions from one or more of its conditions. 348 | Additional permissions that are applicable to the entire Program shall 349 | be treated as though they were included in this License, to the extent 350 | that they are valid under applicable law. If additional permissions 351 | apply only to part of the Program, that part may be used separately 352 | under those permissions, but the entire Program remains governed by 353 | this License without regard to the additional permissions. 354 | 355 | When you convey a copy of a covered work, you may at your option 356 | remove any additional permissions from that copy, or from any part of 357 | it. (Additional permissions may be written to require their own 358 | removal in certain cases when you modify the work.) You may place 359 | additional permissions on material, added by you to a covered work, 360 | for which you have or can give appropriate copyright permission. 
361 | 362 | Notwithstanding any other provision of this License, for material you 363 | add to a covered work, you may (if authorized by the copyright holders 364 | of that material) supplement the terms of this License with terms: 365 | 366 | - a) Disclaiming warranty or limiting liability differently from the 367 | terms of sections 15 and 16 of this License; or 368 | - b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | - c) Prohibiting misrepresentation of the origin of that material, 372 | or requiring that modified versions of such material be marked in 373 | reasonable ways as different from the original version; or 374 | - d) Limiting the use for publicity purposes of names of licensors 375 | or authors of the material; or 376 | - e) Declining to grant rights under trademark law for use of some 377 | trade names, trademarks, or service marks; or 378 | - f) Requiring indemnification of licensors and authors of that 379 | material by anyone who conveys the material (or modified versions 380 | of it) with contractual assumptions of liability to the recipient, 381 | for any liability that these contractual assumptions directly 382 | impose on those licensors and authors. 383 | 384 | All other non-permissive additional terms are considered "further 385 | restrictions" within the meaning of section 10. If the Program as you 386 | received it, or any part of it, contains a notice stating that it is 387 | governed by this License along with a term that is a further 388 | restriction, you may remove that term. If a license document contains 389 | a further restriction but permits relicensing or conveying under this 390 | License, you may add to a covered work material governed by the terms 391 | of that license document, provided that the further restriction does 392 | not survive such relicensing or conveying. 
393 | 394 | If you add terms to a covered work in accord with this section, you 395 | must place, in the relevant source files, a statement of the 396 | additional terms that apply to those files, or a notice indicating 397 | where to find the applicable terms. 398 | 399 | Additional terms, permissive or non-permissive, may be stated in the 400 | form of a separately written license, or stated as exceptions; the 401 | above requirements apply either way. 402 | 403 | ### 8. Termination. 404 | 405 | You may not propagate or modify a covered work except as expressly 406 | provided under this License. Any attempt otherwise to propagate or 407 | modify it is void, and will automatically terminate your rights under 408 | this License (including any patent licenses granted under the third 409 | paragraph of section 11). 410 | 411 | However, if you cease all violation of this License, then your license 412 | from a particular copyright holder is reinstated (a) provisionally, 413 | unless and until the copyright holder explicitly and finally 414 | terminates your license, and (b) permanently, if the copyright holder 415 | fails to notify you of the violation by some reasonable means prior to 416 | 60 days after the cessation. 417 | 418 | Moreover, your license from a particular copyright holder is 419 | reinstated permanently if the copyright holder notifies you of the 420 | violation by some reasonable means, this is the first time you have 421 | received notice of violation of this License (for any work) from that 422 | copyright holder, and you cure the violation prior to 30 days after 423 | your receipt of the notice. 424 | 425 | Termination of your rights under this section does not terminate the 426 | licenses of parties who have received copies or rights from you under 427 | this License. If your rights have been terminated and not permanently 428 | reinstated, you do not qualify to receive new licenses for the same 429 | material under section 10. 430 | 431 | ### 9. 
Acceptance Not Required for Having Copies. 432 | 433 | You are not required to accept this License in order to receive or run 434 | a copy of the Program. Ancillary propagation of a covered work 435 | occurring solely as a consequence of using peer-to-peer transmission 436 | to receive a copy likewise does not require acceptance. However, 437 | nothing other than this License grants you permission to propagate or 438 | modify any covered work. These actions infringe copyright if you do 439 | not accept this License. Therefore, by modifying or propagating a 440 | covered work, you indicate your acceptance of this License to do so. 441 | 442 | ### 10. Automatic Licensing of Downstream Recipients. 443 | 444 | Each time you convey a covered work, the recipient automatically 445 | receives a license from the original licensors, to run, modify and 446 | propagate that work, subject to this License. You are not responsible 447 | for enforcing compliance by third parties with this License. 448 | 449 | An "entity transaction" is a transaction transferring control of an 450 | organization, or substantially all assets of one, or subdividing an 451 | organization, or merging organizations. If propagation of a covered 452 | work results from an entity transaction, each party to that 453 | transaction who receives a copy of the work also receives whatever 454 | licenses to the work the party's predecessor in interest had or could 455 | give under the previous paragraph, plus a right to possession of the 456 | Corresponding Source of the work from the predecessor in interest, if 457 | the predecessor has it or can get it with reasonable efforts. 458 | 459 | You may not impose any further restrictions on the exercise of the 460 | rights granted or affirmed under this License. 
For example, you may 461 | not impose a license fee, royalty, or other charge for exercise of 462 | rights granted under this License, and you may not initiate litigation 463 | (including a cross-claim or counterclaim in a lawsuit) alleging that 464 | any patent claim is infringed by making, using, selling, offering for 465 | sale, or importing the Program or any portion of it. 466 | 467 | ### 11. Patents. 468 | 469 | A "contributor" is a copyright holder who authorizes use under this 470 | License of the Program or a work on which the Program is based. The 471 | work thus licensed is called the contributor's "contributor version". 472 | 473 | A contributor's "essential patent claims" are all patent claims owned 474 | or controlled by the contributor, whether already acquired or 475 | hereafter acquired, that would be infringed by some manner, permitted 476 | by this License, of making, using, or selling its contributor version, 477 | but do not include claims that would be infringed only as a 478 | consequence of further modification of the contributor version. For 479 | purposes of this definition, "control" includes the right to grant 480 | patent sublicenses in a manner consistent with the requirements of 481 | this License. 482 | 483 | Each contributor grants you a non-exclusive, worldwide, royalty-free 484 | patent license under the contributor's essential patent claims, to 485 | make, use, sell, offer for sale, import and otherwise run, modify and 486 | propagate the contents of its contributor version. 487 | 488 | In the following three paragraphs, a "patent license" is any express 489 | agreement or commitment, however denominated, not to enforce a patent 490 | (such as an express permission to practice a patent or covenant not to 491 | sue for patent infringement). To "grant" such a patent license to a 492 | party means to make such an agreement or commitment not to enforce a 493 | patent against the party. 
494 | 495 | If you convey a covered work, knowingly relying on a patent license, 496 | and the Corresponding Source of the work is not available for anyone 497 | to copy, free of charge and under the terms of this License, through a 498 | publicly available network server or other readily accessible means, 499 | then you must either (1) cause the Corresponding Source to be so 500 | available, or (2) arrange to deprive yourself of the benefit of the 501 | patent license for this particular work, or (3) arrange, in a manner 502 | consistent with the requirements of this License, to extend the patent 503 | license to downstream recipients. "Knowingly relying" means you have 504 | actual knowledge that, but for the patent license, your conveying the 505 | covered work in a country, or your recipient's use of the covered work 506 | in a country, would infringe one or more identifiable patents in that 507 | country that you have reason to believe are valid. 508 | 509 | If, pursuant to or in connection with a single transaction or 510 | arrangement, you convey, or propagate by procuring conveyance of, a 511 | covered work, and grant a patent license to some of the parties 512 | receiving the covered work authorizing them to use, propagate, modify 513 | or convey a specific copy of the covered work, then the patent license 514 | you grant is automatically extended to all recipients of the covered 515 | work and works based on it. 516 | 517 | A patent license is "discriminatory" if it does not include within the 518 | scope of its coverage, prohibits the exercise of, or is conditioned on 519 | the non-exercise of one or more of the rights that are specifically 520 | granted under this License. 
You may not convey a covered work if you 521 | are a party to an arrangement with a third party that is in the 522 | business of distributing software, under which you make payment to the 523 | third party based on the extent of your activity of conveying the 524 | work, and under which the third party grants, to any of the parties 525 | who would receive the covered work from you, a discriminatory patent 526 | license (a) in connection with copies of the covered work conveyed by 527 | you (or copies made from those copies), or (b) primarily for and in 528 | connection with specific products or compilations that contain the 529 | covered work, unless you entered into that arrangement, or that patent 530 | license was granted, prior to 28 March 2007. 531 | 532 | Nothing in this License shall be construed as excluding or limiting 533 | any implied license or other defenses to infringement that may 534 | otherwise be available to you under applicable patent law. 535 | 536 | ### 12. No Surrender of Others' Freedom. 537 | 538 | If conditions are imposed on you (whether by court order, agreement or 539 | otherwise) that contradict the conditions of this License, they do not 540 | excuse you from the conditions of this License. If you cannot convey a 541 | covered work so as to satisfy simultaneously your obligations under 542 | this License and any other pertinent obligations, then as a 543 | consequence you may not convey it at all. For example, if you agree to 544 | terms that obligate you to collect a royalty for further conveying 545 | from those to whom you convey the Program, the only way you could 546 | satisfy both those terms and this License would be to refrain entirely 547 | from conveying the Program. 548 | 549 | ### 13. Use with the GNU Affero General Public License. 
550 | 551 | Notwithstanding any other provision of this License, you have 552 | permission to link or combine any covered work with a work licensed 553 | under version 3 of the GNU Affero General Public License into a single 554 | combined work, and to convey the resulting work. The terms of this 555 | License will continue to apply to the part which is the covered work, 556 | but the special requirements of the GNU Affero General Public License, 557 | section 13, concerning interaction through a network will apply to the 558 | combination as such. 559 | 560 | ### 14. Revised Versions of this License. 561 | 562 | The Free Software Foundation may publish revised and/or new versions 563 | of the GNU General Public License from time to time. Such new versions 564 | will be similar in spirit to the present version, but may differ in 565 | detail to address new problems or concerns. 566 | 567 | Each version is given a distinguishing version number. If the Program 568 | specifies that a certain numbered version of the GNU General Public 569 | License "or any later version" applies to it, you have the option of 570 | following the terms and conditions either of that numbered version or 571 | of any later version published by the Free Software Foundation. If the 572 | Program does not specify a version number of the GNU General Public 573 | License, you may choose any version ever published by the Free 574 | Software Foundation. 575 | 576 | If the Program specifies that a proxy can decide which future versions 577 | of the GNU General Public License can be used, that proxy's public 578 | statement of acceptance of a version permanently authorizes you to 579 | choose that version for the Program. 580 | 581 | Later license versions may give you additional or different 582 | permissions. However, no additional obligations are imposed on any 583 | author or copyright holder as a result of your choosing to follow a 584 | later version. 585 | 586 | ### 15. 
Disclaimer of Warranty. 587 | 588 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 589 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 590 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT 591 | WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT 592 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 593 | A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND 594 | PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE 595 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR 596 | CORRECTION. 597 | 598 | ### 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR 602 | CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 603 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 604 | ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT 605 | NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR 606 | LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM 607 | TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER 608 | PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 609 | 610 | ### 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 
618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | ## How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these 626 | terms. 627 | 628 | To do so, attach the following notices to the program. It is safest to 629 | attach them to the start of each source file to most effectively state 630 | the exclusion of warranty; and each file should have at least the 631 | "copyright" line and a pointer to where the full notice is found. 632 | 633 | <one line to give the program's name and a brief idea of what it does.> 634 | Copyright (C) <year> <name of author> 635 | 636 | This program is free software: you can redistribute it and/or modify 637 | it under the terms of the GNU General Public License as published by 638 | the Free Software Foundation, either version 3 of the License, or 639 | (at your option) any later version. 640 | 641 | This program is distributed in the hope that it will be useful, 642 | but WITHOUT ANY WARRANTY; without even the implied warranty of 643 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 644 | GNU General Public License for more details. 645 | 646 | You should have received a copy of the GNU General Public License 647 | along with this program. If not, see <https://www.gnu.org/licenses/>. 648 | 649 | Also add information on how to contact you by electronic and paper 650 | mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands \`show w' and \`show c' should show the 661 | appropriate parts of the General Public License.
Of course, your 662 | program's commands might be different; for a GUI interface, you would 663 | use an "about box". 664 | 665 | You should also get your employer (if you work as a programmer) or 666 | school, if any, to sign a "copyright disclaimer" for the program, if 667 | necessary. For more information on this, and how to apply and follow 668 | the GNU GPL, see <https://www.gnu.org/licenses/>. 669 | 670 | The GNU General Public License does not permit incorporating your 671 | program into proprietary programs. If your program is a subroutine 672 | library, you may consider it more useful to permit linking proprietary 673 | applications with the library. If this is what you want to do, use the 674 | GNU Lesser General Public License instead of this License. But first, 675 | please read <https://www.gnu.org/licenses/why-not-lgpl.html>. 676 | --------------------------------------------------------------------------------