├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── SUMMARY.md ├── data.db ├── docs ├── how_it_works.md ├── installation.md ├── setup.md └── training.md ├── main.py ├── model.py ├── parameter_search.py ├── requirements.txt ├── review_corpus.py ├── settings.py ├── statistics.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | install: 6 | - "if [[ \"$TRAVIS_PYTHON_VERSION\" == \"2.7\" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; else wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;fi" 7 | - "bash miniconda.sh -b -p $HOME/miniconda" 8 
| - "export PATH=\"$HOME/miniconda/bin:$PATH\"" 9 | - "hash -r" 10 | - "conda config --set always_yes yes --set changeps1 no" 11 | - "conda update -q conda" 12 | - "conda info -a" 13 | - "conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION scikit-learn" 14 | - "source activate test-environment" 15 | - "pip install praw sqlalchemy" 16 | script: python test.py 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Aurora0001 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LearnProgrammingBot 2 | 3 | ## [Documentation](https://aurora0001.gitbooks.io/learnprogrammingbot/content/index.html) | [How it Works](https://aurora0001.gitbooks.io/learnprogrammingbot/content/docs/how_it_works.html) | [Download](https://github.com/Aurora0001/LearnProgrammingBot/releases) 4 | 5 | [![Join the chat at https://gitter.im/Aurora0001/LearnProgrammingBot](https://badges.gitter.im/Aurora0001/LearnProgrammingBot.svg)](https://gitter.im/Aurora0001/LearnProgrammingBot?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 6 | [![Travis CI](https://travis-ci.org/Aurora0001/LearnProgrammingBot.svg?branch=master)](https://travis-ci.org/Aurora0001/LearnProgrammingBot) 7 | [![Dependency Status](https://www.versioneye.com/user/projects/571ce8b8fcd19a0039f17f9b/badge.svg?style=flat)](https://www.versioneye.com/user/projects/571ce8b8fcd19a0039f17f9b) 8 | 9 | LearnProgrammingBot is a bot for [reddit](https://reddit.com) that uses 10 | scikit-learn and supervised learning techniques to categorise submissions and 11 | reply with useful and appropriate links. 12 | 13 | It is intended to answer common questions on /r/learnprogramming that can 14 | be found on the wiki, but in theory it should be suitable for any subreddit 15 | provided that it is trained properly. The default training set should be fine 16 | for most programming subreddits, but it can be extended at any time. 17 | 18 | ## Installation 19 | LearnProgrammingBot requires scikit-learn, praw and sqlalchemy. Due to this, the 20 | installation instructions are slightly different depending on which platform you 21 | are using. 
It *should* work with both Python 2 and Python 3 (unit tests are 22 | coming soon) 23 | 24 | Before continuing, download the code (either through the source zip or releases 25 | tab) and extract it if necessary, or clone using git. Then, open a command 26 | prompt or terminal and `cd` to the directory where you have extracted the code. 27 | 28 | ### Windows 29 | As an administrator, in the command prompt, run: 30 | 31 | pip install -r requirements.txt 32 | 33 | ### Mac 34 | 35 | sudo pip install -r requirements.txt 36 | 37 | ### Debian/Ubuntu/Mint 38 | 39 | sudo apt-get install python-scipy 40 | sudo pip install sqlalchemy scikit-learn praw 41 | 42 | ## Setup and Running 43 | You'll need to enter a few variables into `settings.py`. Just follow the 44 | instructions on lines preceded by # (comments) and fill in the correct data. 45 | 46 | To run, use `./main.py run` in the terminal. This will run continuously until 47 | killed using Ctrl+C or an exception. You might find useful logging information 48 | in bot.log if the bot does crash. Feel free to report an issue if you do find a 49 | bug! 50 | 51 | ## Classifications 52 | Currently, the classifier only recognises 3 types of post classes: 53 | 54 | - 'good' - the post is a good question for /r/learnprogramming 55 | - 'faq' - the post contains a common question that is probably on the FAQ 56 | - 'bad' - the post is formatted badly, off topic or does not contain enough 57 | detail. 58 | 59 | ## Accuracy 60 | As of commit `9742b376ef4e845ac45cbd96e86dfe7156dc913e`, the classifier's accuracy is as follows: 61 | 62 | - Correct classification = 81% 63 | - False negative = 13% 64 | - False positive = 5% 65 | - Wrong category = 1% 66 | 67 | False negatives are counted as any time that the actual class was not 'good' and 68 | the classifier returned 'good'. False positives occur when the actual class was 69 | 'good' but the classifier did not return 'good'. 
Wrong category classifications 70 | occur when the classifier returned a different negative classification (i.e. 71 | 'faq' instead of 'bad') 72 | 73 | ## Roadmap and Planned Features 74 | - Unit tests 75 | - More modular approach so that extra modules can be installed to make the bot 76 | more customisable. 77 | 78 | ## Contributing 79 | We're happy to accept contributors - you don't need to be an expert, just file 80 | an issue about getting started and we can start there! 81 | 82 | ## License 83 | 84 | MIT License. Please see the LICENSE file! 85 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [Installation](docs/installation.md) 4 | * [Setup](docs/setup.md) 5 | * [Training](docs/training.md) 6 | * [How it Works](docs/how_it_works.md) 7 | 8 | -------------------------------------------------------------------------------- /data.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aurora0001/LearnProgrammingBot/38ca7c8343195e3009ccf8b9d4e6edd853cd07f4/data.db -------------------------------------------------------------------------------- /docs/how_it_works.md: -------------------------------------------------------------------------------- 1 | # How it Works 2 | The code for LearnProgrammingBot is quite simple, but the theory behind it is slightly more difficult to get to grips with. Here's a 'bird's eye view' of how LearnProgrammingBot works: 3 | 4 | 1. Train support vector machine with known data (a 'corpus') 5 | 2. Fetch latest posts from reddit 6 | 3. 'Vectorize' the post into a numpy array 7 | 4. Classify the array using the trained support vector machine 8 | 5. If the post class is not 'good', check the responses dictionary for the correct response, and reply. 
9 | 10 | Below, I'll try to explain the reasons for each of the steps and how they work. 11 | 12 | 13 | ## The Classifier 14 | Before explaining how LearnProgrammingBot's classifier works, it might be helpful to briefly talk about the document classification problem as a whole, and the different types of learning techniques. 15 | 16 | ### Types of Machine Learning 17 | There are two types of learning that are used for the majority of AI problems: **supervised learning** and **unsupervised learning**. 18 | 19 | Supervised learning is where the algorithm is shown some samples and the correct answers, and it extrapolates so that it can answer similar questions. It's similar to how a child learns through asking questions and using the answers to predict things in the future. 20 | 21 | Unsupervised learning is less useful for classification, because we already know the correct categories. It works better for data mining (finding trends that you don't already know). 22 | 23 | ### Classification Algorithms 24 | There are a few big solutions to classification problems, which all work in slightly different ways but provide similar outcomes. 25 | 26 | [Naive Bayes (NB) classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) are simple and popular classifiers which are often used for spam detection. They work on a simple principle, which Wikipedia illustrates like this: 27 | 28 | ![formula](https://upload.wikimedia.org/math/c/e/d/cedd117f3768b05f1822ae874d3fc303.png) 29 | 30 | Usually, NB classifiers work very quickly, but aren't as accurate as Support Vector Machines (SVMs). If you're interested in reading more about their competitiveness with SVMs, you can read [this paper](http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf). 31 | 32 | [Support Vector Machines](https://en.wikipedia.org/wiki/Support_vector_machine) appear similar to NB classifiers, but they are not probability-based - they can only return either 'Category A' or 'Category B'. 
Essentially, they find a line in a graph that splits the two datasets as accurately as possible, like this: 33 | 34 | ![SVM diagram](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Svm_separating_hyperplanes_%28SVG%29.svg/512px-Svm_separating_hyperplanes_%28SVG%29.svg.png) 35 | 36 | It's clear that both $$H_2$$ and $$H_3$$ are suitable lines, but $$H_1$$ is incorrect. The training period allows the SVM to calculate the best line. 37 | 38 | As you can see, SVMs can only split data points into two groups. To allow the SVM to split data points into multiple groups, a strategy called [one-vs-the-rest](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is used. Essentially, this makes multiple graphs, which might be like this: 39 | 40 | - 'good' vs rest 41 | - 'faq' vs rest 42 | - 'bad' vs rest 43 | 44 | Therefore, if it is in the 'rest' section for every graph but 'bad', the document must be 'bad'. 45 | 46 | ### The Vectorizer 47 | It's easy to understand how the SVM works with points, but one aspect that we haven't covered is how the points are actually calculated from a document of text. Obviously, you can't just pick a random point for a document - that'd produce nonsensical results! 48 | 49 | The solution to this is the *vectorizer*. As the name suggests, it turns text into a mathematical vector. This is done through a model known as the [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model). The example on Wikipedia (see the link) is very clear, and this is how scikit-learn's `CountVectorizer` works. Once the text has been turned into a vector, the numerical values *can* be used to position a point for the SVM. 50 | 51 | However, this method is a bit naive and might miss important words that aren't common. Instead, 'the' might be ranked as the most important word, which could cause the SVM to fall victim to an effect called [overfitting](https://en.wikipedia.org/wiki/Overfitting). 
This is where 'junk values' are misinterpreted as statistically important, leading to significant inaccuracies.

An improved technique uses [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). This is an algorithm to rank words in a body of text by their importance, which can help to catch the key words in a message, even if they're only said once or twice.

### Summary
Here's a beautiful ASCII-art graph for the key stages:

#### Training

    Corpus of Training Data (pre-classified)
                        |
    Process with Vectorizer to calculate all
    key words and store it in the 'bag'
                        |
    Process with SVM to train and fit the correct
    lines to split the groups

#### Classification


    Text To Process (fetched from reddit)
                        |
    Process with Vectorizer into Bag-of-Words,
    searching for words found in training
    phase
                        |
    Classify with SVM using pre-fitted line
                        |
    Return correct document classification
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
# Installation
LearnProgrammingBot requires scikit-learn, praw and sqlalchemy. Due to this, the installation instructions are slightly different depending on which platform you are using. It should work with both Python 2 and Python 3 (unit tests are coming soon).

To install, first download the source from either [the master ZIP](https://github.com/Aurora0001/LearnProgrammingBot/archive/master.zip) or the [releases](https://github.com/Aurora0001/LearnProgrammingBot/releases) tab, and extract the zip file into any directory.
Alternatively, you can clone the source using git by running: 5 | 6 | git clone https://github.com/Aurora0001/LearnProgrammingBot.git 7 | 8 | Then, follow the instructions for your platform to install the dependencies: 9 | 10 | ## Windows 11 | As an administrator, in the command prompt, run: 12 | 13 | pip install -r requirements.txt 14 | 15 | If pip is not recognised, you may need to install it using [these instructions](http://stackoverflow.com/questions/4750806/how-do-i-install-pip-on-windows#12476379). If you want to download a pre-compiled version of Python with SciPy, try [Python(x,y)](https://python-xy.github.io/downloads.html). 16 | 17 | ## Mac 18 | 19 | sudo pip install -r requirements.txt 20 | 21 | ## Debian/Ubuntu/Mint 22 | 23 | sudo apt-get install python-scipy python-pip 24 | sudo pip install sqlalchemy scikit-learn praw 25 | 26 | ## Other Linux Distributions 27 | Check if your distribution has a package such as `python-scipy`, which will save time and avoid the need for you to compile NumPy from source (which is slow and quite difficult). If you **can** use a package, just run this afterwards: 28 | 29 | sudo pip install sqlalchemy scikit-learn praw 30 | 31 | Make sure that you've installed the package for `pip` too, if you haven't already. 32 | 33 | If your distribution does not have a SciPy package, just run this (and prepare for a long wait!): 34 | 35 | sudo pip install -r requirements.txt 36 | 37 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | To configure LearnProgrammingBot, you'll need to obtain an OAuth access token from reddit. This will allow LearnProgrammingBot to log in to the account that you want to automate. 3 | 4 | If you already have the token for `CLIENT_ACCESSCODE`, skip this section. This code is **not** the ID or secret, though. 
5 | 6 | ## Getting the OAuth Tokens 7 | To use OAuth (which reddit requires), you need **3 tokens**: the client id, the client secret and the access token. 8 | ### Getting the ID and Secret Tokens 9 | To create these tokens, you'll need to go to [the app preferences](https://www.reddit.com/prefs/apps/) page, while logged in as your bot account. If you don't see something like this, you may need to click 'create another app...': 10 | 11 | Set the **name** box to 'LearnProgrammingBot' (or a custom name, if you prefer - it isn't important). 12 | Select the **script** app type from the radio buttons below the textbox. 13 | Leave **description** and **about url** blank, and enter **http://127.0.0.1/callback** in the **redirect uri** box. 14 | Then, click 'create app', and you should see something like what you see in the image: 15 | 16 | ![image](https://camo.githubusercontent.com/d53f92cd85d1279a239444acee25179e8e6d8bb5/687474703a2f2f692e696d6775722e636f6d2f65326b4f5231612e706e67) 17 | 18 | The token under '**personal use script**' is your *client ID*. The token underlined in red is your *client secret*. 19 | 20 | Open up `settings.py` and change the following lines to your ID and secret: 21 | 22 | CLIENT_ID = 'my_client_id_here' 23 | CLIENT_SECRET = 'my_client_secret_here' 24 | 25 | You can ignore any lines preceded by #. 26 | 27 | ### Getting your Access Token 28 | LearnProgrammingBot can help you generate your access token automatically. This only needs to be done once - after this, it can be done manually. 29 | 30 | In a terminal, run: 31 | 32 | ./main.py create-token 33 | 34 | A web browser should open (if you are logged in as your bot account). Click 'Allow', and wait to be redirected. You will probably get something like this: 35 | 36 | ![](https://praw.readthedocs.org/en/stable/_images/CodeUrl.png) 37 | 38 | Don't worry, this is **correct**. Copy the token after `code=` (circled in the image), and put it in `settings.py` as CLIENT_ACCESSTOKEN. 
**Do not include the `code=` section - this will not work!**

## Running LearnProgrammingBot

You're now ready to run LearnProgrammingBot (finally!). Use `./main.py run` in the terminal. This will run continuously until killed using `Ctrl+C` or an exception. You might find useful logging information in bot.log if the bot does crash. Feel free to report an issue if you do find a bug!
--------------------------------------------------------------------------------
/docs/training.md:
--------------------------------------------------------------------------------
# Training

To train the bot, you need to install LearnProgrammingBot and its dependencies (see the Installation section). You do **not** need to create OAuth tokens as shown in the Setup section if you are only training the bot.

## Training with a Specific Post
You can train the bot with a single post that it has misclassified, using the following command:

    ./main.py train --id ID

Where ID is the reddit submission ID, for example:

    https://www.reddit.com/r/learnprogramming/comments/4g4far/meta_i_wrote_a_bot_for_rlearnprogramming_that/
                                              ^^^^^^^

In this link, the id is **4g4far**, so you could train it with:

    ./main.py train --id 4g4far

LearnProgrammingBot will then fetch the post from reddit, and display it for you
to review. It will then prompt you to enter the correct classification of the
post. Here are the categories (an updated list is found in `review_corpus.py`)

Valid categories:
- good
- off_topic (incl. bad questions)
- faq_get_started (incl. getting started with a project - where do I start?)
- faq_career
- faq_resource (incl. challenges e.g. codewars)
- faq_resource_podcast
- faq_tool (incl. laptop specs)
- faq_language (e.g. 
how much should I know before I am expert, which should I
pick)
- faq_other (including motivation, 'does a programmer google?', project ideas etc.)
- faq_what_now (what to do after codecademy etc.)

For the best results, it's best to be generous with your classification, and, if in doubt, classify as 'good'. Check `data.db` for examples of how previous posts were classified, if you're not sure.

## Training in Batches
You might find it easier to train with larger samples from the 'new' feed of /r/learnprogramming. This is supported with the `train-batch` command, which can be used like so:

    ./main.py train-batch --limit AMOUNT_OF_POSTS_TO_CLASSIFY

This is also interactive, just like the `train` command. To see the valid classifications, please see the above section.

## Committing Changes
To merge your database changes with the main repository, [fork LearnProgrammingBot](https://github.com/Aurora0001/LearnProgrammingBot) on GitHub, then clone your copy. Train the classifier using the steps listed above, then [create a pull request](https://help.github.com/articles/using-pull-requests/). Try to do this relatively quickly (i.e. don't wait for days before merging) because it's difficult to resolve merge conflicts with the database.

### Summary
1. Fork repository
2. `git clone https://github.com/MyUserName/LearnProgrammingBot`
3. Train classifier
4. `git commit -m "Trained classifier with X new records"`
5. `git push origin master`
6. 
Create pull request on GitHub 54 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | 3 | from __future__ import print_function 4 | 5 | from sklearn.pipeline import make_union 6 | from sklearn.base import TransformerMixin 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sqlalchemy.orm import sessionmaker 9 | from sqlalchemy import create_engine 10 | from sklearn import svm 11 | import numpy as np 12 | import logging 13 | import webbrowser 14 | import argparse 15 | import sys 16 | import re 17 | 18 | from settings import LOGFILE_URI, DATABASE_URI, LOG_LEVEL, CLIENT_ID, CLIENT_SECRET, CLIENT_ACCESSCODE, SUBREDDIT, REDIRECT_URI, LOG_FORMAT 19 | import model 20 | import praw 21 | 22 | # This allows for Python 3 compatibility by replacing input() on Python 2 23 | if sys.version_info[:2] <= (2, 7): 24 | input = raw_input 25 | 26 | responses = { 27 | 'off_topic': ''' 28 | Hi! Your post might not attract good responses on /r/learnprogramming. 29 | This may be because you didn't include a code sample, provided very little 30 | detail or linked content that doesn't seem relevant. You can improve your post 31 | by: 32 | 33 | - [Asking Questions The Smart Way](http://catb.org/~esr/faqs/smart-questions.html) 34 | - Avoiding posting links without any explanation, discussion or question (links 35 | might get a better response on /r/programming) 36 | - Using code pastebins (images don't count!) 37 | - Reviewing the post guidelines on the sidebar 38 | 39 | Don't worry about this message if you think it's a mistake - it may just be an 40 | error in my classifier, but please check the resources above anyway to make 41 | sure that your post gets the best responses. 42 | ''', 43 | 'faq_get_started': ''' 44 | Hello! Your post seems to be about getting started with programming or a 45 | project. 
You can find some great resources about this in the 46 | [/r/learnprogramming FAQ](https://www.reddit.com/r/learnprogramming/wiki/faq). 47 | 48 | Specifically, you might find these useful: 49 | 50 | - [Getting Started with Programming](https://www.reddit.com/r/learnprogramming/wiki/gettingstarted) 51 | - [FAQ - How do I get started?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_get_started_with_programming.3F) 52 | - [FAQ - How do I get started with a large project?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_get_started_with_a_large_project_and_keep_up_with_it.3F) 53 | ''', 54 | 'faq_career': ''' 55 | Hello! Your post seems to be about careers in programming. You'll 56 | be able to get the best advice in the subreddit /r/cscareerquestions, who 57 | specifically deal with questions like this. 58 | 59 | The wiki also has some useful advice about this: 60 | 61 | - [FAQ - Careers](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_careers_and_jobs) 62 | ''', 63 | 'faq_resource': ''' 64 | Hello! You seem to be looking for a resource or tutorial. The /r/learnprogramming 65 | wiki has a comprehensive list of resources that might be useful to you, but if 66 | what you're looking for isn't on there, please help by adding it! 67 | 68 | - [Online Resources](http://www.reddit.com/r/learnprogramming/wiki/online) 69 | - [Books](http://www.reddit.com/r/learnprogramming/wiki/books) 70 | - [Programming Challenges](http://www.reddit.com/r/learnprogramming/wiki/faq#wiki_where_can_i_find_practice_exercises_and_project_ideas.3F) 71 | 72 | You might also like the [Awesome Lists](https://awesomelists.top/), which are 73 | curated lists for the best libraries, tools and resources for most programming 74 | languages, topics and tools. 75 | ''', 76 | 'faq_tool': ''' 77 | Hello! Your post seems to be about a programming tool, IDE or hardware (e.g. a laptop). 
78 | 79 | Take a look at the following links: 80 | 81 | - /r/suggestalaptop 82 | - [Wiki - Programming Tools](https://www.reddit.com/r/learnprogramming/wiki/tools) 83 | ''', 84 | 'faq_language': ''' 85 | Hello! You seem to be asking about which programming language to use for a 86 | project or which language to learn. This is quite a frequent question so you 87 | might find that you get the best answer from the 88 | [FAQ](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_which_programming_language_should_i_start_with.3F). 89 | 90 | Also, why not try the [choosing a language tool](http://choosing-a-language.techboss.co/) 91 | by Techboss which should guide you in picking a suitable language. 92 | 93 | The general advice here is that you should focus on one programming language 94 | that you know well, so you can improve your *algorithmic thinking* skills. 95 | 'Language hopping' tends to be a bad idea because you are always learning 96 | syntax, which is less important. 97 | ''', 98 | 'faq_other': ''' 99 | Hello! Your post seems similar to an FAQ question, but I can't specifically 100 | figure out which section would be helpful to you. 101 | 102 | Take a look through [the wiki](https://www.reddit.com/r/learnprogramming/wiki/index) 103 | if you haven't already, and check to see if it helps you. If not, please 104 | report an issue so I can give more specific help in future! 105 | ''', 106 | 'faq_resource_podcast': ''' 107 | Looking for a podcast? You might find these threads useful: 108 | 109 | - [Podcasts for Beginners](https://www.reddit.com/r/learnprogramming/comments/47dusa/podcasts_any_recommendations_for_a_beginner/) 110 | - [Advanced Programming Podcasts](https://www.reddit.com/r/learnprogramming/comments/3pw6gl/advanced_programming_concepts_or_fun_fact_type/) 111 | ''', 112 | 'faq_what_now': ''' 113 | Hi! 
If you've just completed your first course and aren't sure where to go next, take a look at some of these guides and see if they help: 114 | 115 | - [FAQ - Now what do I do?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_now_what_do_i_do.3F) 116 | - [How do I move from beginner to intermediate level?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_move_from_an_beginning_to_an_intermediate_level.3F) 117 | ''' 118 | } 119 | 120 | post_signature = ''' 121 | 122 | --- 123 | I am a bot for /r/learnprogramming using supervised learning to provide helpful 124 | responses to common posts. I'm open source and accept pull requests and 125 | contributions! 126 | 127 | [[Learn More]](https://github.com/Aurora0001/LearnProgrammingBot) 128 | [[Report an Issue (or reply below with feedback)]](https://github.com/Aurora0001/LearnProgrammingBot/issues) 129 | ''' 130 | 131 | 132 | class PostTransformer(TransformerMixin): 133 | """ 134 | Transforms posts on four characteristics: 135 | - Amount of links 136 | - Length of post 137 | - Contains block code 138 | - Contains inline code 139 | """ 140 | def __init__(self, word_k=10000, link_k=5): 141 | # TODO: grid search for best constants 142 | self.word_k = word_k 143 | self.link_k = link_k 144 | 145 | def fit(self, *args): 146 | return self 147 | 148 | def transform(self, X, *args, **kwargs): 149 | ret = [] 150 | for item in X: 151 | ret.append(float(len(item)) / self.word_k) 152 | ret.append(float(item.count('http')) / self.link_k) 153 | ret.append(float(item.count(' ')) / len(item)) 154 | 155 | y = np.array(ret).reshape(-1, 3) 156 | return y 157 | 158 | fit_transform = transform 159 | 160 | class Classifier(object): 161 | """ 162 | Wrapper for the vectorizer and classifier that handles training of both. 163 | """ 164 | def __init__(self, training_values=None, training_targets=None): 165 | self.vectorizer = make_union(TfidfVectorizer(), PostTransformer()) 166 | # Set using parameter_search. 
TODO: review after updating 167 | # corpus. 168 | self.classifier = svm.LinearSVC(C=1, loss='squared_hinge', multi_class='ovr', class_weight='balanced', tol=1e-6) 169 | if training_values is not None and training_targets is not None: 170 | self.fit(training_values, training_targets) 171 | 172 | def fit(self, training_values, training_targets): 173 | training_values = self.vectorizer.fit_transform(training_values).toarray() 174 | self.classifier.fit(training_values, training_targets) 175 | 176 | def classify(self, text): 177 | transformed_text = self.vectorizer.transform([text]).toarray() 178 | return self.classifier.predict(transformed_text) 179 | 180 | def get_probability(self, text): 181 | transformed_text = self.vectorizer.transform([text]).toarray() 182 | return self.classifier.decision_function(transformed_text) 183 | 184 | def connect_to_database(uri): 185 | engine = create_engine(uri) 186 | return sessionmaker(bind=engine) 187 | 188 | def get_reddit_client(): 189 | reddit = praw.Reddit(user_agent='all platforms:Learn Programming Bot:v0.2.0-pre (by /u/Aurora0001, contact at github.com/Aurora0001/LearnProgrammingBot/issues)') 190 | reddit.set_oauth_app_info(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, redirect_uri=REDIRECT_URI) 191 | return reddit 192 | 193 | def run_bot(args): 194 | logging.basicConfig(filename=LOGFILE_URI, level=LOG_LEVEL, format=LOG_FORMAT) 195 | 196 | logging.info('Connecting to database {}'.format(DATABASE_URI)) 197 | Session = connect_to_database(DATABASE_URI) 198 | logging.info('Database connection OK') 199 | 200 | session = Session() 201 | data = session.query(model.Corpus).all() 202 | 203 | data_values = [col.title + ' ' + col.text for col in data] 204 | data_targets = [col.category for col in data] 205 | 206 | logging.info('Training classifier with {} values'.format(len(data_values))) 207 | classifier = Classifier(data_values, data_targets) 208 | logging.info('Classifier trained') 209 | 210 | logging.info('Connecting to 
def run_bot(args):
    """
    Main bot loop: train the classifier from the stored corpus, log in to
    reddit via OAuth and reply to new submissions whose predicted category
    has a canned response.

    With --supervised, each reply must be confirmed interactively first.
    NOTE(review): the bare input() below evaluates its input on Python 2;
    the raw_input shim used in review_corpus.py is missing here -- confirm
    which interpreters this entry point must support.
    """
    logging.basicConfig(filename=LOGFILE_URI, level=LOG_LEVEL, format=LOG_FORMAT)

    logging.info('Connecting to database {}'.format(DATABASE_URI))
    Session = connect_to_database(DATABASE_URI)
    logging.info('Database connection OK')

    session = Session()
    data = session.query(model.Corpus).all()

    # Train on title + body, exactly as live posts are classified below.
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]

    logging.info('Training classifier with {} values'.format(len(data_values)))
    classifier = Classifier(data_values, data_targets)
    logging.info('Classifier trained')

    logging.info('Connecting to reddit...')
    reddit = get_reddit_client()

    logging.info('Authorizing...')
    access_information = reddit.get_access_information(CLIENT_ACCESSCODE)
    reddit.set_access_credentials(**access_information)
    logging.info('Logged in successfully.')

    for message in praw.helpers.submission_stream(reddit, SUBREDDIT, limit=5, verbosity=0):
        message_text = message.title + ' ' + message.selftext
        pred = classifier.classify(message_text)[0]
        # Only categories with a canned response are acted upon.
        if pred in responses:
            if args.supervised and input('Classify {} as {}? (y/n) '.format(message.id, pred)).lower() != 'y':
                continue

            try:
                message.add_comment(responses[pred] + post_signature)
            except praw.errors.RateLimitExceeded:
                # TODO:
                # Ideally, errors should actually be handled properly. Perhaps a dequeue could be used
                # to store all the posts which failed, which could be retried every minute (or so)
                logging.error('Rate limit exceeded, cannot post to thread {}'.format(message.title))


def train_id(args):
    """Label a single submission identified by --id."""
    train_bot(args, True)


def train_batch(args):
    """Label up to --limit of the newest submissions on the subreddit."""
    train_bot(args, False)


def train_bot(args, by_id):
    """
    Interactively label submissions and store them in the training corpus.

    Entering an empty category skips the post without saving anything.
    """
    reddit = get_reddit_client()
    if by_id:
        messages = [reddit.get_submission(submission_id=args.id)]
    else:
        messages = reddit.get_subreddit(SUBREDDIT).get_new(limit=args.limit)
    # Fix: open the database connection once, rather than creating a brand
    # new engine and session for every single message inside the loop.
    Session = connect_to_database(DATABASE_URI)
    session = Session()
    for message in messages:
        print(message.title)
        print('----------')
        print(message.selftext)
        print('')
        message_type = input('Enter category: ')
        if message_type == '':
            continue
        session.add(model.Corpus(title=message.title, text=message.selftext, category=message_type))
        session.commit()
def create_token(args):
    """
    Open the OAuth authorisation URL in a browser so the user can grant
    'identity,submit,read' access and copy the resulting access code into
    settings.py as CLIENT_ACCESSCODE.
    """
    reddit = get_reddit_client()
    url = reddit.get_authorize_url('uniqueKey', 'identity,submit,read', True)
    webbrowser.open(url)
    print(' !!! ')
    print('Please copy the access code that you are redirected to ')
    print('like this: http://praw.readthedocs.org/en/latest/_images/CodeUrl.png')
    print('You need to put it in settings.py as CLIENT_ACCESSCODE')
    print(' !!! ')


def classify_item(args):
    """
    Train a classifier from the stored corpus, classify the submission
    given by --id and print the decision-function score for every class.
    """
    reddit = get_reddit_client()
    post = reddit.get_submission(submission_id=args.id)

    Session = connect_to_database(DATABASE_URI)
    session = Session()

    data = session.query(model.Corpus).all()
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]

    classifier = Classifier(data_values, data_targets)
    post_text = post.title + ' ' + post.selftext
    classification = classifier.classify(post_text)[0]
    # NOTE(review): these are LinearSVC decision-function scores, not true
    # probabilities, despite the p() formatting below.
    probability = classifier.get_probability(post_text)[0]
    print('p({}) = {}'.format(classification, max(probability)))
    print('-----------')
    for (i, class_) in enumerate(classifier.classifier.classes_):
        print('p({}) = {}'.format(class_, probability[i]))


def initialise_database(args):
    """Create the corpus table(s) in the configured database if missing."""
    engine = create_engine(DATABASE_URI)
    model.Corpus.metadata.create_all(engine)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    parser_run = subparsers.add_parser('run', help='runs the bot')
    parser_run.add_argument('--supervised', action='store_true')
    parser_run.set_defaults(func=run_bot)
    parser_train = subparsers.add_parser('train', help='adds training data to the bot (using a specific id)')
    parser_train.add_argument('--id', type=str, required=True, help='the submission id of the post to review')
    parser_train.set_defaults(func=train_id)
    parser_batch = subparsers.add_parser('train-batch', help='adds training data to the bot in batches')
    parser_batch.add_argument('--limit', type=int, required=True, help='the maximum number of posts to fetch')
    parser_batch.set_defaults(func=train_batch)
    parser_token = subparsers.add_parser('create-token', help='gets an access token with your client id/secret')
    parser_token.set_defaults(func=create_token)
    parser_init = subparsers.add_parser('init', help='initialises the database, ready to insert training data')
    parser_init.set_defaults(func=initialise_database)
    parser_classify = subparsers.add_parser('classify', help='classifies a specific post using the trained data')
    parser_classify.add_argument('--id', type=str, required=True, help='the submission id of the post to classify')
    parser_classify.set_defaults(func=classify_item)
    args = parser.parse_args(sys.argv[1:])
    # Fix: on Python 3, add_subparsers() no longer makes a subcommand
    # mandatory, so a bare invocation crashed with AttributeError on
    # args.func. Print usage instead.
    if not hasattr(args, 'func'):
        parser.print_help()
        sys.exit(1)
    args.func(args)
class Corpus(Base):
    """A single labelled training example: a reddit post and its category."""
    __tablename__ = 'corpora'
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Self-text (body) of the submission.
    text = Column(String, nullable=False)
    # Title of the submission.
    title = Column(String, nullable=False)
    # Label assigned by a human reviewer; nullable so posts can be stored
    # first and categorised later (see review_corpus.py).
    category = Column(String, nullable=True)
# Hyperparameter grid for the LinearSVC wrapped by main.Classifier.
param_grid = [
    {
        'C': [1, 5, 10, 50],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [1e-6, 1e-4, 1e-2, 1e-1],
        'multi_class': ['ovr', 'crammer_singer'],
        'class_weight': ['balanced']
    }
]

if __name__ == '__main__':
    # Load the labelled corpus from the configured database.
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]
    classifier = main.Classifier()
    # Fix: vectorize the corpus once and reuse the matrix; the original
    # discarded the fit_transform() result and transformed a second time.
    features = classifier.vectorizer.fit_transform(data_values)
    grid_search = GridSearchCV(classifier.classifier, param_grid, n_jobs=-1)
    grid_search.fit(features, data_targets)
    print('Best score: {}'.format(grid_search.best_score_))
    parameters = grid_search.best_estimator_.get_params()
    # Iterate items() directly instead of keys() followed by a lookup.
    for parameter, value in parameters.items():
        print("{} - {}".format(parameter, value))
"""
Valid categories:
- good
- off_topic (incl. bad questions)
- faq_get_started (incl. getting started with a project - where do I start?)
- faq_career
- faq_resource (incl. challenges e.g. codewars)
- faq_resource_podcast
- faq_tool (incl. laptop specs)
- faq_language (e.g. how much should I know before I am expert, which should I
pick)
- faq_other (including motivation, 'does a programmer google?', project ideas etc.)
- faq_what_now (what to do after codecademy etc.)

"""

# This allows for Python 3 compatibility by replacing input() on Python 2
if sys.version_info[:2] <= (2, 7):
    input = raw_input

if __name__ == '__main__':
    # Walk every stored post and let a human (re-)assign its category.
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    for message in data:
        print(message.title)
        print('----')
        print(message.text)
        print('')
        category = input('Enter category for post: ')
        # Fix: pressing enter now keeps the existing category instead of
        # overwriting it with an empty string (consistent with the
        # skip-on-empty behaviour of train_bot in main.py).
        if category == '':
            continue
        message.category = category
    # Single commit persists all relabelled rows at once.
    session.commit()
if __name__ == '__main__':
    # Plot a cross-validated learning curve for the bot's classifier.
    # Adapted from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html (BSD license)
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    # Each training example is the post title concatenated with its body.
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]
    classifier = main.Classifier()
    # Fit the vectorizer on the full corpus so transform() below works.
    classifier.vectorizer.fit_transform(data_values)
    train_sizes, train_scores, test_scores = learning_curve(classifier.classifier, classifier.vectorizer.transform(data_values), data_targets, cv=6)
    # Mean/std across the 6 folds, used for the shaded bands below.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.title('LearnProgrammingBot Training Scores')
    plt.xlabel("Training Samples")
    plt.ylabel("Accuracy")
    plt.grid()

    # Shaded bands: one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.ylim(0, 1)
    # Fix: the label= kwargs above were never displayed because legend()
    # was missing (the sklearn example this adapts calls it).
    plt.legend(loc="best")
    plt.show()
class TestClassifier(unittest.TestCase):
    """
    End-to-end accuracy check: trains on 80% of the stored corpus and
    evaluates on the held-out 20%, failing if the false-positive rate
    exceeds 5% or overall accuracy drops below 60%.
    """
    def test_classifications(self):
        false_positives = 0
        false_negatives = 0
        correct = 0
        wrong = 0
        engine = create_engine('sqlite:///data.db')
        Session = sessionmaker(bind=engine)
        session = Session()
        training_data = session.query(model.Corpus).all()
        training_values = [rec.title + ' ' + rec.text for rec in training_data]
        training_targets = [rec.category for rec in training_data]
        # Fixed random_state keeps the train/test split reproducible.
        training_values, testing_values, training_targets, testing_targets = cross_validation.train_test_split(training_values, training_targets, test_size=0.2, random_state=0)
        classifier = main.Classifier(training_values, training_targets)
        for (i, message_text) in enumerate(testing_values):
            classification = classifier.classify(message_text)[0]
            if testing_targets[i] == 'good' and classification != 'good':
                # A 'good' post wrongly flagged would receive a spurious
                # bot reply, so these are tracked (and bounded) separately.
                false_positives += 1
                print(message_text)
                print('[Suspected {}; actually good]'.format(classification))
                print('---')
            elif testing_targets[i] != 'good' and classification == 'good':
                false_negatives += 1
            elif testing_targets[i] == classification:
                correct += 1
            else:
                wrong += 1
                print(message_text)
                print('[Suspected {}; actually {}]'.format(classification, testing_targets[i]))
                print('---')
        total = float(len(testing_values))
        print('{} false positives ({})'.format(false_positives, false_positives / total))
        print('{} false negatives ({})'.format(false_negatives, false_negatives / total))
        print('{} correct ({})'.format(correct, correct / total))
        print('{} wrong ({})'.format(wrong, wrong / total))
        # Fix: use unittest assertions instead of raising bare Exception,
        # so threshold breaches are reported as test failures (with the
        # measured rate) rather than errors.
        self.assertLessEqual(false_positives / total, 0.05,
                             'False positive rate too high!')
        self.assertGreaterEqual(correct / total, 0.6,
                                'Correct identification rate too low!')

if __name__ == '__main__':
    unittest.main()