├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── SUMMARY.md ├── data.db ├── docs ├── how_it_works.md ├── installation.md ├── setup.md └── training.md ├── main.py ├── model.py ├── parameter_search.py ├── requirements.txt ├── review_corpus.py ├── settings.py ├── statistics.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | install: 6 | - "if [[ \"$TRAVIS_PYTHON_VERSION\" == \"2.7\" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; else wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;fi" 7 | - "bash miniconda.sh -b -p $HOME/miniconda" 8 
| - "export PATH=\"$HOME/miniconda/bin:$PATH\"" 9 | - "hash -r" 10 | - "conda config --set always_yes yes --set changeps1 no" 11 | - "conda update -q conda" 12 | - "conda info -a" 13 | - "conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION scikit-learn" 14 | - "source activate test-environment" 15 | - "pip install praw sqlalchemy" 16 | script: python test.py 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Aurora0001 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LearnProgrammingBot 2 | 3 | ## [Documentation](https://aurora0001.gitbooks.io/learnprogrammingbot/content/index.html) | [How it Works](https://aurora0001.gitbooks.io/learnprogrammingbot/content/docs/how_it_works.html) | [Download](https://github.com/Aurora0001/LearnProgrammingBot/releases) 4 | 5 | [![Join the chat at https://gitter.im/Aurora0001/LearnProgrammingBot](https://badges.gitter.im/Aurora0001/LearnProgrammingBot.svg)](https://gitter.im/Aurora0001/LearnProgrammingBot?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 6 | [![Travis CI](https://travis-ci.org/Aurora0001/LearnProgrammingBot.svg?branch=master)](https://travis-ci.org/Aurora0001/LearnProgrammingBot) 7 | [![Dependency Status](https://www.versioneye.com/user/projects/571ce8b8fcd19a0039f17f9b/badge.svg?style=flat)](https://www.versioneye.com/user/projects/571ce8b8fcd19a0039f17f9b) 8 | 9 | LearnProgrammingBot is a bot for [reddit](https://reddit.com) that uses 10 | scikit-learn and supervised learning techniques to categorise submissions and 11 | reply with useful and appropriate links. 12 | 13 | It is intended to answer common questions on /r/learnprogramming that can 14 | be found on the wiki, but in theory it should be suitable for any subreddit 15 | provided that it is trained properly. The default training set should be fine 16 | for most programming subreddits, but it can be extended at any time. 17 | 18 | ## Installation 19 | LearnProgrammingBot requires scikit-learn, praw and sqlalchemy. Due to this, the 20 | installation instructions are slightly different depending on which platform you 21 | are using. 
It *should* work with both Python 2 and Python 3 (unit tests are 22 | coming soon) 23 | 24 | Before continuing, download the code (either through the source zip or releases 25 | tab) and extract it if necessary, or clone using git. Then, open a command 26 | prompt or terminal and `cd` to the directory where you have extracted the code. 27 | 28 | ### Windows 29 | As an administrator, in the command prompt, run: 30 | 31 | pip install -r requirements.txt 32 | 33 | ### Mac 34 | 35 | sudo pip install -r requirements.txt 36 | 37 | ### Debian/Ubuntu/Mint 38 | 39 | sudo apt-get install python-scipy 40 | sudo pip install sqlalchemy scikit-learn praw 41 | 42 | ## Setup and Running 43 | You'll need to enter a few variables into `settings.py`. Just follow the 44 | instructions on lines preceded by # (comments) and fill in the correct data. 45 | 46 | To run, use `./main.py run` in the terminal. This will run continuously until 47 | killed using Ctrl+C or an exception. You might find useful logging information 48 | in bot.log if the bot does crash. Feel free to report an issue if you do find a 49 | bug! 50 | 51 | ## Classifications 52 | Currently, the classifier only recognises 3 types of post classes: 53 | 54 | - 'good' - the post is a good question for /r/learnprogramming 55 | - 'faq' - the post contains a common question that is probably on the FAQ 56 | - 'bad' - the post is formatted badly, off topic or does not contain enough 57 | detail. 58 | 59 | ## Accuracy 60 | As of commit `9742b376ef4e845ac45cbd96e86dfe7156dc913e`, the classifier's accuracy is as follows: 61 | 62 | - Correct classification = 81% 63 | - False negative = 13% 64 | - False positive = 5% 65 | - Wrong category = 1% 66 | 67 | False negatives are counted as any time that the actual class was not 'good' and 68 | the classifier returned 'good'. False positives occur when the actual class was 69 | 'good' but the classifier did not return 'good'. 
Wrong category classifications 70 | occur when the classifier returned a different negative classification (i.e. 71 | 'faq' instead of 'bad') 72 | 73 | ## Roadmap and Planned Features 74 | - Unit tests 75 | - More modular approach so that extra modules can be installed to make the bot 76 | more customisable. 77 | 78 | ## Contributing 79 | We're happy to accept contributors - you don't need to be an expert, just file 80 | an issue about getting started and we can start there! 81 | 82 | ## License 83 | 84 | MIT License. Please see the LICENSE file! 85 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [Installation](docs/installation.md) 4 | * [Setup](docs/setup.md) 5 | * [Training](docs/training.md) 6 | * [How it Works](docs/how_it_works.md) 7 | 8 | -------------------------------------------------------------------------------- /data.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aurora0001/LearnProgrammingBot/38ca7c8343195e3009ccf8b9d4e6edd853cd07f4/data.db -------------------------------------------------------------------------------- /docs/how_it_works.md: -------------------------------------------------------------------------------- 1 | # How it Works 2 | The code for LearnProgrammingBot is quite simple, but the theory behind it is slightly more difficult to get to grips with. Here's a 'bird's eye view' of how LearnProgrammingBot works: 3 | 4 | 1. Train support vector machine with known data (a 'corpus') 5 | 2. Fetch latest posts from reddit 6 | 3. 'Vectorize' the post into a numpy array 7 | 4. Classify the array using the trained support vector machine 8 | 5. If the post class is not 'good', check the responses dictionary for the correct response, and reply. 
9 | 10 | Below, I'll try to explain the reasons for each of the steps and how they work. 11 | 12 | 13 | ## The Classifier 14 | Before explaining how LearnProgrammingBot's classifier works, it might be helpful to briefly talk about the document classification problem as a whole, and the different types of learning techniques. 15 | 16 | ### Types of Machine Learning 17 | There are two types of learning that are used for the majority of AI problems: **supervised learning** and **unsupervised learning**. 18 | 19 | Supervised learning is where the algorithm is shown some samples and the correct answers, and it extrapolates so that it can answer similar questions. It's similar to how a child learns through asking questions and using the answers to predict things in the future. 20 | 21 | Unsupervised learning is less useful for classification, because we already know the correct categories. It works better for data mining (finding trends that you don't already know). 22 | 23 | ### Classification Algorithms 24 | There are a few big solutions to classification problems, which all work in slightly different ways but provide similar outcomes. 25 | 26 | [Naive Bayes (NB) classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) are simple and popular classifiers which are often used for spam detection. They work on a simple principle, which Wikipedia illustrates like this: 27 | 28 | ![formula](https://upload.wikimedia.org/math/c/e/d/cedd117f3768b05f1822ae874d3fc303.png) 29 | 30 | Usually, NB classifiers work very quickly, but aren't as accurate as Support Vector Machines (SVMs). If you're interested in reading more about their competitiveness with SVMs, you can read [this paper](http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf). 31 | 32 | [Support Vector Machines](https://en.wikipedia.org/wiki/Support_vector_machine) appear similar to NB classifiers, but they are not probability-based - they can only return either 'Category A' or 'Category B'. 
Essentially, they find a line in a graph that splits the two datasets as accurately as possible, like this: 33 | 34 | ![SVM diagram](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Svm_separating_hyperplanes_%28SVG%29.svg/512px-Svm_separating_hyperplanes_%28SVG%29.svg.png) 35 | 36 | It's clear that both $$H_2$$ and $$H_3$$ are suitable lines, but $$H_1$$ is incorrect. The training period allows the SVM to calculate the best line. 37 | 38 | As you can see, SVMs can only split data points into two groups. To allow the SVM to split data points into multiple groups, a strategy called [one-vs-the-rest](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is used. Essentially, this makes multiple graphs, which might be like this: 39 | 40 | - 'good' vs rest 41 | - 'faq' vs rest 42 | - 'bad' vs rest 43 | 44 | Therefore, if it is in the 'rest' section for every graph but 'bad', the document must be 'bad'. 45 | 46 | ### The Vectorizer 47 | It's easy to understand how the SVM works with points, but one aspect that we haven't covered is how the points are actually calculated from a document of text. Obviously, you can't just pick a random point for a document - that'd produce nonsensical results! 48 | 49 | The solution to this is the *vectorizer*. As the name suggests, it turns text into a mathematical vector. This is done through a model known as the [bag-of-words](https://en.wikipedia.org/wiki/Bag-of-words_model). The example on Wikipedia (see the link) is very clear, and this is how scikit-learn's `CountVectorizer` works. Once the text has been turned into a vector, the numerical values *can* be used to position a point for the SVM. 50 | 51 | However, this method is a bit naive and might miss important words that aren't common. Instead, 'the' might be ranked as the most important word, which could cause the SVM to fall victim to an effect called [overfitting](https://en.wikipedia.org/wiki/Overfitting). 
This is where 'junk values' are misinterpreted as statistically important, leading to significant inaccuracies.

An improved technique uses [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). This is an algorithm to rank words in a body of text by their importance, which can help to catch the key words in a message, even if they're only said once or twice.

### Summary
Here's a beautiful ASCII-art graph for the key stages:

#### Training

    Corpus of Training Data (pre-classified)
                        |
    Process with Vectorizer to calculate all
    key words and store it in the 'bag'
                        |
    Process with SVM to train and fit the correct
    lines to split the groups

#### Classification


    Text To Process (fetched from reddit)
                        |
    Process with Vectorizer into Bag-of-Words,
    searching for words found in training
    phase
                        |
    Classify with SVM using pre-fitted line
                        |
    Return correct document classification
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
# Installation
LearnProgrammingBot requires scikit-learn, praw and sqlalchemy. Due to this, the installation instructions are slightly different depending on which platform you are using. It should work with both Python 2 and Python 3 (unit tests are coming soon).

To install, first download the source from either [the master ZIP](https://github.com/Aurora0001/LearnProgrammingBot/archive/master.zip) or the [releases](https://github.com/Aurora0001/LearnProgrammingBot/releases) tab, and extract the zip file into any directory.
Alternatively, you can clone the source using git by running: 5 | 6 | git clone https://github.com/Aurora0001/LearnProgrammingBot.git 7 | 8 | Then, follow the instructions for your platform to install the dependencies: 9 | 10 | ## Windows 11 | As an administrator, in the command prompt, run: 12 | 13 | pip install -r requirements.txt 14 | 15 | If pip is not recognised, you may need to install it using [these instructions](http://stackoverflow.com/questions/4750806/how-do-i-install-pip-on-windows#12476379). If you want to download a pre-compiled version of Python with SciPy, try [Python(x,y)](https://python-xy.github.io/downloads.html). 16 | 17 | ## Mac 18 | 19 | sudo pip install -r requirements.txt 20 | 21 | ## Debian/Ubuntu/Mint 22 | 23 | sudo apt-get install python-scipy python-pip 24 | sudo pip install sqlalchemy scikit-learn praw 25 | 26 | ## Other Linux Distributions 27 | Check if your distribution has a package such as `python-scipy`, which will save time and avoid the need for you to compile NumPy from source (which is slow and quite difficult). If you **can** use a package, just run this afterwards: 28 | 29 | sudo pip install sqlalchemy scikit-learn praw 30 | 31 | Make sure that you've installed the package for `pip` too, if you haven't already. 32 | 33 | If your distribution does not have a SciPy package, just run this (and prepare for a long wait!): 34 | 35 | sudo pip install -r requirements.txt 36 | 37 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | To configure LearnProgrammingBot, you'll need to obtain an OAuth access token from reddit. This will allow LearnProgrammingBot to log in to the account that you want to automate. 3 | 4 | If you already have the token for `CLIENT_ACCESSCODE`, skip this section. This code is **not** the ID or secret, though. 
5 | 6 | ## Getting the OAuth Tokens 7 | To use OAuth (which reddit requires), you need **3 tokens**: the client id, the client secret and the access token. 8 | ### Getting the ID and Secret Tokens 9 | To create these tokens, you'll need to go to [the app preferences](https://www.reddit.com/prefs/apps/) page, while logged in as your bot account. If you don't see something like this, you may need to click 'create another app...': 10 | 11 | Set the **name** box to 'LearnProgrammingBot' (or a custom name, if you prefer - it isn't important). 12 | Select the **script** app type from the radio buttons below the textbox. 13 | Leave **description** and **about url** blank, and enter **http://127.0.0.1/callback** in the **redirect uri** box. 14 | Then, click 'create app', and you should see something like what you see in the image: 15 | 16 | ![image](https://camo.githubusercontent.com/d53f92cd85d1279a239444acee25179e8e6d8bb5/687474703a2f2f692e696d6775722e636f6d2f65326b4f5231612e706e67) 17 | 18 | The token under '**personal use script**' is your *client ID*. The token underlined in red is your *client secret*. 19 | 20 | Open up `settings.py` and change the following lines to your ID and secret: 21 | 22 | CLIENT_ID = 'my_client_id_here' 23 | CLIENT_SECRET = 'my_client_secret_here' 24 | 25 | You can ignore any lines preceded by #. 26 | 27 | ### Getting your Access Token 28 | LearnProgrammingBot can help you generate your access token automatically. This only needs to be done once - after this, it can be done manually. 29 | 30 | In a terminal, run: 31 | 32 | ./main.py create-token 33 | 34 | A web browser should open (if you are logged in as your bot account). Click 'Allow', and wait to be redirected. You will probably get something like this: 35 | 36 | ![](https://praw.readthedocs.org/en/stable/_images/CodeUrl.png) 37 | 38 | Don't worry, this is **correct**. Copy the token after `code=` (circled in the image), and put it in `settings.py` as CLIENT_ACCESSTOKEN. 
**Do not include the `code=` section - this will not work!**

## Running LearnProgrammingBot

You're now ready to run LearnProgrammingBot (finally!). Use `./main.py run` in the terminal. This will run continuously until killed using `Ctrl+C` or an exception. You might find useful logging information in bot.log if the bot does crash. Feel free to report an issue if you do find a bug!
--------------------------------------------------------------------------------
/docs/training.md:
--------------------------------------------------------------------------------
# Training

To train the bot, you need to install LearnProgrammingBot and its dependencies (see the Installation section). You do **not** need to create OAuth tokens as shown in the Setup section if you are only training the bot.

## Training with a Specific Post
You can train the bot with a single post that it has misclassified, using the following command:

    ./main.py train --id ID

Where ID is the reddit submission ID, for example:

    https://www.reddit.com/r/learnprogramming/comments/4g4far/meta_i_wrote_a_bot_for_rlearnprogramming_that/
                                              ^^^^^^^

In this link, the id is **4g4far**, so you could train it with:

    ./main.py train --id 4g4far

LearnProgrammingBot will then fetch the post from reddit, and display it for you
to review. It will then prompt you to enter the correct classification of the
post. Here are the categories (an updated list is found in `review_corpus.py`)

Valid categories:
- good
- off_topic (incl. bad questions)
- faq_get_started (incl. getting started with a project - where do I start?)
- faq_career
- faq_resource (incl. challenges e.g. codewars)
- faq_resource_podcast
- faq_tool (incl. laptop specs)
- faq_language (e.g. 
how much should I know before I am expert, which should I
pick)
- faq_other (including motivation, 'does a programmer google?', project ideas etc.)
- faq_what_now (what to do after codecademy etc.)

For the best results, it's best to be generous with your classification, and, if in doubt, classify as 'good'. Check `data.db` for examples of how previous posts were classified, if you're not sure.

## Training in Batches
You might find it easier to train with larger samples from the 'new' feed of /r/learnprogramming. This is supported with the `train-batch` command, which can be used like so:

    ./main.py train-batch --limit AMOUNT_OF_POSTS_TO_CLASSIFY

This is also interactive, just like the `train` command. To see the valid classifications, please see the above section.

## Committing Changes
To merge your database changes with the main repository, [fork LearnProgrammingBot](https://github.com/Aurora0001/LearnProgrammingBot) on GitHub, then clone your copy. Train the classifier using the steps listed above, then [create a pull request](https://help.github.com/articles/using-pull-requests/). Try to do this relatively quickly (i.e. don't wait for days before merging) because it's difficult to resolve merge conflicts with the database.

### Summary
1. Fork repository
2. `git clone https://github.com/MyUserName/LearnProgrammingBot`
3. Train classifier
4. `git commit -m "Trained classifier with X new records"`
5. `git push origin master`
6. 
Create pull request on GitHub 54 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | 3 | from __future__ import print_function 4 | 5 | from sklearn.pipeline import make_union 6 | from sklearn.base import TransformerMixin 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sqlalchemy.orm import sessionmaker 9 | from sqlalchemy import create_engine 10 | from sklearn import svm 11 | import numpy as np 12 | import logging 13 | import webbrowser 14 | import argparse 15 | import sys 16 | import re 17 | 18 | from settings import LOGFILE_URI, DATABASE_URI, LOG_LEVEL, CLIENT_ID, CLIENT_SECRET, CLIENT_ACCESSCODE, SUBREDDIT, REDIRECT_URI, LOG_FORMAT 19 | import model 20 | import praw 21 | 22 | # This allows for Python 3 compatibility by replacing input() on Python 2 23 | if sys.version_info[:2] <= (2, 7): 24 | input = raw_input 25 | 26 | responses = { 27 | 'off_topic': ''' 28 | Hi! Your post might not attract good responses on /r/learnprogramming. 29 | This may be because you didn't include a code sample, provided very little 30 | detail or linked content that doesn't seem relevant. You can improve your post 31 | by: 32 | 33 | - [Asking Questions The Smart Way](http://catb.org/~esr/faqs/smart-questions.html) 34 | - Avoiding posting links without any explanation, discussion or question (links 35 | might get a better response on /r/programming) 36 | - Using code pastebins (images don't count!) 37 | - Reviewing the post guidelines on the sidebar 38 | 39 | Don't worry about this message if you think it's a mistake - it may just be an 40 | error in my classifier, but please check the resources above anyway to make 41 | sure that your post gets the best responses. 42 | ''', 43 | 'faq_get_started': ''' 44 | Hello! Your post seems to be about getting started with programming or a 45 | project. 
You can find some great resources about this in the 46 | [/r/learnprogramming FAQ](https://www.reddit.com/r/learnprogramming/wiki/faq). 47 | 48 | Specifically, you might find these useful: 49 | 50 | - [Getting Started with Programming](https://www.reddit.com/r/learnprogramming/wiki/gettingstarted) 51 | - [FAQ - How do I get started?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_get_started_with_programming.3F) 52 | - [FAQ - How do I get started with a large project?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_get_started_with_a_large_project_and_keep_up_with_it.3F) 53 | ''', 54 | 'faq_career': ''' 55 | Hello! Your post seems to be about careers in programming. You'll 56 | be able to get the best advice in the subreddit /r/cscareerquestions, who 57 | specifically deal with questions like this. 58 | 59 | The wiki also has some useful advice about this: 60 | 61 | - [FAQ - Careers](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_careers_and_jobs) 62 | ''', 63 | 'faq_resource': ''' 64 | Hello! You seem to be looking for a resource or tutorial. The /r/learnprogramming 65 | wiki has a comprehensive list of resources that might be useful to you, but if 66 | what you're looking for isn't on there, please help by adding it! 67 | 68 | - [Online Resources](http://www.reddit.com/r/learnprogramming/wiki/online) 69 | - [Books](http://www.reddit.com/r/learnprogramming/wiki/books) 70 | - [Programming Challenges](http://www.reddit.com/r/learnprogramming/wiki/faq#wiki_where_can_i_find_practice_exercises_and_project_ideas.3F) 71 | 72 | You might also like the [Awesome Lists](https://awesomelists.top/), which are 73 | curated lists for the best libraries, tools and resources for most programming 74 | languages, topics and tools. 75 | ''', 76 | 'faq_tool': ''' 77 | Hello! Your post seems to be about a programming tool, IDE or hardware (e.g. a laptop). 
78 | 79 | Take a look at the following links: 80 | 81 | - /r/suggestalaptop 82 | - [Wiki - Programming Tools](https://www.reddit.com/r/learnprogramming/wiki/tools) 83 | ''', 84 | 'faq_language': ''' 85 | Hello! You seem to be asking about which programming language to use for a 86 | project or which language to learn. This is quite a frequent question so you 87 | might find that you get the best answer from the 88 | [FAQ](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_which_programming_language_should_i_start_with.3F). 89 | 90 | Also, why not try the [choosing a language tool](http://choosing-a-language.techboss.co/) 91 | by Techboss which should guide you in picking a suitable language. 92 | 93 | The general advice here is that you should focus on one programming language 94 | that you know well, so you can improve your *algorithmic thinking* skills. 95 | 'Language hopping' tends to be a bad idea because you are always learning 96 | syntax, which is less important. 97 | ''', 98 | 'faq_other': ''' 99 | Hello! Your post seems similar to an FAQ question, but I can't specifically 100 | figure out which section would be helpful to you. 101 | 102 | Take a look through [the wiki](https://www.reddit.com/r/learnprogramming/wiki/index) 103 | if you haven't already, and check to see if it helps you. If not, please 104 | report an issue so I can give more specific help in future! 105 | ''', 106 | 'faq_resource_podcast': ''' 107 | Looking for a podcast? You might find these threads useful: 108 | 109 | - [Podcasts for Beginners](https://www.reddit.com/r/learnprogramming/comments/47dusa/podcasts_any_recommendations_for_a_beginner/) 110 | - [Advanced Programming Podcasts](https://www.reddit.com/r/learnprogramming/comments/3pw6gl/advanced_programming_concepts_or_fun_fact_type/) 111 | ''', 112 | 'faq_what_now': ''' 113 | Hi! 
If you've just completed your first course and aren't sure where to go next, take a look at some of these guides and see if they help: 114 | 115 | - [FAQ - Now what do I do?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_now_what_do_i_do.3F) 116 | - [How do I move from beginner to intermediate level?](https://www.reddit.com/r/learnprogramming/wiki/faq#wiki_how_do_i_move_from_an_beginning_to_an_intermediate_level.3F) 117 | ''' 118 | } 119 | 120 | post_signature = ''' 121 | 122 | --- 123 | I am a bot for /r/learnprogramming using supervised learning to provide helpful 124 | responses to common posts. I'm open source and accept pull requests and 125 | contributions! 126 | 127 | [[Learn More]](https://github.com/Aurora0001/LearnProgrammingBot) 128 | [[Report an Issue (or reply below with feedback)]](https://github.com/Aurora0001/LearnProgrammingBot/issues) 129 | ''' 130 | 131 | 132 | class PostTransformer(TransformerMixin): 133 | """ 134 | Transforms posts on four characteristics: 135 | - Amount of links 136 | - Length of post 137 | - Contains block code 138 | - Contains inline code 139 | """ 140 | def __init__(self, word_k=10000, link_k=5): 141 | # TODO: grid search for best constants 142 | self.word_k = word_k 143 | self.link_k = link_k 144 | 145 | def fit(self, *args): 146 | return self 147 | 148 | def transform(self, X, *args, **kwargs): 149 | ret = [] 150 | for item in X: 151 | ret.append(float(len(item)) / self.word_k) 152 | ret.append(float(item.count('http')) / self.link_k) 153 | ret.append(float(item.count(' ')) / len(item)) 154 | 155 | y = np.array(ret).reshape(-1, 3) 156 | return y 157 | 158 | fit_transform = transform 159 | 160 | class Classifier(object): 161 | """ 162 | Wrapper for the vectorizer and classifier that handles training of both. 163 | """ 164 | def __init__(self, training_values=None, training_targets=None): 165 | self.vectorizer = make_union(TfidfVectorizer(), PostTransformer()) 166 | # Set using parameter_search. 
TODO: review after updating 167 | # corpus. 168 | self.classifier = svm.LinearSVC(C=1, loss='squared_hinge', multi_class='ovr', class_weight='balanced', tol=1e-6) 169 | if training_values is not None and training_targets is not None: 170 | self.fit(training_values, training_targets) 171 | 172 | def fit(self, training_values, training_targets): 173 | training_values = self.vectorizer.fit_transform(training_values).toarray() 174 | self.classifier.fit(training_values, training_targets) 175 | 176 | def classify(self, text): 177 | transformed_text = self.vectorizer.transform([text]).toarray() 178 | return self.classifier.predict(transformed_text) 179 | 180 | def get_probability(self, text): 181 | transformed_text = self.vectorizer.transform([text]).toarray() 182 | return self.classifier.decision_function(transformed_text) 183 | 184 | def connect_to_database(uri): 185 | engine = create_engine(uri) 186 | return sessionmaker(bind=engine) 187 | 188 | def get_reddit_client(): 189 | reddit = praw.Reddit(user_agent='all platforms:Learn Programming Bot:v0.2.0-pre (by /u/Aurora0001, contact at github.com/Aurora0001/LearnProgrammingBot/issues)') 190 | reddit.set_oauth_app_info(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, redirect_uri=REDIRECT_URI) 191 | return reddit 192 | 193 | def run_bot(args): 194 | logging.basicConfig(filename=LOGFILE_URI, level=LOG_LEVEL, format=LOG_FORMAT) 195 | 196 | logging.info('Connecting to database {}'.format(DATABASE_URI)) 197 | Session = connect_to_database(DATABASE_URI) 198 | logging.info('Database connection OK') 199 | 200 | session = Session() 201 | data = session.query(model.Corpus).all() 202 | 203 | data_values = [col.title + ' ' + col.text for col in data] 204 | data_targets = [col.category for col in data] 205 | 206 | logging.info('Training classifier with {} values'.format(len(data_values))) 207 | classifier = Classifier(data_values, data_targets) 208 | logging.info('Classifier trained') 209 | 210 | logging.info('Connecting to 
def run_bot(args):
    """
    Main bot loop: train the classifier from the stored corpus, log in to
    reddit via OAuth and reply to new submissions whose predicted category
    has a canned response.

    With --supervised, each reply must be confirmed interactively first.
    NOTE(review): the bare input() below evaluates its input on Python 2;
    the raw_input shim used in review_corpus.py is missing here -- confirm
    which interpreters this entry point must support.
    """
    logging.basicConfig(filename=LOGFILE_URI, level=LOG_LEVEL, format=LOG_FORMAT)

    logging.info('Connecting to database {}'.format(DATABASE_URI))
    Session = connect_to_database(DATABASE_URI)
    logging.info('Database connection OK')

    session = Session()
    data = session.query(model.Corpus).all()

    # Train on title + body, exactly as live posts are classified below.
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]

    logging.info('Training classifier with {} values'.format(len(data_values)))
    classifier = Classifier(data_values, data_targets)
    logging.info('Classifier trained')

    logging.info('Connecting to reddit...')
    reddit = get_reddit_client()

    logging.info('Authorizing...')
    access_information = reddit.get_access_information(CLIENT_ACCESSCODE)
    reddit.set_access_credentials(**access_information)
    logging.info('Logged in successfully.')

    for message in praw.helpers.submission_stream(reddit, SUBREDDIT, limit=5, verbosity=0):
        message_text = message.title + ' ' + message.selftext
        pred = classifier.classify(message_text)[0]
        # Only categories with a canned response are acted upon.
        if pred in responses:
            if args.supervised and input('Classify {} as {}? (y/n) '.format(message.id, pred)).lower() != 'y':
                continue

            try:
                message.add_comment(responses[pred] + post_signature)
            except praw.errors.RateLimitExceeded:
                # TODO:
                # Ideally, errors should actually be handled properly. Perhaps a dequeue could be used
                # to store all the posts which failed, which could be retried every minute (or so)
                logging.error('Rate limit exceeded, cannot post to thread {}'.format(message.title))


def train_id(args):
    """Label a single submission identified by --id."""
    train_bot(args, True)


def train_batch(args):
    """Label up to --limit of the newest submissions on the subreddit."""
    train_bot(args, False)


def train_bot(args, by_id):
    """
    Interactively label submissions and store them in the training corpus.

    Entering an empty category skips the post without saving anything.
    """
    reddit = get_reddit_client()
    if by_id:
        messages = [reddit.get_submission(submission_id=args.id)]
    else:
        messages = reddit.get_subreddit(SUBREDDIT).get_new(limit=args.limit)
    # Fix: open the database connection once, rather than creating a brand
    # new engine and session for every single message inside the loop.
    Session = connect_to_database(DATABASE_URI)
    session = Session()
    for message in messages:
        print(message.title)
        print('----------')
        print(message.selftext)
        print('')
        message_type = input('Enter category: ')
        if message_type == '':
            continue
        session.add(model.Corpus(title=message.title, text=message.selftext, category=message_type))
        session.commit()
def create_token(args):
    """
    Open the OAuth authorisation URL in a browser so the user can grant
    'identity,submit,read' access and copy the resulting access code into
    settings.py as CLIENT_ACCESSCODE.
    """
    reddit = get_reddit_client()
    url = reddit.get_authorize_url('uniqueKey', 'identity,submit,read', True)
    webbrowser.open(url)
    print(' !!! ')
    print('Please copy the access code that you are redirected to ')
    print('like this: http://praw.readthedocs.org/en/latest/_images/CodeUrl.png')
    print('You need to put it in settings.py as CLIENT_ACCESSCODE')
    print(' !!! ')


def classify_item(args):
    """
    Train a classifier from the stored corpus, classify the submission
    given by --id and print the decision-function score for every class.
    """
    reddit = get_reddit_client()
    post = reddit.get_submission(submission_id=args.id)

    Session = connect_to_database(DATABASE_URI)
    session = Session()

    data = session.query(model.Corpus).all()
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]

    classifier = Classifier(data_values, data_targets)
    post_text = post.title + ' ' + post.selftext
    classification = classifier.classify(post_text)[0]
    # NOTE(review): these are LinearSVC decision-function scores, not true
    # probabilities, despite the p() formatting below.
    probability = classifier.get_probability(post_text)[0]
    print('p({}) = {}'.format(classification, max(probability)))
    print('-----------')
    for (i, class_) in enumerate(classifier.classifier.classes_):
        print('p({}) = {}'.format(class_, probability[i]))


def initialise_database(args):
    """Create the corpus table(s) in the configured database if missing."""
    engine = create_engine(DATABASE_URI)
    model.Corpus.metadata.create_all(engine)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    parser_run = subparsers.add_parser('run', help='runs the bot')
    parser_run.add_argument('--supervised', action='store_true')
    parser_run.set_defaults(func=run_bot)
    parser_train = subparsers.add_parser('train', help='adds training data to the bot (using a specific id)')
    parser_train.add_argument('--id', type=str, required=True, help='the submission id of the post to review')
    parser_train.set_defaults(func=train_id)
    parser_batch = subparsers.add_parser('train-batch', help='adds training data to the bot in batches')
    parser_batch.add_argument('--limit', type=int, required=True, help='the maximum number of posts to fetch')
    parser_batch.set_defaults(func=train_batch)
    parser_token = subparsers.add_parser('create-token', help='gets an access token with your client id/secret')
    parser_token.set_defaults(func=create_token)
    parser_init = subparsers.add_parser('init', help='initialises the database, ready to insert training data')
    parser_init.set_defaults(func=initialise_database)
    parser_classify = subparsers.add_parser('classify', help='classifies a specific post using the trained data')
    parser_classify.add_argument('--id', type=str, required=True, help='the submission id of the post to classify')
    parser_classify.set_defaults(func=classify_item)
    args = parser.parse_args(sys.argv[1:])
    # Fix: on Python 3, add_subparsers() no longer makes a subcommand
    # mandatory, so a bare invocation crashed with AttributeError on
    # args.func. Print usage instead.
    if not hasattr(args, 'func'):
        parser.print_help()
        sys.exit(1)
    args.func(args)
class Corpus(Base):
    """A single labelled training example: a reddit post and its category."""
    __tablename__ = 'corpora'
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Self-text (body) of the submission.
    text = Column(String, nullable=False)
    # Title of the submission.
    title = Column(String, nullable=False)
    # Label assigned by a human reviewer; nullable so posts can be stored
    # first and categorised later (see review_corpus.py).
    category = Column(String, nullable=True)
# Hyperparameter grid for the LinearSVC wrapped by main.Classifier.
param_grid = [
    {
        'C': [1, 5, 10, 50],
        'loss': ['hinge', 'squared_hinge'],
        'tol': [1e-6, 1e-4, 1e-2, 1e-1],
        'multi_class': ['ovr', 'crammer_singer'],
        'class_weight': ['balanced']
    }
]

if __name__ == '__main__':
    # Load the labelled corpus from the configured database.
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]
    classifier = main.Classifier()
    # Fix: vectorize the corpus once and reuse the matrix; the original
    # discarded the fit_transform() result and transformed a second time.
    features = classifier.vectorizer.fit_transform(data_values)
    grid_search = GridSearchCV(classifier.classifier, param_grid, n_jobs=-1)
    grid_search.fit(features, data_targets)
    print('Best score: {}'.format(grid_search.best_score_))
    parameters = grid_search.best_estimator_.get_params()
    # Iterate items() directly instead of keys() followed by a lookup.
    for parameter, value in parameters.items():
        print("{} - {}".format(parameter, value))
"""
Valid categories:
- good
- off_topic (incl. bad questions)
- faq_get_started (incl. getting started with a project - where do I start?)
- faq_career
- faq_resource (incl. challenges e.g. codewars)
- faq_resource_podcast
- faq_tool (incl. laptop specs)
- faq_language (e.g. how much should I know before I am expert, which should I
pick)
- faq_other (including motivation, 'does a programmer google?', project ideas etc.)
- faq_what_now (what to do after codecademy etc.)

"""

# This allows for Python 3 compatibility by replacing input() on Python 2
if sys.version_info[:2] <= (2, 7):
    input = raw_input

if __name__ == '__main__':
    # Walk every stored post and let a human (re-)assign its category.
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    for message in data:
        print(message.title)
        print('----')
        print(message.text)
        print('')
        category = input('Enter category for post: ')
        # Fix: pressing enter now keeps the existing category instead of
        # overwriting it with an empty string (consistent with the
        # skip-on-empty behaviour of train_bot in main.py).
        if category == '':
            continue
        message.category = category
    # Single commit persists all relabelled rows at once.
    session.commit()
if __name__ == '__main__':
    # Plot a cross-validated learning curve for the bot's classifier.
    # Adapted from http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html (BSD license)
    engine = create_engine(settings.DATABASE_URI)
    Session = sessionmaker(bind=engine)
    session = Session()
    data = session.query(model.Corpus).all()
    # Each training example is the post title concatenated with its body.
    data_values = [col.title + ' ' + col.text for col in data]
    data_targets = [col.category for col in data]
    classifier = main.Classifier()
    # Fit the vectorizer on the full corpus so transform() below works.
    classifier.vectorizer.fit_transform(data_values)
    train_sizes, train_scores, test_scores = learning_curve(classifier.classifier, classifier.vectorizer.transform(data_values), data_targets, cv=6)
    # Mean/std across the 6 folds, used for the shaded bands below.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.title('LearnProgrammingBot Training Scores')
    plt.xlabel("Training Samples")
    plt.ylabel("Accuracy")
    plt.grid()

    # Shaded bands: one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.ylim(0, 1)
    # Fix: the label= kwargs above were never displayed because legend()
    # was missing (the sklearn example this adapts calls it).
    plt.legend(loc="best")
    plt.show()
class TestClassifier(unittest.TestCase):
    """
    End-to-end accuracy check: trains on 80% of the stored corpus and
    evaluates on the held-out 20%, failing if the false-positive rate
    exceeds 5% or overall accuracy drops below 60%.
    """
    def test_classifications(self):
        false_positives = 0
        false_negatives = 0
        correct = 0
        wrong = 0
        engine = create_engine('sqlite:///data.db')
        Session = sessionmaker(bind=engine)
        session = Session()
        training_data = session.query(model.Corpus).all()
        training_values = [rec.title + ' ' + rec.text for rec in training_data]
        training_targets = [rec.category for rec in training_data]
        # Fixed random_state keeps the train/test split reproducible.
        training_values, testing_values, training_targets, testing_targets = cross_validation.train_test_split(training_values, training_targets, test_size=0.2, random_state=0)
        classifier = main.Classifier(training_values, training_targets)
        for (i, message_text) in enumerate(testing_values):
            classification = classifier.classify(message_text)[0]
            if testing_targets[i] == 'good' and classification != 'good':
                # A 'good' post wrongly flagged would receive a spurious
                # bot reply, so these are tracked (and bounded) separately.
                false_positives += 1
                print(message_text)
                print('[Suspected {}; actually good]'.format(classification))
                print('---')
            elif testing_targets[i] != 'good' and classification == 'good':
                false_negatives += 1
            elif testing_targets[i] == classification:
                correct += 1
            else:
                wrong += 1
                print(message_text)
                print('[Suspected {}; actually {}]'.format(classification, testing_targets[i]))
                print('---')
        total = float(len(testing_values))
        print('{} false positives ({})'.format(false_positives, false_positives / total))
        print('{} false negatives ({})'.format(false_negatives, false_negatives / total))
        print('{} correct ({})'.format(correct, correct / total))
        print('{} wrong ({})'.format(wrong, wrong / total))
        # Fix: use unittest assertions instead of raising bare Exception,
        # so threshold breaches are reported as test failures (with the
        # measured rate) rather than errors.
        self.assertLessEqual(false_positives / total, 0.05,
                             'False positive rate too high!')
        self.assertGreaterEqual(correct / total, 0.6,
                                'Correct identification rate too low!')

if __name__ == '__main__':
    unittest.main()