├── redditnetwork ├── __init__.py ├── utils │ ├── __init__.py │ ├── ioutils.py │ ├── dateutils.py │ ├── fastfreqdist.py │ ├── stringutils.py │ └── datautils.py ├── constants.py ├── network_extractor.py └── corpus_reader.py ├── setup.py ├── .gitignore ├── README.md └── network_example.ipynb /redditnetwork/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redditnetwork/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redditnetwork/constants.py: -------------------------------------------------------------------------------- 1 | DATA_HOME="/dfs/scratch0/reddit/" 2 | SPACY_COMMENTS=DATA_HOME+"spacy_comments/" 3 | -------------------------------------------------------------------------------- /redditnetwork/utils/ioutils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cPickle as pickle 3 | import json 4 | 5 | def mkdir(directory): 6 | if not os.path.exists(directory): 7 | os.makedirs(directory) 8 | 9 | def write_pickle(data, filename): 10 | fp = open(filename, "wb") 11 | pickle.dump(data, fp) 12 | 13 | def load_pickle(filename): 14 | fp = open(filename, "rb") 15 | return pickle.load(fp) 16 | 17 | def write_json(data, filename): 18 | fp = open(filename, "wb") 19 | json.dump(data, fp) 20 | 21 | def load_json(filename): 22 | fp = open(filename, "rb") 23 | return json.load(fp) 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | setup(name='redditnetwork', 5 | version='1.0', 6 | description='Code for accessing processing Reddit data', 7 | author='William L. 
Hamilton', 8 | author_email='will.leif.hamiltion@gmail.com', 9 | license='MIT', 10 | install_requires=['spacy==1.2.0', 11 | 'networkx', 12 | 'numpy', 13 | 'isoweek', 14 | 'pandas', 15 | 'nltk' 16 | ], 17 | package_data={'redditnetwork': ['README.md']}, 18 | packages=find_packages()) 19 | -------------------------------------------------------------------------------- /redditnetwork/utils/dateutils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from isoweek import Week 3 | import calendar 4 | 5 | def get_week(timestamp): 6 | timestamp = datetime.datetime.utcfromtimestamp(float(timestamp)) 7 | date = timestamp.date() 8 | iso_info = date.isocalendar() 9 | week = iso_info[1] - 1 10 | return week 11 | 12 | def get_week_timestamp(year, week): 13 | d = Week(year, week).monday() 14 | return calendar.timegm(d.timetuple()) 15 | 16 | def day_week(timestamp): 17 | timestamp = datetime.datetime.utcfromtimestamp(float(timestamp)) 18 | date = timestamp.date() 19 | iso_info = date.isocalendar() 20 | week = iso_info[1] - 1 21 | day = week * 7 + iso_info[2] - 1 22 | return day, week 23 | 24 | def month_year(timestamp): 25 | date = datetime.datetime.utcfromtimestamp(int(timestamp)) 26 | return (date.year, date.month) 27 | 28 | def month_year_add(month_year, increment): 29 | month_year = (month_year[0] + (increment + month_year[1] - 1) / 12, 30 | (month_year[1] + increment - 1) % 12 + 1) 31 | return month_year 32 | 33 | def previous_month_year(month_year): 34 | month_year = (month_year[0], month_year[1]-1) 35 | if month_year[1] < 1: 36 | month_year= (month_year[0] - 1, 12) 37 | return month_year 38 | -------------------------------------------------------------------------------- /redditnetwork/utils/fastfreqdist.py: -------------------------------------------------------------------------------- 1 | from nltk.probability import FreqDist, MLEProbDist 2 | import numpy as np 3 | 4 | class CachedFreqDist(FreqDist): 5 | """ 6 | A read only version of nltk's FreqDist that caches the sample size for speed. 7 | DO NOT UPDATE COUNTS OF THIS OBJECT, resulting frequencies will not sum to one. 
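    (The total N is computed once in __init__ from the initial counts, so any later updates to
    this object are not reflected in N, which is why the frequencies would no longer sum to one.)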
8 | """ 9 | def __init__(self, freqdist): 10 | FreqDist.__init__(self, freqdist) 11 | self._N = np.sum(self.values()) 12 | 13 | def N(self): 14 | return self._N 15 | 16 | # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, 17 | # here, freq() does probs 18 | def freq(self, sample): 19 | if self.N() == 0: 20 | return 0 21 | return float(self[sample]) / self.N() 22 | 23 | class MultiGenMLEProbDist(MLEProbDist): 24 | """ 25 | An extension of nltk's MLEProbDist that allows for fast sampling for larger sample sizes 26 | """ 27 | 28 | def __init__(self, freqdist, bins=None): 29 | MLEProbDist.__init__(self, freqdist, bins) 30 | self._probarray = np.zeros((len(freqdist),)) 31 | self._probmap = {} 32 | for i, item in enumerate(freqdist.keys()): 33 | self._probarray[i] = freqdist.freq(item) 34 | self._probmap[i] = item 35 | 36 | def generate_many(self, n): 37 | return {self._probmap[i]:count for i, count in 38 | enumerate(np.random.multinomial(n, self._probarray)) if count != 0} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | seaborn 3 | scikits-bootstrap 4 | pattern 5 | statsmodels 6 | .git_old 7 | scikit-learn 8 | *mallet* 9 | venv 10 | *egg-info* 11 | build 12 | dist 13 | .ipynb_checkpoints 14 | 15 | *.class 16 | 17 | *.c 18 | 19 | # file system nonsense 20 | *.DS_Store 21 | *.__afs* 22 | 23 | # swp files 24 | *.swo 25 | *.swp 26 | 27 | # Cython and c build remnants 28 | build 29 | *.o 30 | *.so 31 | 32 | # Python build remnants 33 | *.pyc 34 | 35 | ## Core latex/pdflatex auxiliary files: 36 | *.pdf 37 | *.aux 38 | *.lof 39 | *.log 40 | *.lot 41 | *.fls 42 | *.out 43 | *.toc 44 | 45 | ## Intermediate documents: 46 | *.dvi 47 | *-converted-to.* 48 | # these rules might exclude image files for figures etc. 
49 | # *.ps 50 | # *.eps 51 | # *.pdf 52 | 53 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 54 | *.bbl 55 | *.bcf 56 | *.blg 57 | *-blx.aux 58 | *-blx.bib 59 | *.brf 60 | *.run.xml 61 | 62 | ## Build tool auxiliary files: 63 | *.fdb_latexmk 64 | *.synctex.gz 65 | *.synctex.gz(busy) 66 | *.pdfsync 67 | 68 | ## Auxiliary and intermediate files from other packages: 69 | 70 | # algorithms 71 | *.alg 72 | *.loa 73 | 74 | # amsthm 75 | *.thm 76 | 77 | # beamer 78 | *.nav 79 | *.snm 80 | *.vrb 81 | 82 | #(e)ledmac/(e)ledpar 83 | *.end 84 | *.[1-9] 85 | *.[1-9][0-9] 86 | *.[1-9][0-9][0-9] 87 | *.[1-9]R 88 | *.[1-9][0-9]R 89 | *.[1-9][0-9][0-9]R 90 | *.eledsec[1-9] 91 | *.eledsec[1-9]R 92 | *.eledsec[1-9][0-9] 93 | *.eledsec[1-9][0-9]R 94 | *.eledsec[1-9][0-9][0-9] 95 | *.eledsec[1-9][0-9][0-9]R 96 | 97 | # glossaries 98 | *.acn 99 | *.acr 100 | *.glg 101 | *.glo 102 | *.gls 103 | 104 | # hyperref 105 | *.brf 106 | 107 | # listings 108 | *.lol 109 | 110 | # makeidx 111 | *.idx 112 | *.ilg 113 | *.ind 114 | *.ist 115 | 116 | # minitoc 117 | *.maf 118 | *.mtc 119 | *.mtc0 120 | 121 | # minted 122 | *.pyg 123 | 124 | # morewrites 125 | *.mw 126 | 127 | # nomencl 128 | *.nlo 129 | 130 | # sagetex 131 | *.sagetex.sage 132 | *.sagetex.py 133 | *.sagetex.scmd 134 | 135 | # sympy 136 | *.sout 137 | *.sympy 138 | sympy-plots-for-*.tex/ 139 | 140 | # todonotes 141 | *.tdo 142 | 143 | # xindy 144 | *.xdy 145 | -------------------------------------------------------------------------------- /redditnetwork/utils/stringutils.py: -------------------------------------------------------------------------------- 1 | import string 2 | import nltk 3 | import re 4 | 5 | NLTK_STOP = set(nltk.corpus.stopwords.words('english')) 6 | PUNCTUATION = set(string.punctuation) 7 | URL = ['www', '.com', '.net', '.org', '.edu', '//', '/u/', '/r/', 'http'] 8 | lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 9 | HTTP = re.compile("https?$") 10 | BOT = re.compile("bot\d*$") 11 | 12 | def is_bot(word): 13 | word = word.lower() 14 | return bool(BOT.search(word)) 15 | 16 | def is_punkt(word): 17 | return word in PUNCTUATION 18 | 19 | def is_http(word): 20 | return bool(HTTP.match(word)) 21 | 22 | def has_numbers(input_string): 23 | return bool(re.search(r'\d', input_string)) 24 | 25 | def is_ascii(s): 26 | try: 27 | s.decode("ascii") 28 | except UnicodeEncodeError: 29 | return False 30 | else: 31 | return True 32 | 33 | def is_url(word): 34 | b = [1 if t in word else 0 for t in URL] 35 | return sum(b) > 0 36 | 37 | def clean_word_replace(word): 38 | word = word.strip() 39 | word = word.strip(string.punctuation) 40 | word = word.lower() 41 | if not is_ascii(word): 42 | return "" 43 | elif is_http(word): 44 | return "" 45 | elif has_numbers(word): 46 | return "" 47 | else: 48 | return word 49 | 50 | def clean_word(word, lower=True, stem=True, remove_stop=True): 51 | word = word.strip() 52 | word = word.strip(string.punctuation) 53 | w = word.lower() 54 | if remove_stop: 55 | if word.startswith("'"): 56 | return "" 57 | if not is_ascii(word): 58 | return "" 59 | b = [1 if t in word else 0 for t in URL] 60 | if sum(b) > 0: 61 | return "" 62 | if w in PUNCTUATION or w in NLTK_STOP: 63 | return "" 64 | if has_numbers(w): 65 | return "" 66 | if stem: 67 | w = lemmatizer.lemmatize(w) # for nouns 68 | w = lemmatizer.lemmatize(w, pos = 'v') # for verbs 69 | return w 70 | 71 | def is_stop(word): 72 | if len(word) == 1: 73 | return True 74 | elif word.isdigit(): 75 | return True 76 | elif word in NLTK_STOP: 77 | return True 78 | 
else: 79 | return False 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # redditnetwork 2 | 3 | Code for managing the Reddit dataset (assuming the user has access to the Stanford Reddit Data). 4 | 5 | ## Installing / set-up 6 | 7 | The code requires version 1.2.0 of spacy in order to load and manipulate the pre-processed text data (see https://spacy.io/docs/api/doc). 8 | If you are using a newer version of spacy and don't want to downgrade, it is recommended that you use a virtual environment, i.e., run the following in the root redditnetwork directory: 9 | 10 | pip install virtualenv 11 | python -m virtualenv venv 12 | source venv/bin/activate 13 | 14 | After you have (optionally) set up the virtual environment, run 15 | 16 | python setup.py install 17 | python -m spacy.en.download all 18 | python -m nltk.downloader stopwords 19 | The first command installs the package and all necessary dependencies. 20 | The second command downloads the spacy model data. It will download around 1GB of data (including pre-trained word vectors) into the spacy installation directory, so make sure you have space in that directory. 21 | The third command downloads the standard nltk stopword lists; this should only take a few seconds. 22 | Note that these models might already be installed/downloaded. There is no need to reinstall/download if this is the case. 23 | 24 | 25 | ## Using the code 26 | 27 | There are two main use cases for this code: 28 | 1) Accessing the Reddit post and comment corpus. 29 | 2) Extracting heterogeneous/multilayer networks from the data. 30 | 31 | ### Accessing and iterating over comments and posts 32 | 33 | The `corpus_reader.py` file contains a number of useful classes that allow you to access and iterate over the processed Reddit data. 34 | If you have read access to the Stanford Reddit data (hosted by the InfoLab) then these should work out of the box. 35 | The data is designed so that you can access comments or posts from a specific subreddit for a specific time-period (month). 36 | 37 | 38 | When you iterate over comments for a particular subreddit/month, 39 | each comment will be represented by a dictionary with attributes corresponding to the comment's score, author, timestamp, etc. 40 | The processed text data is accessible via the "doc" attribute, which is a spacy Doc object (https://spacy.io/docs/api/doc). 41 | This Doc object contains the raw text, along with various processed/annotated versions (pos tags, lemmas, etc.). 42 | See the spacy docs for more info. 43 | 44 | ### Extracting multilayer networks from the data 45 | 46 | The `network_extractor.py` file contains code for extracting network data corresponding to one week of activity in a specific subreddit. 47 | See the `network_example.ipynb` notebook for an example and more information. 48 | -------------------------------------------------------------------------------- /redditnetwork/utils/datautils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | 6 | from redditnetwork import constants 7 | from redditnetwork.utils.ioutils import load_json 8 | 9 | 10 | DATA = constants.DATA_HOME 11 | 12 | def make_data_frame(communities, feature_dict): 13 | """ 14 | Makes a pandas dataframe for name, months, and dictionary of feature funcs.
15 | Each feature func should take name and return feature value. 16 | Constructed dataframe has flat csv style structure and missing values are removed. 17 | """ 18 | 19 | temp = collections.defaultdict(list) 20 | feature_dict["name"] = lambda name : name 21 | for name in communities: 22 | for feature, feature_func in feature_dict.iteritems(): 23 | temp[feature].append(feature_func(name)) 24 | df = pd.DataFrame(temp) 25 | df = df.replace([np.inf, -np.inf], np.nan) 26 | df = df.dropna() 27 | return df 28 | 29 | def make_data_frame_time(communities, time_range, feature_dict): 30 | """ 31 | Makes a pandas dataframe for name, months, and dictionary of feature funcs. 32 | Each feature func should take (name, month) and return feature value. 33 | Constructed dataframe has flat csv style structure and missing values are removed. 34 | """ 35 | 36 | temp = collections.defaultdict(list) 37 | feature_dict["name"] = lambda name, time : name 38 | feature_dict["time"] = lambda name, time : time 39 | for name in communities: 40 | for time in time_range: 41 | for feature, feature_func in feature_dict.iteritems(): 42 | temp[feature].append(feature_func(name, time)) 43 | df = pd.DataFrame(temp) 44 | df = df.replace([np.inf, -np.inf], np.nan) 45 | df = df.dropna() 46 | return df 47 | 48 | 49 | def read_filtered_users(): 50 | users = set() 51 | with open(DATA + 'filtered_users.txt') as fp: 52 | for line in fp: 53 | x = line.strip().split('\t') 54 | users.add(x[0]) 55 | return users 56 | 57 | def read_subreddit_names(year=None): 58 | exclude_set = set(load_json(constants.DATA_HOME + "exclude_set.json")) 59 | subs = set([]) 60 | if year == None: 61 | for year in constants.YEARS: 62 | subs.update([e.split(".")[0] for e in os.listdir(constants.DATA_HOME + "spacy_comments/" + str(year))]) 63 | else: 64 | subs.update([e.split(".")[0] for e in os.listdir(constants.DATA_HOME + "spacy_comments/" + str(year))]) 65 | return subs-exclude_set 66 | 67 | def valid_subreddits(): 68 | subreddits = [] 69 | with open(DATA + "total_comment_counts.tsv") as fp: 70 | for line in fp: 71 | subreddits.append(line.split("\t")[0]) 72 | return subreddits 73 | -------------------------------------------------------------------------------- /redditnetwork/network_extractor.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | 4 | from collections import Counter, defaultdict 5 | 6 | from redditnetwork.corpus_reader import PostMap, WeekIterWrapper, SpacyComments, MultiIterWrapper 7 | from redditnetwork.utils.dateutils import get_week_timestamp 8 | 9 | VEC_SIZE=300 10 | SIF=10e-4 11 | 12 | def extract_month_network_multisubreddits(subreddits, year, month): 13 | """ 14 | Extracts a multilayer network of users comments and posts for 15 | multiple subreddits from the specified month. 16 | """ 17 | post_map = {} 18 | for subreddit in subreddits: 19 | post_map.update(PostMap(subreddit, year, month).post_map) 20 | comment_iter = MultiIterWrapper([SpacyComments(subreddit, year, month) for 21 | subreddit in subreddits]) 22 | return extract_network(post_map, comment_iter, 0) 23 | 24 | 25 | def extract_week_network_multisubreddits(subreddits, year, week): 26 | """ 27 | Extracts a multilayer network of users comments and posts for 28 | multiple subreddits from the specified week. 
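    Example (as in network_example.ipynb; assumes read access to the underlying data):
        graph = extract_week_network_multisubreddits(["politics", "Libertarian"], 2014, 2)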
29 | """ 30 | post_map = {} 31 | for subreddit in subreddits: 32 | post_map.update(PostMap(subreddit, year, -1, week=week).post_map) 33 | comment_iter = MultiIterWrapper([WeekIterWrapper(SpacyComments, week, subreddit, year) for 34 | subreddit in subreddits]) 35 | return extract_network(post_map, comment_iter, 0) 36 | 37 | def extract_year_network(subreddit, year): 38 | """ 39 | Extracts a multi-layer network of users, comments, and posts. 40 | Data is taken from a specific month (num between 1 and 12) in a specific year. 41 | """ 42 | post_map = {} 43 | for month in range(1,13): 44 | post_map.update(PostMap(subreddit, year, month).post_map) 45 | comment_iter = MultiIterWrapper([SpacyComments(subreddit, year, month) for 46 | month in range(1,13)]) 47 | base_time = get_week_timestamp(year,0) 48 | return extract_network(post_map, comment_iter, base_time) 49 | 50 | 51 | def extract_month_network(subreddit, year, month): 52 | """ 53 | Extracts a multi-layer network of users, comments, and posts. 54 | Data is taken from a specific month (num between 1 and 12) in a specific year. 55 | """ 56 | post_map = PostMap(subreddit, year, month) 57 | comment_iter = SpacyComments(subreddit, year, month) 58 | #TODO: Actually do this... It is not a big deal since the values 59 | # will be internally consistent, but still... 60 | month_base_time = get_week_timestamp(year, month/4-2) 61 | return extract_network(post_map.post_map, comment_iter, month_base_time) 62 | 63 | def extract_week_network(subreddit, year, week): 64 | """ 65 | Extracts a multi-layer network of users, comments, and posts. 66 | Data is taken from a specific week (num between 1 and 50) in a specific year. 67 | """ 68 | post_map = PostMap(subreddit, year, -1, week=week) 69 | comment_iter = WeekIterWrapper(SpacyComments, week, subreddit, year) 70 | week_base_time = get_week_timestamp(year, week) 71 | 72 | return extract_network(post_map.post_map, comment_iter, week_base_time) 73 | 74 | def _get_embedding(doc, counter, total_count): 75 | vecs = [word.vector*(SIF / (counter[word.lower_]/total_count + SIF)) for word in doc if word.has_vector] 76 | if len(vecs) == 0 or np.isnan(np.sum(np.array(vecs))): 77 | return np.zeros((VEC_SIZE,)) 78 | else: 79 | vecs = np.array(vecs) 80 | vecs = np.mean(vecs, axis=0) 81 | return vecs 82 | 83 | def extract_network(post_map, comment_iter, base_time, idf=True): 84 | 85 | if idf: 86 | df = Counter() 87 | total_count = 0. 88 | for comment in comment_iter: 89 | for word in comment["doc"]: 90 | df[word.lower_] += 1 91 | total_count += 1. 92 | else: 93 | df = defaultdict(float) 94 | total_count = 1. 
95 | 96 | graph = nx.DiGraph(user_feats={}, 97 | post_feats = {"score" : 1, "time": 1, "num_comments": 1, "subreddit" : 1, "length" : 1, "word_vecs" : VEC_SIZE}, 98 | comment_feats = {"score" : 1, "time" : 1, "post_time_offset": 1, "length" : 1, "subreddit" : 1, "word_vecs" : VEC_SIZE}) 99 | 100 | ## Add all posts as nodes connected to their authors 101 | for post in post_map.values(): 102 | graph.add_node(post["id"], 103 | type="post", 104 | score=post["score"], 105 | num_comments=post["num_comments"], 106 | subreddit=post["subreddit"], 107 | time=(int(post["timestamp"])-base_time)/3600., 108 | length=len(post["doc"]), 109 | word_vecs=_get_embedding(post["doc"], df, total_count)) 110 | if not graph.has_node(post["author"]): 111 | graph.add_node(post["author"], type="user") 112 | graph.add_edge(post["author"], post["id"], type="user_post") 113 | 114 | skipped_missing_parent = 0 115 | skipped_missing_post = 0 116 | for i, comment in enumerate(comment_iter): 117 | # skip comments that don't respond to a post from this week 118 | if not comment["post"] in post_map: 119 | skipped_missing_post += 1 120 | continue 121 | # skip comments that don't respond to a parent from this week 122 | if comment["parent"] != comment["post"] and not graph.has_node(comment["parent"]): 123 | skipped_missing_parent += 1 124 | continue 125 | 126 | # add author node if necessary 127 | if not graph.has_node(comment["author"]): 128 | graph.add_node(comment["author"], type="user") 129 | 130 | # add comment node 131 | graph.add_node(comment["id"], 132 | type="comment", 133 | score=comment["score"], 134 | subreddit=comment["subreddit"], 135 | time=(comment["timestamp"]-base_time)/3600., 136 | post_time_offset=(comment["timestamp"]-int(post_map[comment["post"]]["timestamp"]))/3600., 137 | length=len(comment["doc"]), 138 | word_vecs=_get_embedding(comment["doc"], df, total_count)) 139 | 140 | # Add edges 141 | graph.add_edge(comment["author"], comment["id"], type="user_comment") 142 | if comment["parent"] != comment["post"]: 143 | graph.add_edge(comment["parent"], comment["id"], type="comment_comment") 144 | else: 145 | graph.add_edge(comment["post"], comment["id"], type="post_comment") 146 | 147 | print "Processed {:d} comments, of which {:d} were removed for missing post and {:d} for missing parent".format( 148 | i, skipped_missing_post, skipped_missing_parent) 149 | return graph 150 | -------------------------------------------------------------------------------- /redditnetwork/corpus_reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various objects for iterating over and accessing processed Reddit data. 3 | """ 4 | 5 | import json 6 | 7 | from redditnetwork import constants 8 | from redditnetwork.utils.datautils import read_filtered_users 9 | from redditnetwork.utils.stringutils import is_bot 10 | from redditnetwork.utils.dateutils import get_week 11 | 12 | from spacy.tokens.doc import Doc 13 | from spacy.en import English 14 | 15 | SPACY_VOCAB = English().vocab 16 | FILTERED_USERS = read_filtered_users() 17 | 18 | class MultiIterWrapper(): 19 | def __init__(self, iters): 20 | self.iters = iters 21 | 22 | def __iter__(self): 23 | for _iter in self.iters: 24 | for item in _iter: 25 | yield item 26 | 27 | class WeekIterWrapper(): 28 | """ 29 | Gets an iterator over a specific week. 30 | Takes a comment or post iterator class as an argument. 31 | Annoyingly weeks and months are not aligned....
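    To handle this, the wrapper reads two consecutive months of data and filters their
    contents down to the requested week (using get_week on each item's timestamp).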
32 | """ 33 | def __init__(self, cls, week, subreddit, year, **kw_args): 34 | # Throws an assertion if you try to get weeks that cross between 35 | # years 36 | assert week != 0 and week < 51 37 | self.week = week 38 | month = week / 4 + 1 39 | ## Week 1 for us == week 2 for ISO weeks 40 | self.iter1 = cls(subreddit, year, month, **kw_args) 41 | self.iter2 = cls(subreddit, year, month+1, **kw_args) 42 | 43 | def __iter__(self): 44 | for item in self.iter1.__iter__(week=self.week): 45 | yield item 46 | for item in self.iter2.__iter__(week=self.week): 47 | yield item 48 | 49 | class PostMap(): 50 | """ 51 | Map into post data. 52 | """ 53 | def __init__(self, subreddit, year, month, week=None, path=None, 54 | clean_deleted=True, clean_bots=True): 55 | if path == None: 56 | path = constants.DATA_HOME + "spacy_posts/" 57 | if not week is None: 58 | print "Warning: Using week argument and ignoring month" 59 | self.post_map = self._make_map( 60 | WeekIterWrapper(PostIterator, week, subreddit, 61 | year, path=path, 62 | clean_deleted=True, clean_bots=True)) 63 | else: 64 | self.post_map = self._make_map( 65 | PostIterator(subreddit, year, month, path=path, 66 | clean_deleted=True, clean_bots=True)) 67 | 68 | def _make_map(self, info_iterator): 69 | post_map = {} 70 | for entry in info_iterator: 71 | post_map[entry["id"]] = entry 72 | return post_map 73 | 74 | def get_post(self, id): 75 | return self.post_map[id] 76 | 77 | def __getitem__(self, id): 78 | return self.get_post(id) 79 | 80 | def __contains__(self, id): 81 | return id in self.post_map 82 | 83 | 84 | class PostIterator(): 85 | """ 86 | Iterator over post metadata only 87 | """ 88 | def __init__(self, subreddit, year, month, path=None, 89 | clean_deleted=True, clean_bots=True): 90 | if path == None: 91 | path = constants.DATA_HOME + "spacy_posts/" 92 | path += "{:d}_{:02d}/".format(year, month) + subreddit 93 | self._vocab = SPACY_VOCAB 94 | self.path = path 95 | self._len = None 96 | self.clean_bots = clean_bots 97 | self.clean_deleted = clean_deleted 98 | self.subreddit = subreddit 99 | 100 | def _parse_info(self, line): 101 | info = json.loads(line) 102 | info["subreddit"] = self.subreddit 103 | return info 104 | 105 | def __len__(self): 106 | if self._len == None: 107 | i = -1 108 | with open(self.path + ".info") as fp: 109 | for i, _ in enumerate(fp): 110 | pass 111 | self._len = i + 1 112 | return self._len 113 | 114 | def __iter__(self, week=None): 115 | with open(self.path + ".info") as info: 116 | with open(self.path + ".title.bin") as title_bin: 117 | for byte_string in Doc.read_bytes(title_bin): 118 | info_line = info.readline() 119 | comment_info = self._parse_info(info_line) 120 | if not (week is None) and get_week(comment_info["timestamp"]) != week: 121 | continue 122 | if self.clean_deleted and comment_info["author"] == "[deleted]": 123 | continue 124 | if self.clean_bots and (is_bot(comment_info["author"]) or 125 | comment_info["author"] in FILTERED_USERS): 126 | continue 127 | comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string) 128 | yield comment_info 129 | 130 | 131 | class InfoIterator(): 132 | """ 133 | Iterator over comment metadata only 134 | """ 135 | def __init__(self, subreddit, year, month=None, path=None, 136 | clean_deleted=True, clean_bots=True): 137 | if path == None: 138 | path = constants.DATA_HOME + "spacy_comments/" 139 | # self._vocab = Vocab.load(path + u"vocab.bin") 140 | # Annoyingly inefficient but necessary for now 141 | if not month is None: 142 | path += "{:d}_{:02d}/".format(year, 
month) + subreddit 143 | else: 144 | path += "{:d}/".format(year) + subreddit 145 | self.path = path 146 | self._len = None 147 | self.clean_deleted = clean_deleted 148 | self.clean_bots = clean_bots 149 | 150 | def _parse_info(self, line): 151 | info = line.split("\t") 152 | comment_info = {"id" : info[0], 153 | "timestamp" : int(info[1]), 154 | "author" : info[2], 155 | "score" : int(info[3]), 156 | "parent" : info[4], 157 | "post" : info[5].strip()} 158 | return comment_info 159 | 160 | def __len__(self): 161 | if self._len == None: 162 | i = -1 163 | with open(self.path + ".info") as fp: 164 | for i, _ in enumerate(fp): 165 | pass 166 | self._len = i + 1 167 | return self._len 168 | 169 | def __iter__(self): 170 | with open(self.path + ".info") as info: 171 | for line in info: 172 | comment_info = self._parse_info(line) 173 | if self.clean_deleted and comment_info["author"] == "[deleted]": 174 | continue 175 | if self.clean_bots and (is_bot(comment_info["author"]) or 176 | comment_info["author"] in FILTERED_USERS): 177 | continue 178 | yield comment_info 179 | 180 | class SpacyComments(): 181 | """ 182 | Iterator over spacy comments. 183 | """ 184 | 185 | def __init__(self, subreddit, year, month=None, path=None, 186 | include_punct=True, down_sample=None, clean_bots=True, clean_deleted=True): 187 | if path == None: 188 | path = constants.DATA_HOME + "spacy_comments/" 189 | self._vocab = SPACY_VOCAB 190 | if not month is None: 191 | path += "{:d}_{:02d}/".format(year, month) + subreddit 192 | else: 193 | path += "{:d}/".format(year) + subreddit 194 | self.path = path 195 | self._len = None 196 | self.clean_bots = clean_bots 197 | self.clean_deleted = clean_deleted 198 | self.include_punct = include_punct 199 | self.subreddit = subreddit 200 | 201 | def _spacy_string_clean(self, token): 202 | if token.like_url: 203 | return "" 204 | elif token.like_num: 205 | return "" 206 | elif (not self.include_punct) and token.is_punct and (not token.tag_ == "."): 207 | return "" 208 | else: 209 | return token.lower_ 210 | 211 | def _text_from_doc(self, doc): 212 | return " ".join([self._spacy_string_clean(token) for token in doc]) 213 | 214 | def _parse_info(self, line): 215 | info = line.split("\t") 216 | comment_info = {"id" : info[0], 217 | "timestamp" : int(info[1]), 218 | "author" : info[2], 219 | "score" : int(info[3]), 220 | "parent" : info[4], 221 | "subreddit" : self.subreddit, 222 | "post" : info[5].strip()} 223 | return comment_info 224 | 225 | def __len__(self): 226 | if self._len == None: 227 | with open(self.path + ".info") as fp: 228 | for i, _ in enumerate(fp): 229 | pass 230 | self._len = i + 1 231 | return self._len 232 | 233 | def __iter__(self, week=None): 234 | with open(self.path + ".bin", "rb") as bin: 235 | with open(self.path + ".info") as info: 236 | for byte_string in Doc.read_bytes(bin): 237 | comment_info = self._parse_info(info.next()) 238 | if (not week is None) and get_week(comment_info["timestamp"]) != week: 239 | continue 240 | if self.clean_deleted and comment_info["author"] == "[deleted]": 241 | continue 242 | if self.clean_bots and (is_bot(comment_info["author"]) or 243 | comment_info["author"] in FILTERED_USERS): 244 | continue 245 | doc = Doc(self._vocab).from_bytes(byte_string) 246 | comment_info["doc"] = doc 247 | comment_info["text"] = self._text_from_doc(doc) 248 | yield comment_info 249 | -------------------------------------------------------------------------------- /network_example.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Extracting a multilayer network from Reddit data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### First we load the network and check some basic stats" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from redditnetwork.network_extractor import extract_week_network\n", 26 | "import networkx as nx" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Warning: Using week argument and ignoring month\n", 41 | "Processed 45097 comments, of which 12378 were removed for missing post and 5888 for missing parent\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# We will use /r/politics subreddit as the running example\n", 47 | "# We extract a network for this subreddit, corresponding to the first week of 2014\n", 48 | "politics_net = extract_week_network(\"politics\", 2014, 1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Ignore the warning about using the week argument instead of month. (This is just an internal complication due to the fact that the data is stored at the monthly level but we are accessing weeks).\n", 56 | "\n", 57 | "Once the data finishes processing it will say that it processed a certain number of comments and removed some due to having a missing parent or post (e.g., they were replying to an old post from an earlier week).\n", 58 | "\n", 59 | "The returned object is a networkx DiGraph (directed graph)." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "There are 8992 users, 26832 comments, and 2368 posts in the graph\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "## some basic stats:\n", 79 | "print \"There are {:d} users, {:d} comments, and {:d} posts in the graph\"\\\n", 80 | " .format(len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"user\"]),\n", 81 | " len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"comment\"]),\n", 82 | " len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"post\"]))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Okay, and now some details on the data \n", 90 | "\n", 91 | "The underlying structure is a directed graph (DiGraph) and additional information is stored as node and edge attributes.\n", 92 | "\n", 93 | "#### Node types\n", 94 | "\n", 95 | "Every node as an \"type\" attribute that is one of \"user\", \"comment\", or \"post\".\n", 96 | "Users are indexed by their username and post/comments by unique string ids. 
\n", 97 | "\n", 98 | "#### Edge types\n", 99 | "\n", 100 | "Every edge has a \"type\" attribute as well, which is one of the following:\n", 101 | "* \"user_post\": a directed edge from a user to a post they made.\n", 102 | "* \"user_comment\": a directed edge from a user to a comment they made.\n", 103 | "* \"post_comment\": a directed edge from a post to a top-level comment in that post.\n", 104 | "* \"comment_comment\": a directed edge from a comment to a comment that replies to it. \n", 105 | "\n", 106 | "#### Node attributes/features\n", 107 | "\n", 108 | "Comment nodes and post nodes also additional features/attributes (which can be listed by running politics_net.graph; see the example below). User nodes currently have no features (besides those that are implicit in the graph structure). \n", 109 | "\n", 110 | "##### Comment features\n", 111 | "* score: score that comment received\n", 112 | "* time: describes when the comment was made during the week (hour offset from 12:00am on Monday of that week).\n", 113 | "* post_time_offset: how old was the post when the comment was made (in hours)\n", 114 | "* length: how many words in the comment\n", 115 | "* word_vec: 300 dimensional vector embedding of the comment (tf-idf average of GloVe vectors)\n", 116 | "\n", 117 | "##### Post features\n", 118 | "* score: score that the post recieved\n", 119 | "* time: when was the post made during the week (hour offset from 12:00 on Monday of that week)\n", 120 | "* length: number of words in the title\n", 121 | "* word_vec: vector embedding of post title (average of Glove vectors)\n", 122 | "\n", 123 | "*NOTE THAT NONE OF THESE FEATURES ARE THE \"LABELS\" WE WANT TO PREDICT.* That data is stored elsewhere for now because I don't want to clutter the network representations and because the \"labels\" are in flux. See the bottom of this notebook for an example of how to get the labels for predictions." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 4, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "{'comment_feats': {'length': 1,\n", 137 | " 'post_time_offset': 1,\n", 138 | " 'score': 1,\n", 139 | " 'subreddit': 1,\n", 140 | " 'time': 1,\n", 141 | " 'word_vecs': 300},\n", 142 | " 'post_feats': {'length': 1,\n", 143 | " 'num_comments': 1,\n", 144 | " 'score': 1,\n", 145 | " 'subreddit': 1,\n", 146 | " 'time': 1,\n", 147 | " 'word_vecs': 300},\n", 148 | " 'user_feats': {}}" 149 | ] 150 | }, 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# this prints info about what features there are and the dimensionality of these features\n", 158 | "politics_net.graph" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "['cejaksn']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# lets access the node for a random user \n", 178 | "# and get all comments and posts that this user made\n", 179 | "user_out_nodes = politics_net.successors(\"RedSquirrelFtw\")\n", 180 | "print user_out_nodes" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "{'word_vecs': array([ 2.45881882e-02, -8.85956455e-03, 4.07702522e-03,\n", 195 | " -3.59144271e-03, -5.35505451e-03, 3.04689351e-03,\n", 196 | " -2.86572031e-05, 6.46826986e-04, 3.98649042e-03,\n", 197 | " -3.48688639e-03, 3.40964980e-02, 3.39702074e-03,\n", 198 | " -2.66911592e-02, 9.43523180e-03, -2.05980968e-02,\n", 199 | " -2.33542006e-02, -2.23564263e-02, -4.97682840e-02,\n", 200 | " 2.15058471e-03, 6.99266186e-03, -1.03599476e-02,\n", 201 | " -3.42106936e-03, -1.32135861e-03, -3.16169374e-02,\n", 202 | " 1.49107622e-02, 4.38282602e-02, -1.15861988e-03,\n", 203 | " -5.54729579e-03, -6.17341464e-03, 1.52532337e-02,\n", 204 | " 1.30888699e-02, 1.42863719e-02, 5.32958051e-03,\n", 205 | " 6.43259101e-03, -2.33824583e-04, -1.21295080e-02,\n", 206 | " -4.83304122e-03, -6.96073147e-03, -6.06134126e-04,\n", 207 | " 1.71746537e-02, 8.68919492e-03, 1.78009700e-02,\n", 208 | " 1.27696199e-02, -1.09810466e-02, -3.44701274e-03,\n", 209 | " -4.43779491e-03, -2.83656735e-03, -1.35982307e-02,\n", 210 | " 9.29598790e-03, 1.40077816e-02, 4.03900212e-03,\n", 211 | " -1.54639157e-02, 1.17861321e-02, -1.46402763e-02,\n", 212 | " -3.31898057e-03, 3.37800458e-02, 1.95675101e-02,\n", 213 | " -3.57995667e-02, 2.05528438e-02, -1.94929559e-02,\n", 214 | " -3.00960094e-02, 6.92273676e-03, -3.29098515e-02,\n", 215 | " -7.45723071e-03, -1.42299989e-03, -8.70021619e-03,\n", 216 | " 1.11240067e-03, -1.51177878e-02, 2.87857745e-02,\n", 217 | " 3.37396264e-02, 9.56202578e-03, -1.05087310e-02,\n", 218 | " -1.07406760e-02, 1.49361016e-02, 2.01773494e-02,\n", 219 | " 2.91823908e-05, 4.99374466e-03, -1.14855031e-02,\n", 220 | " 5.58512053e-03, -8.54704715e-03, 4.02356274e-02,\n", 221 | " -3.03811803e-02, 8.61220621e-03, 4.28446494e-02,\n", 222 | " -9.69701540e-03, -5.98392868e-03, -9.92416963e-03,\n", 223 | " 9.10818484e-03, 1.59019697e-02, -5.44882799e-03,\n", 224 | " 1.45760819e-03, -3.55789065e-03, -2.28537503e-03,\n", 225 | " 
-2.61299759e-02, 1.90544985e-02, -1.77889783e-02,\n", 226 | " -1.39860585e-02, 6.76063960e-03, 1.38605768e-02,\n", 227 | " 9.62258037e-03, -1.29944673e-02, -2.23032534e-02,\n", 228 | " -1.12707019e-02, -3.23294364e-02, -1.73050631e-02,\n", 229 | " 1.61380600e-03, 7.97309820e-03, 2.07648035e-02,\n", 230 | " -2.09515914e-02, -1.72264632e-02, 2.98629012e-02,\n", 231 | " -1.17384242e-02, 1.86214726e-02, 9.00912005e-03,\n", 232 | " 1.10343313e-02, 4.74179303e-03, 3.79112177e-02,\n", 233 | " 2.15634611e-02, -1.51630966e-02, 2.94514317e-02,\n", 234 | " 1.61693618e-02, 2.19558049e-02, 7.10119260e-03,\n", 235 | " 1.03538474e-02, 7.77509485e-05, 3.03796474e-02,\n", 236 | " 1.42980209e-02, 2.52941120e-02, -7.30073452e-03,\n", 237 | " 2.67049880e-03, -3.23295183e-02, -4.56356490e-03,\n", 238 | " -1.13542946e-02, 1.67651456e-02, 1.88513268e-02,\n", 239 | " 1.45909078e-02, 1.26617374e-02, 2.57562962e-03,\n", 240 | " 2.61628558e-03, -9.08431411e-03, 1.14472574e-02,\n", 241 | " 7.94017408e-03, 1.20475926e-02, 6.06606854e-03,\n", 242 | " 1.72127299e-02, 3.30692828e-02, 2.73634796e-03,\n", 243 | " -1.44931115e-03, -1.57310385e-02, -9.87053290e-03,\n", 244 | " 1.01823714e-02, 1.41003141e-02, -2.85259262e-03,\n", 245 | " -2.85569229e-03, -2.15815436e-02, 3.00777871e-02,\n", 246 | " -9.35850013e-03, -1.59715936e-02, -9.89310350e-03,\n", 247 | " -6.48096018e-03, 8.47815443e-03, -1.39471488e-02,\n", 248 | " -1.54531682e-02, 1.39459819e-02, -3.64065021e-02,\n", 249 | " -1.36025399e-02, 1.38182156e-02, -6.87898695e-03,\n", 250 | " -1.50948400e-02, -2.98325270e-02, -6.52712537e-03,\n", 251 | " 1.41928094e-02, -1.52701763e-02, 9.46271932e-04,\n", 252 | " 3.33177461e-03, -7.86158908e-03, -1.73139188e-03,\n", 253 | " 2.13753339e-02, 1.40343681e-02, 6.07236812e-04,\n", 254 | " -3.50858620e-03, 3.21572740e-03, 1.88201424e-02,\n", 255 | " 5.38636697e-04, -1.99504918e-03, 8.32799729e-03,\n", 256 | " -9.34114214e-03, 1.05030613e-03, 2.49872357e-02,\n", 257 | " -1.37836263e-02, -5.93390130e-03, 3.22293714e-02,\n", 258 | " 5.87111758e-03, -3.06474995e-02, -2.33445037e-02,\n", 259 | " -2.20932271e-02, 1.08240033e-03, -1.84142999e-02,\n", 260 | " -1.48790656e-02, -1.64782442e-02, -7.43377954e-03,\n", 261 | " -1.01498319e-02, 1.35151120e-02, 1.11000063e-02,\n", 262 | " 2.05843598e-02, 1.21982545e-02, 2.34148884e-03,\n", 263 | " 1.59241911e-02, 2.14853045e-02, 1.02746207e-02,\n", 264 | " 2.46614888e-02, -9.97475255e-03, 2.22080369e-02,\n", 265 | " -2.92910635e-03, -1.70956121e-03, 8.48170649e-03,\n", 266 | " 2.91286502e-02, 7.95706734e-03, 1.02885272e-02,\n", 267 | " -7.11069396e-03, -9.60137043e-03, 3.87477353e-02,\n", 268 | " 5.41670388e-03, -7.04232231e-03, 6.28765486e-03,\n", 269 | " 6.94147125e-03, -1.28068291e-02, -3.23408772e-03,\n", 270 | " 1.30555267e-02, 2.38443818e-02, -1.96164623e-02,\n", 271 | " -1.58533361e-02, 9.44936834e-03, -1.23057445e-03,\n", 272 | " -1.55784115e-02, 1.23042492e-02, 2.84389127e-02,\n", 273 | " 1.14323832e-02, -2.29188725e-02, 2.06083413e-02,\n", 274 | " -1.55406876e-03, 7.49366404e-03, -2.04990674e-02,\n", 275 | " 5.44690294e-03, 1.76218394e-02, -2.92982757e-02,\n", 276 | " -1.70645968e-03, -1.92762853e-03, -3.71797127e-04,\n", 277 | " -2.82948818e-02, 2.10400019e-02, 4.07306617e-03,\n", 278 | " 2.54461095e-02, 3.17274220e-02, -3.35782184e-03,\n", 279 | " -4.31606658e-02, -1.00767994e-02, 4.30808449e-03,\n", 280 | " 3.92695935e-03, -1.54349012e-02, 8.19602143e-03,\n", 281 | " -1.41330715e-03, -2.46893838e-02, 1.46608660e-02,\n", 282 | " 2.21408587e-02, 6.10332601e-02, -8.76054820e-03,\n", 283 
| " 8.68958142e-03, -6.90774480e-03, -2.69666910e-02,\n", 284 | " 3.16602848e-02, -1.31279230e-02, -1.95450392e-02,\n", 285 | " -5.54493815e-03, -1.54730147e-02, 1.84348375e-02,\n", 286 | " 7.17827678e-03, -1.07575832e-02, -2.36013550e-02,\n", 287 | " 8.04622378e-03, -1.49186878e-02, -3.87179069e-02,\n", 288 | " -1.70071388e-03, 1.06324144e-02, -4.39525116e-04,\n", 289 | " 2.57094787e-03, 2.30304878e-02, 2.87001040e-02,\n", 290 | " 1.92963984e-02, -2.31728554e-02, -3.97634273e-03,\n", 291 | " 7.90031161e-03, 3.72355897e-03, -2.97002345e-02,\n", 292 | " 1.95561489e-03, -4.79521276e-03, -2.14669239e-02,\n", 293 | " 1.08616846e-02, 2.25015227e-02, 3.29043274e-03], dtype=float32), 'subreddit': 'politics', 'post_time_offset': 12.575833333333334, 'length': 26, 'score': 28, 'time': 196.05277777777778, 'type': 'comment'}\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "# this user made only one comment... but I think you get the picture\n", 299 | "# e.g., we could access the attributes for this comment \n", 300 | "print politics_net.node[user_out_nodes[0]]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "There is still lots of graph management stuff that is left unspecificed (e.g., what's the best way to get all nodes of a certain type), but I figure this is just networkx/bookkeeping stuff and doesn't need to be baked in to the representation." 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "## We can also extract networks for multiple subreddits...." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 1, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "from redditnetwork.network_extractor import extract_week_network_multisubreddits" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 2, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "Warning: Using week argument and ignoring month\n", 340 | "Warning: Using week argument and ignoring month\n", 341 | "Processed 57517 comments, of which 11175 were removed for missing post and 8704 for missing parent\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "multi_test = extract_week_network_multisubreddits([\"politics\", \"Libertarian\"], 2014, 2)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 4, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "11196" 360 | ] 361 | }, 362 | "execution_count": 4, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "len([node for node in multi_test.nodes(data=True) if node[1][\"type\"] == \"post\"])" 369 | ] 370 | } 371 | ], 372 | "metadata": { 373 | "kernelspec": { 374 | "display_name": "Python 2", 375 | "language": "python", 376 | "name": "python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.13" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 2 393 | } 394 | --------------------------------------------------------------------------------