├── redditnetwork ├── __init__.py ├── utils │ ├── __init__.py │ ├── ioutils.py │ ├── dateutils.py │ ├── fastfreqdist.py │ ├── stringutils.py │ └── datautils.py ├── constants.py ├── network_extractor.py └── corpus_reader.py ├── setup.py ├── .gitignore ├── README.md └── network_example.ipynb /redditnetwork/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redditnetwork/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /redditnetwork/constants.py: -------------------------------------------------------------------------------- 1 | DATA_HOME="/dfs/scratch0/reddit/" 2 | SPACY_COMMENTS=DATA_HOME+"spacy_comments/" 3 | -------------------------------------------------------------------------------- /redditnetwork/utils/ioutils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cPickle as pickle 3 | import json 4 | 5 | def mkdir(directory): 6 | if not os.path.exists(directory): 7 | os.makedirs(directory) 8 | 9 | def write_pickle(data, filename): 10 | fp = open(filename, "wb") 11 | pickle.dump(data, fp) 12 | 13 | def load_pickle(filename): 14 | fp = open(filename, "rb") 15 | return pickle.load(fp) 16 | 17 | def write_json(data, filename): 18 | fp = open(filename, "wb") 19 | json.dump(data, fp) 20 | 21 | def load_json(filename): 22 | fp = open(filename, "rb") 23 | return json.load(fp) 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | setup(name='redditnetwork', 5 | version='1.0', 6 | description='Code for accessing processing Reddit data', 7 | author='William L. 
Hamilton', 8 | author_email='will.leif.hamiltion@gmail.com', 9 | license='MIT', 10 | install_requires=['spacy==1.2.0', 11 | 'networkx', 12 | 'numpy', 13 | 'isoweek', 14 | 'pandas', 15 | 'nltk' 16 | ], 17 | package_data={'redditnetwork': ['README.md']}, 18 | packages=find_packages()) 19 | -------------------------------------------------------------------------------- /redditnetwork/utils/dateutils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from isoweek import Week 3 | import calendar 4 | 5 | def get_week(timestamp): 6 | timestamp = datetime.datetime.utcfromtimestamp(float(timestamp)) 7 | date = timestamp.date() 8 | iso_info = date.isocalendar() 9 | week = iso_info[1] - 1 10 | return week 11 | 12 | def get_week_timestamp(year, week): 13 | d = Week(year, week).monday() 14 | return calendar.timegm(d.timetuple()) 15 | 16 | def day_week(timestamp): 17 | timestamp = datetime.datetime.utcfromtimestamp(float(timestamp)) 18 | date = timestamp.date() 19 | iso_info = date.isocalendar() 20 | week = iso_info[1] - 1 21 | day = week * 7 + iso_info[2] - 1 22 | return day, week 23 | 24 | def month_year(timestamp): 25 | date = datetime.datetime.utcfromtimestamp(int(timestamp)) 26 | return (date.year, date.month) 27 | 28 | def month_year_add(month_year, increment): 29 | month_year = (month_year[0] + (increment + month_year[1] - 1) / 12, 30 | (month_year[1] + increment - 1) % 12 + 1) 31 | return month_year 32 | 33 | def previous_month_year(month_year): 34 | month_year = (month_year[0], month_year[1]-1) 35 | if month_year[1] < 1: 36 | month_year= (month_year[0] - 1, 12) 37 | return month_year 38 | -------------------------------------------------------------------------------- /redditnetwork/utils/fastfreqdist.py: -------------------------------------------------------------------------------- 1 | from nltk.probability import FreqDist, MLEProbDist 2 | import numpy as np 3 | 4 | class CachedFreqDist(FreqDist): 5 | """ 6 | A read only version of nltk's FreqDist that caches the sample size for speed. 7 | DO NOT UPDATE COUNTS OF THIS OBJECT, resulting frequencies will not sum to one. 
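    (The total N is computed once in __init__ from the initial counts, so any later updates to
    this object are not reflected in N, which is why the frequencies would no longer sum to one.)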
8 | """ 9 | def __init__(self, freqdist): 10 | FreqDist.__init__(self, freqdist) 11 | self._N = np.sum(self.values()) 12 | 13 | def N(self): 14 | return self._N 15 | 16 | # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, 17 | # here, freq() does probs 18 | def freq(self, sample): 19 | if self.N() == 0: 20 | return 0 21 | return float(self[sample]) / self.N() 22 | 23 | class MultiGenMLEProbDist(MLEProbDist): 24 | """ 25 | An extension of nltk's MLEProbDist that allows for fast sampling for larger sample sizes 26 | """ 27 | 28 | def __init__(self, freqdist, bins=None): 29 | MLEProbDist.__init__(self, freqdist, bins) 30 | self._probarray = np.zeros((len(freqdist),)) 31 | self._probmap = {} 32 | for i, item in enumerate(freqdist.keys()): 33 | self._probarray[i] = freqdist.freq(item) 34 | self._probmap[i] = item 35 | 36 | def generate_many(self, n): 37 | return {self._probmap[i]:count for i, count in 38 | enumerate(np.random.multinomial(n, self._probarray)) if count != 0} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | seaborn 3 | scikits-bootstrap 4 | pattern 5 | statsmodels 6 | .git_old 7 | scikit-learn 8 | *mallet* 9 | venv 10 | *egg-info* 11 | build 12 | dist 13 | .ipynb_checkpoints 14 | 15 | *.class 16 | 17 | *.c 18 | 19 | # file system nonsense 20 | *.DS_Store 21 | *.__afs* 22 | 23 | # swp files 24 | *.swo 25 | *.swp 26 | 27 | # Cython and c build remnants 28 | build 29 | *.o 30 | *.so 31 | 32 | # Python build remnants 33 | *.pyc 34 | 35 | ## Core latex/pdflatex auxiliary files: 36 | *.pdf 37 | *.aux 38 | *.lof 39 | *.log 40 | *.lot 41 | *.fls 42 | *.out 43 | *.toc 44 | 45 | ## Intermediate documents: 46 | *.dvi 47 | *-converted-to.* 48 | # these rules might exclude image files for figures etc. 
49 | # *.ps 50 | # *.eps 51 | # *.pdf 52 | 53 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 54 | *.bbl 55 | *.bcf 56 | *.blg 57 | *-blx.aux 58 | *-blx.bib 59 | *.brf 60 | *.run.xml 61 | 62 | ## Build tool auxiliary files: 63 | *.fdb_latexmk 64 | *.synctex.gz 65 | *.synctex.gz(busy) 66 | *.pdfsync 67 | 68 | ## Auxiliary and intermediate files from other packages: 69 | 70 | # algorithms 71 | *.alg 72 | *.loa 73 | 74 | # amsthm 75 | *.thm 76 | 77 | # beamer 78 | *.nav 79 | *.snm 80 | *.vrb 81 | 82 | #(e)ledmac/(e)ledpar 83 | *.end 84 | *.[1-9] 85 | *.[1-9][0-9] 86 | *.[1-9][0-9][0-9] 87 | *.[1-9]R 88 | *.[1-9][0-9]R 89 | *.[1-9][0-9][0-9]R 90 | *.eledsec[1-9] 91 | *.eledsec[1-9]R 92 | *.eledsec[1-9][0-9] 93 | *.eledsec[1-9][0-9]R 94 | *.eledsec[1-9][0-9][0-9] 95 | *.eledsec[1-9][0-9][0-9]R 96 | 97 | # glossaries 98 | *.acn 99 | *.acr 100 | *.glg 101 | *.glo 102 | *.gls 103 | 104 | # hyperref 105 | *.brf 106 | 107 | # listings 108 | *.lol 109 | 110 | # makeidx 111 | *.idx 112 | *.ilg 113 | *.ind 114 | *.ist 115 | 116 | # minitoc 117 | *.maf 118 | *.mtc 119 | *.mtc0 120 | 121 | # minted 122 | *.pyg 123 | 124 | # morewrites 125 | *.mw 126 | 127 | # nomencl 128 | *.nlo 129 | 130 | # sagetex 131 | *.sagetex.sage 132 | *.sagetex.py 133 | *.sagetex.scmd 134 | 135 | # sympy 136 | *.sout 137 | *.sympy 138 | sympy-plots-for-*.tex/ 139 | 140 | # todonotes 141 | *.tdo 142 | 143 | # xindy 144 | *.xdy 145 | -------------------------------------------------------------------------------- /redditnetwork/utils/stringutils.py: -------------------------------------------------------------------------------- 1 | import string 2 | import nltk 3 | import re 4 | 5 | NLTK_STOP = set(nltk.corpus.stopwords.words('english')) 6 | PUNCTUATION = set(string.punctuation) 7 | URL = ['www', '.com', '.net', '.org', '.edu', '//', '/u/', '/r/', 'http'] 8 | lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 9 | HTTP = re.compile("https?$") 10 | BOT = re.compile("bot\d*$") 11 | 12 | def is_bot(word): 13 | word = word.lower() 14 | return bool(BOT.search(word)) 15 | 16 | def is_punkt(word): 17 | return word in PUNCTUATION 18 | 19 | def is_http(word): 20 | return bool(HTTP.match(word)) 21 | 22 | def has_numbers(input_string): 23 | return bool(re.search(r'\d', input_string)) 24 | 25 | def is_ascii(s): 26 | try: 27 | s.decode("ascii") 28 | except UnicodeEncodeError: 29 | return False 30 | else: 31 | return True 32 | 33 | def is_url(word): 34 | b = [1 if t in word else 0 for t in URL] 35 | return sum(b) > 0 36 | 37 | def clean_word_replace(word): 38 | word = word.strip() 39 | word = word.strip(string.punctuation) 40 | word = word.lower() 41 | if not is_ascii(word): 42 | return "" 43 | elif is_http(word): 44 | return "" 45 | elif has_numbers(word): 46 | return "" 47 | else: 48 | return word 49 | 50 | def clean_word(word, lower=True, stem=True, remove_stop=True): 51 | word = word.strip() 52 | word = word.strip(string.punctuation) 53 | w = word.lower() 54 | if remove_stop: 55 | if word.startswith("'"): 56 | return "" 57 | if not is_ascii(word): 58 | return "" 59 | b = [1 if t in word else 0 for t in URL] 60 | if sum(b) > 0: 61 | return "" 62 | if w in PUNCTUATION or w in NLTK_STOP: 63 | return "" 64 | if has_numbers(w): 65 | return "" 66 | if stem: 67 | w = lemmatizer.lemmatize(w) # for nouns 68 | w = lemmatizer.lemmatize(w, pos = 'v') # for verbs 69 | return w 70 | 71 | def is_stop(word): 72 | if len(word) == 1: 73 | return True 74 | elif word.isdigit(): 75 | return True 76 | elif word in NLTK_STOP: 77 | return True 78 | 
else: 79 | return False 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # redditnetwork 2 | 3 | Code for managing the Reddit dataset (assuming the user has access to the Stanford Reddit Data). 4 | 5 | ## Installing / set-up 6 | 7 | The code requires version 1.2.0 of spacy in order to load and manipulate the pre-processed text data (see https://spacy.io/docs/api/doc). 8 | If you are using a newer version of spacy and don't want to downgrade, it is recommended that you use a virtual environment, i.e., run the following in the root redditnetwork directory: 9 | 10 | pip install virtualenv 11 | python -m virtualenv venv 12 | source venv/bin/activate 13 | 14 | After you have (optionally) set up the virtual environment, run 15 | 16 | python setup.py install 17 | python -m spacy.en.download all 18 | python -m nltk.downloader stopwords 19 | The first command installs the package and all necessary dependencies. 20 | The second command downloads the spacy model data. It will download around 1GB of data (including pre-trained word vectors) into the spacy installation directory, so make sure you have space in that directory. 21 | The third command downloads the standard nltk stopword lists; this should only take a few seconds. 22 | Note that these models might already be installed/downloaded. There is no need to reinstall/download if this is the case. 23 | 24 | 25 | ## Using the code 26 | 27 | There are two main use cases for this code: 28 | 1) Accessing the Reddit post and comment corpus. 29 | 2) Extracting heterogeneous/multilayer networks from the data. 30 | 31 | ### Accessing and iterating over comments and posts 32 | 33 | The `corpus_reader.py` file contains a number of useful classes that allow you to access and iterate over the processed Reddit data. 34 | If you have read access to the Stanford Reddit data (hosted by the InfoLab) then these should work out of the box. 35 | The data is designed so that you can access comments or posts from a specific subreddit for a specific time-period (month). 36 | 37 | 38 | When you iterate over comments for a particular subreddit/month, 39 | each comment will be represented by a dictionary with attributes corresponding to the comment's score, author, timestamp, etc. 40 | The processed text data is accessible via the "doc" attribute, which is a spacy Doc object (https://spacy.io/docs/api/doc). 41 | This Doc object contains the raw text, along with various processed/annotated versions (pos tags, lemmas, etc.). 42 | See the spacy docs for more info. 43 | 44 | ### Extracting multilayer networks from the data 45 | 46 | The `network_extractor.py` file contains code for extracting network data corresponding to one week of activity in a specific subreddit. 47 | See the `network_example.ipynb` notebook for an example and more information. 48 | -------------------------------------------------------------------------------- /redditnetwork/utils/datautils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | 6 | from redditnetwork import constants 7 | from redditnetwork.utils.ioutils import load_json 8 | 9 | 10 | DATA = constants.DATA_HOME 11 | 12 | def make_data_frame(communities, feature_dict): 13 | """ 14 | Makes a pandas dataframe for name, months, and dictionary of feature funcs.
15 | Each feature func should take name and return feature value. 16 | Constructed dataframe has flat csv style structure and missing values are removed. 17 | """ 18 | 19 | temp = collections.defaultdict(list) 20 | feature_dict["name"] = lambda name : name 21 | for name in communities: 22 | for feature, feature_func in feature_dict.iteritems(): 23 | temp[feature].append(feature_func(name)) 24 | df = pd.DataFrame(temp) 25 | df = df.replace([np.inf, -np.inf], np.nan) 26 | df = df.dropna() 27 | return df 28 | 29 | def make_data_frame_time(communities, time_range, feature_dict): 30 | """ 31 | Makes a pandas dataframe for name, months, and dictionary of feature funcs. 32 | Each feature func should take (name, month) and return feature value. 33 | Constructed dataframe has flat csv style structure and missing values are removed. 34 | """ 35 | 36 | temp = collections.defaultdict(list) 37 | feature_dict["name"] = lambda name, time : name 38 | feature_dict["time"] = lambda name, time : time 39 | for name in communities: 40 | for time in time_range: 41 | for feature, feature_func in feature_dict.iteritems(): 42 | temp[feature].append(feature_func(name, time)) 43 | df = pd.DataFrame(temp) 44 | df = df.replace([np.inf, -np.inf], np.nan) 45 | df = df.dropna() 46 | return df 47 | 48 | 49 | def read_filtered_users(): 50 | users = set() 51 | with open(DATA + 'filtered_users.txt') as fp: 52 | for line in fp: 53 | x = line.strip().split('\t') 54 | users.add(x[0]) 55 | return users 56 | 57 | def read_subreddit_names(year=None): 58 | exclude_set = set(load_json(constants.DATA_HOME + "exclude_set.json")) 59 | subs = set([]) 60 | if year == None: 61 | for year in constants.YEARS: 62 | subs.update([e.split(".")[0] for e in os.listdir(constants.DATA_HOME + "spacy_comments/" + str(year))]) 63 | else: 64 | subs.update([e.split(".")[0] for e in os.listdir(constants.DATA_HOME + "spacy_comments/" + str(year))]) 65 | return subs-exclude_set 66 | 67 | def valid_subreddits(): 68 | subreddits = [] 69 | with open(DATA + "total_comment_counts.tsv") as fp: 70 | for line in fp: 71 | subreddits.append(line.split("\t")[0]) 72 | return subreddits 73 | -------------------------------------------------------------------------------- /redditnetwork/network_extractor.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | 4 | from collections import Counter, defaultdict 5 | 6 | from redditnetwork.corpus_reader import PostMap, WeekIterWrapper, SpacyComments, MultiIterWrapper 7 | from redditnetwork.utils.dateutils import get_week_timestamp 8 | 9 | VEC_SIZE=300 10 | SIF=10e-4 11 | 12 | def extract_month_network_multisubreddits(subreddits, year, month): 13 | """ 14 | Extracts a multilayer network of users comments and posts for 15 | multiple subreddits from the specified month. 16 | """ 17 | post_map = {} 18 | for subreddit in subreddits: 19 | post_map.update(PostMap(subreddit, year, month).post_map) 20 | comment_iter = MultiIterWrapper([SpacyComments(subreddit, year, month) for 21 | subreddit in subreddits]) 22 | return extract_network(post_map, comment_iter, 0) 23 | 24 | 25 | def extract_week_network_multisubreddits(subreddits, year, week): 26 | """ 27 | Extracts a multilayer network of users comments and posts for 28 | multiple subreddits from the specified week. 
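    Example (as in network_example.ipynb; assumes read access to the underlying data):
        graph = extract_week_network_multisubreddits(["politics", "Libertarian"], 2014, 2)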
29 | """ 30 | post_map = {} 31 | for subreddit in subreddits: 32 | post_map.update(PostMap(subreddit, year, -1, week=week).post_map) 33 | comment_iter = MultiIterWrapper([WeekIterWrapper(SpacyComments, week, subreddit, year) for 34 | subreddit in subreddits]) 35 | return extract_network(post_map, comment_iter, 0) 36 | 37 | def extract_year_network(subreddit, year): 38 | """ 39 | Extracts a multi-layer network of users, comments, and posts. 40 | Data is taken from a specific month (num between 1 and 12) in a specific year. 41 | """ 42 | post_map = {} 43 | for month in range(1,13): 44 | post_map.update(PostMap(subreddit, year, month).post_map) 45 | comment_iter = MultiIterWrapper([SpacyComments(subreddit, year, month) for 46 | month in range(1,13)]) 47 | base_time = get_week_timestamp(year,0) 48 | return extract_network(post_map, comment_iter, base_time) 49 | 50 | 51 | def extract_month_network(subreddit, year, month): 52 | """ 53 | Extracts a multi-layer network of users, comments, and posts. 54 | Data is taken from a specific month (num between 1 and 12) in a specific year. 55 | """ 56 | post_map = PostMap(subreddit, year, month) 57 | comment_iter = SpacyComments(subreddit, year, month) 58 | #TODO: Actually do this... It is not a big deal since the values 59 | # will be internally consistent, but still... 60 | month_base_time = get_week_timestamp(year, month/4-2) 61 | return extract_network(post_map.post_map, comment_iter, month_base_time) 62 | 63 | def extract_week_network(subreddit, year, week): 64 | """ 65 | Extracts a multi-layer network of users, comments, and posts. 66 | Data is taken from a specific week (num between 1 and 50) in a specific year. 67 | """ 68 | post_map = PostMap(subreddit, year, -1, week=week) 69 | comment_iter = WeekIterWrapper(SpacyComments, week, subreddit, year) 70 | week_base_time = get_week_timestamp(year, week) 71 | 72 | return extract_network(post_map.post_map, comment_iter, week_base_time) 73 | 74 | def _get_embedding(doc, counter, total_count): 75 | vecs = [word.vector*(SIF / (counter[word.lower_]/total_count + SIF)) for word in doc if word.has_vector] 76 | if len(vecs) == 0 or np.isnan(np.sum(np.array(vecs))): 77 | return np.zeros((VEC_SIZE,)) 78 | else: 79 | vecs = np.array(vecs) 80 | vecs = np.mean(vecs, axis=0) 81 | return vecs 82 | 83 | def extract_network(post_map, comment_iter, base_time, idf=True): 84 | 85 | if idf: 86 | df = Counter() 87 | total_count = 0. 88 | for comment in comment_iter: 89 | for word in comment["doc"]: 90 | df[word.lower_] += 1 91 | total_count += 1. 92 | else: 93 | df = defaultdict(float) 94 | total_count = 1. 
95 | 96 | graph = nx.DiGraph(user_feats={}, 97 | post_feats = {"score" : 1, "time": 1, "num_comments": 1, "subreddit" : 1, "length" : 1, "word_vecs" : VEC_SIZE}, 98 | comment_feats = {"score" : 1, "time" : 1, "post_time_offset": 1, "length" : 1, "subreddit" : 1, "word_vecs" : VEC_SIZE}) 99 | 100 | ## Add all posts as nodes connected to their authors 101 | for post in post_map.values(): 102 | graph.add_node(post["id"], 103 | type="post", 104 | score=post["score"], 105 | num_comments=post["num_comments"], 106 | subreddit=post["subreddit"], 107 | time=(int(post["timestamp"])-base_time)/3600., 108 | length=len(post["doc"]), 109 | word_vecs=_get_embedding(post["doc"], df, total_count)) 110 | if not graph.has_node(post["author"]): 111 | graph.add_node(post["author"], type="user") 112 | graph.add_edge(post["author"], post["id"], type="user_post") 113 | 114 | skipped_missing_parent = 0 115 | skipped_missing_post = 0 116 | for i, comment in enumerate(comment_iter): 117 | # skip comments that don't respond to a post from this week 118 | if not comment["post"] in post_map: 119 | skipped_missing_post += 1 120 | continue 121 | # skip comments that don't respond to a parent from this week 122 | if comment["parent"] != comment["post"] and not graph.has_node(comment["parent"]): 123 | skipped_missing_parent += 1 124 | continue 125 | 126 | # add author node if necessary 127 | if not graph.has_node(comment["author"]): 128 | graph.add_node(comment["author"], type="user") 129 | 130 | # add comment node 131 | graph.add_node(comment["id"], 132 | type="comment", 133 | score=comment["score"], 134 | subreddit=comment["subreddit"], 135 | time=(comment["timestamp"]-base_time)/3600., 136 | post_time_offset=(comment["timestamp"]-int(post_map[comment["post"]]["timestamp"]))/3600., 137 | length=len(comment["doc"]), 138 | word_vecs=_get_embedding(comment["doc"], df, total_count)) 139 | 140 | # Add edges 141 | graph.add_edge(comment["author"], comment["id"], type="user_comment") 142 | if comment["parent"] != comment["post"]: 143 | graph.add_edge(comment["parent"], comment["id"], type="comment_comment") 144 | else: 145 | graph.add_edge(comment["post"], comment["id"], type="post_comment") 146 | 147 | print "Processed {:d} comments, of which {:d} were removed for missing post and {:d} for missing parent".format( 148 | i, skipped_missing_post, skipped_missing_parent) 149 | return graph 150 | -------------------------------------------------------------------------------- /redditnetwork/corpus_reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various objects for iterating over and accessing processed Reddit data. 3 | """ 4 | 5 | import json 6 | 7 | from redditnetwork import constants 8 | from redditnetwork.utils.datautils import read_filtered_users 9 | from redditnetwork.utils.stringutils import is_bot 10 | from redditnetwork.utils.dateutils import get_week 11 | 12 | from spacy.tokens.doc import Doc 13 | from spacy.en import English 14 | 15 | SPACY_VOCAB = English().vocab 16 | FILTERED_USERS = read_filtered_users() 17 | 18 | class MultiIterWrapper(): 19 | def __init__(self, iters): 20 | self.iters = iters 21 | 22 | def __iter__(self): 23 | for _iter in self.iters: 24 | for item in _iter: 25 | yield item 26 | 27 | class WeekIterWrapper(): 28 | """ 29 | Gets an iterator over a specific week. 30 | Takes a comment or post iterator class as an argument. 31 | Annoyingly weeks and months are not aligned....
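    To handle this, the wrapper reads two consecutive months of data and filters their
    contents down to the requested week (using get_week on each item's timestamp).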
32 | """ 33 | def __init__(self, cls, week, subreddit, year, **kw_args): 34 | # Throws an assertion if you try to get weeks that cross between 35 | # years 36 | assert week != 0 and week < 51 37 | self.week = week 38 | month = week / 4 + 1 39 | ## Week 1 for us == week 2 for ISO weeks 40 | self.iter1 = cls(subreddit, year, month, **kw_args) 41 | self.iter2 = cls(subreddit, year, month+1, **kw_args) 42 | 43 | def __iter__(self): 44 | for item in self.iter1.__iter__(week=self.week): 45 | yield item 46 | for item in self.iter2.__iter__(week=self.week): 47 | yield item 48 | 49 | class PostMap(): 50 | """ 51 | Map into post data. 52 | """ 53 | def __init__(self, subreddit, year, month, week=None, path=None, 54 | clean_deleted=True, clean_bots=True): 55 | if path == None: 56 | path = constants.DATA_HOME + "spacy_posts/" 57 | if not week is None: 58 | print "Warning: Using week argument and ignoring month" 59 | self.post_map = self._make_map( 60 | WeekIterWrapper(PostIterator, week, subreddit, 61 | year, path=path, 62 | clean_deleted=True, clean_bots=True)) 63 | else: 64 | self.post_map = self._make_map( 65 | PostIterator(subreddit, year, month, path=path, 66 | clean_deleted=True, clean_bots=True)) 67 | 68 | def _make_map(self, info_iterator): 69 | post_map = {} 70 | for entry in info_iterator: 71 | post_map[entry["id"]] = entry 72 | return post_map 73 | 74 | def get_post(self, id): 75 | return self.post_map[id] 76 | 77 | def __getitem__(self, id): 78 | return self.get_post(id) 79 | 80 | def __contains__(self, id): 81 | return id in self.post_map 82 | 83 | 84 | class PostIterator(): 85 | """ 86 | Iterator over post metadata only 87 | """ 88 | def __init__(self, subreddit, year, month, path=None, 89 | clean_deleted=True, clean_bots=True): 90 | if path == None: 91 | path = constants.DATA_HOME + "spacy_posts/" 92 | path += "{:d}_{:02d}/".format(year, month) + subreddit 93 | self._vocab = SPACY_VOCAB 94 | self.path = path 95 | self._len = None 96 | self.clean_bots = clean_bots 97 | self.clean_deleted = clean_deleted 98 | self.subreddit = subreddit 99 | 100 | def _parse_info(self, line): 101 | info = json.loads(line) 102 | info["subreddit"] = self.subreddit 103 | return info 104 | 105 | def __len__(self): 106 | if self._len == None: 107 | i = -1 108 | with open(self.path + ".info") as fp: 109 | for i, _ in enumerate(fp): 110 | pass 111 | self._len = i + 1 112 | return self._len 113 | 114 | def __iter__(self, week=None): 115 | with open(self.path + ".info") as info: 116 | with open(self.path + ".title.bin") as title_bin: 117 | for byte_string in Doc.read_bytes(title_bin): 118 | info_line = info.readline() 119 | comment_info = self._parse_info(info_line) 120 | if not (week is None) and get_week(comment_info["timestamp"]) != week: 121 | continue 122 | if self.clean_deleted and comment_info["author"] == "[deleted]": 123 | continue 124 | if self.clean_bots and (is_bot(comment_info["author"]) or 125 | comment_info["author"] in FILTERED_USERS): 126 | continue 127 | comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string) 128 | yield comment_info 129 | 130 | 131 | class InfoIterator(): 132 | """ 133 | Iterator over comment metadata only 134 | """ 135 | def __init__(self, subreddit, year, month=None, path=None, 136 | clean_deleted=True, clean_bots=True): 137 | if path == None: 138 | path = constants.DATA_HOME + "spacy_comments/" 139 | # self._vocab = Vocab.load(path + u"vocab.bin") 140 | # Annoyingly inefficient but necessary for now 141 | if not month is None: 142 | path += "{:d}_{:02d}/".format(year, 
month) + subreddit 143 | else: 144 | path += "{:d}/".format(year) + subreddit 145 | self.path = path 146 | self._len = None 147 | self.clean_deleted = clean_deleted 148 | self.clean_bots = clean_bots 149 | 150 | def _parse_info(self, line): 151 | info = line.split("\t") 152 | comment_info = {"id" : info[0], 153 | "timestamp" : int(info[1]), 154 | "author" : info[2], 155 | "score" : int(info[3]), 156 | "parent" : info[4], 157 | "post" : info[5].strip()} 158 | return comment_info 159 | 160 | def __len__(self): 161 | if self._len == None: 162 | i = -1 163 | with open(self.path + ".info") as fp: 164 | for i, _ in enumerate(fp): 165 | pass 166 | self._len = i + 1 167 | return self._len 168 | 169 | def __iter__(self): 170 | with open(self.path + ".info") as info: 171 | for line in info: 172 | comment_info = self._parse_info(line) 173 | if self.clean_deleted and comment_info["author"] == "[deleted]": 174 | continue 175 | if self.clean_bots and (is_bot(comment_info["author"]) or 176 | comment_info["author"] in FILTERED_USERS): 177 | continue 178 | yield comment_info 179 | 180 | class SpacyComments(): 181 | """ 182 | Iterator over spacy comments. 183 | """ 184 | 185 | def __init__(self, subreddit, year, month=None, path=None, 186 | include_punct=True, down_sample=None, clean_bots=True, clean_deleted=True): 187 | if path == None: 188 | path = constants.DATA_HOME + "spacy_comments/" 189 | self._vocab = SPACY_VOCAB 190 | if not month is None: 191 | path += "{:d}_{:02d}/".format(year, month) + subreddit 192 | else: 193 | path += "{:d}/".format(year) + subreddit 194 | self.path = path 195 | self._len = None 196 | self.clean_bots = clean_bots 197 | self.clean_deleted = clean_deleted 198 | self.include_punct = include_punct 199 | self.subreddit = subreddit 200 | 201 | def _spacy_string_clean(self, token): 202 | if token.like_url: 203 | return "" 204 | elif token.like_num: 205 | return "" 206 | elif (not self.include_punct) and token.is_punct and (not token.tag_ == "."): 207 | return "" 208 | else: 209 | return token.lower_ 210 | 211 | def _text_from_doc(self, doc): 212 | return " ".join([self._spacy_string_clean(token) for token in doc]) 213 | 214 | def _parse_info(self, line): 215 | info = line.split("\t") 216 | comment_info = {"id" : info[0], 217 | "timestamp" : int(info[1]), 218 | "author" : info[2], 219 | "score" : int(info[3]), 220 | "parent" : info[4], 221 | "subreddit" : self.subreddit, 222 | "post" : info[5].strip()} 223 | return comment_info 224 | 225 | def __len__(self): 226 | if self._len == None: 227 | with open(self.path + ".info") as fp: 228 | for i, _ in enumerate(fp): 229 | pass 230 | self._len = i + 1 231 | return self._len 232 | 233 | def __iter__(self, week=None): 234 | with open(self.path + ".bin", "rb") as bin: 235 | with open(self.path + ".info") as info: 236 | for byte_string in Doc.read_bytes(bin): 237 | comment_info = self._parse_info(info.next()) 238 | if (not week is None) and get_week(comment_info["timestamp"]) != week: 239 | continue 240 | if self.clean_deleted and comment_info["author"] == "[deleted]": 241 | continue 242 | if self.clean_bots and (is_bot(comment_info["author"]) or 243 | comment_info["author"] in FILTERED_USERS): 244 | continue 245 | doc = Doc(self._vocab).from_bytes(byte_string) 246 | comment_info["doc"] = doc 247 | comment_info["text"] = self._text_from_doc(doc) 248 | yield comment_info 249 | -------------------------------------------------------------------------------- /network_example.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Extracting a multilayer network from Reddit data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### First we load the network and check some basic stats" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from redditnetwork.network_extractor import extract_week_network\n", 26 | "import networkx as nx" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Warning: Using week argument and ignoring month\n", 41 | "Processed 45097 comments, of which 12378 were removed for missing post and 5888 for missing parent\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# We will use /r/politics subreddit as the running example\n", 47 | "# We extract a network for this subreddit, corresponding to the first week of 2014\n", 48 | "politics_net = extract_week_network(\"politics\", 2014, 1)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Ignore the warning about using the week argument instead of month. (This is just an internal complication due to the fact that the data is stored at the monthly level but we are accessing weeks).\n", 56 | "\n", 57 | "Once the data finishes processing it will say that it processed a certain number of comments and removed some due to having a missing parent or post (e.g., they were replying to an old post from an earlier week).\n", 58 | "\n", 59 | "The returned object is a networkx DiGraph (directed graph)." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "There are 8992 users, 26832 comments, and 2368 posts in the graph\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "## some basic stats:\n", 79 | "print \"There are {:d} users, {:d} comments, and {:d} posts in the graph\"\\\n", 80 | " .format(len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"user\"]),\n", 81 | " len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"comment\"]),\n", 82 | " len([node for node in politics_net.nodes(data=True) if node[1][\"type\"] == \"post\"]))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Okay, and now some details on the data \n", 90 | "\n", 91 | "The underlying structure is a directed graph (DiGraph) and additional information is stored as node and edge attributes.\n", 92 | "\n", 93 | "#### Node types\n", 94 | "\n", 95 | "Every node as an \"type\" attribute that is one of \"user\", \"comment\", or \"post\".\n", 96 | "Users are indexed by their username and post/comments by unique string ids. 
\n", 97 | "\n", 98 | "#### Edge types\n", 99 | "\n", 100 | "Every edge has a \"type\" attribute as well, which is one of the following:\n", 101 | "* \"user_post\": a directed edge from a user to a post they made.\n", 102 | "* \"user_comment\": a directed edge from a user to a comment they made.\n", 103 | "* \"post_comment\": a directed edge from a post to a top-level comment in that post.\n", 104 | "* \"comment_comment\": a directed edge from a comment to a comment that replies to it. \n", 105 | "\n", 106 | "#### Node attributes/features\n", 107 | "\n", 108 | "Comment nodes and post nodes also additional features/attributes (which can be listed by running politics_net.graph; see the example below). User nodes currently have no features (besides those that are implicit in the graph structure). \n", 109 | "\n", 110 | "##### Comment features\n", 111 | "* score: score that comment received\n", 112 | "* time: describes when the comment was made during the week (hour offset from 12:00am on Monday of that week).\n", 113 | "* post_time_offset: how old was the post when the comment was made (in hours)\n", 114 | "* length: how many words in the comment\n", 115 | "* word_vec: 300 dimensional vector embedding of the comment (tf-idf average of GloVe vectors)\n", 116 | "\n", 117 | "##### Post features\n", 118 | "* score: score that the post recieved\n", 119 | "* time: when was the post made during the week (hour offset from 12:00 on Monday of that week)\n", 120 | "* length: number of words in the title\n", 121 | "* word_vec: vector embedding of post title (average of Glove vectors)\n", 122 | "\n", 123 | "*NOTE THAT NONE OF THESE FEATURES ARE THE \"LABELS\" WE WANT TO PREDICT.* That data is stored elsewhere for now because I don't want to clutter the network representations and because the \"labels\" are in flux. See the bottom of this notebook for an example of how to get the labels for predictions." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 4, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "{'comment_feats': {'length': 1,\n", 137 | " 'post_time_offset': 1,\n", 138 | " 'score': 1,\n", 139 | " 'subreddit': 1,\n", 140 | " 'time': 1,\n", 141 | " 'word_vecs': 300},\n", 142 | " 'post_feats': {'length': 1,\n", 143 | " 'num_comments': 1,\n", 144 | " 'score': 1,\n", 145 | " 'subreddit': 1,\n", 146 | " 'time': 1,\n", 147 | " 'word_vecs': 300},\n", 148 | " 'user_feats': {}}" 149 | ] 150 | }, 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# this prints info about what features there are and the dimensionality of these features\n", 158 | "politics_net.graph" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 5, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "['cejaksn']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "# lets access the node for a random user \n", 178 | "# and get all comments and posts that this user made\n", 179 | "user_out_nodes = politics_net.successors(\"RedSquirrelFtw\")\n", 180 | "print user_out_nodes" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "{'word_vecs': array([ 2.45881882e-02, -8.85956455e-03, 4.07702522e-03,\n", 195 | " -3.59144271e-03, -5.35505451e-03, 3.04689351e-03,\n", 196 | " -2.86572031e-05, 6.46826986e-04, 3.98649042e-03,\n", 197 | " -3.48688639e-03, 3.40964980e-02, 3.39702074e-03,\n", 198 | " -2.66911592e-02, 9.43523180e-03, -2.05980968e-02,\n", 199 | " -2.33542006e-02, -2.23564263e-02, -4.97682840e-02,\n", 200 | " 2.15058471e-03, 6.99266186e-03, -1.03599476e-02,\n", 201 | " -3.42106936e-03, -1.32135861e-03, -3.16169374e-02,\n", 202 | " 1.49107622e-02, 4.38282602e-02, -1.15861988e-03,\n", 203 | " -5.54729579e-03, -6.17341464e-03, 1.52532337e-02,\n", 204 | " 1.30888699e-02, 1.42863719e-02, 5.32958051e-03,\n", 205 | " 6.43259101e-03, -2.33824583e-04, -1.21295080e-02,\n", 206 | " -4.83304122e-03, -6.96073147e-03, -6.06134126e-04,\n", 207 | " 1.71746537e-02, 8.68919492e-03, 1.78009700e-02,\n", 208 | " 1.27696199e-02, -1.09810466e-02, -3.44701274e-03,\n", 209 | " -4.43779491e-03, -2.83656735e-03, -1.35982307e-02,\n", 210 | " 9.29598790e-03, 1.40077816e-02, 4.03900212e-03,\n", 211 | " -1.54639157e-02, 1.17861321e-02, -1.46402763e-02,\n", 212 | " -3.31898057e-03, 3.37800458e-02, 1.95675101e-02,\n", 213 | " -3.57995667e-02, 2.05528438e-02, -1.94929559e-02,\n", 214 | " -3.00960094e-02, 6.92273676e-03, -3.29098515e-02,\n", 215 | " -7.45723071e-03, -1.42299989e-03, -8.70021619e-03,\n", 216 | " 1.11240067e-03, -1.51177878e-02, 2.87857745e-02,\n", 217 | " 3.37396264e-02, 9.56202578e-03, -1.05087310e-02,\n", 218 | " -1.07406760e-02, 1.49361016e-02, 2.01773494e-02,\n", 219 | " 2.91823908e-05, 4.99374466e-03, -1.14855031e-02,\n", 220 | " 5.58512053e-03, -8.54704715e-03, 4.02356274e-02,\n", 221 | " -3.03811803e-02, 8.61220621e-03, 4.28446494e-02,\n", 222 | " -9.69701540e-03, -5.98392868e-03, -9.92416963e-03,\n", 223 | " 9.10818484e-03, 1.59019697e-02, -5.44882799e-03,\n", 224 | " 1.45760819e-03, -3.55789065e-03, -2.28537503e-03,\n", 225 | " 
-2.61299759e-02, 1.90544985e-02, -1.77889783e-02,\n", 226 | " -1.39860585e-02, 6.76063960e-03, 1.38605768e-02,\n", 227 | " 9.62258037e-03, -1.29944673e-02, -2.23032534e-02,\n", 228 | " -1.12707019e-02, -3.23294364e-02, -1.73050631e-02,\n", 229 | " 1.61380600e-03, 7.97309820e-03, 2.07648035e-02,\n", 230 | " -2.09515914e-02, -1.72264632e-02, 2.98629012e-02,\n", 231 | " -1.17384242e-02, 1.86214726e-02, 9.00912005e-03,\n", 232 | " 1.10343313e-02, 4.74179303e-03, 3.79112177e-02,\n", 233 | " 2.15634611e-02, -1.51630966e-02, 2.94514317e-02,\n", 234 | " 1.61693618e-02, 2.19558049e-02, 7.10119260e-03,\n", 235 | " 1.03538474e-02, 7.77509485e-05, 3.03796474e-02,\n", 236 | " 1.42980209e-02, 2.52941120e-02, -7.30073452e-03,\n", 237 | " 2.67049880e-03, -3.23295183e-02, -4.56356490e-03,\n", 238 | " -1.13542946e-02, 1.67651456e-02, 1.88513268e-02,\n", 239 | " 1.45909078e-02, 1.26617374e-02, 2.57562962e-03,\n", 240 | " 2.61628558e-03, -9.08431411e-03, 1.14472574e-02,\n", 241 | " 7.94017408e-03, 1.20475926e-02, 6.06606854e-03,\n", 242 | " 1.72127299e-02, 3.30692828e-02, 2.73634796e-03,\n", 243 | " -1.44931115e-03, -1.57310385e-02, -9.87053290e-03,\n", 244 | " 1.01823714e-02, 1.41003141e-02, -2.85259262e-03,\n", 245 | " -2.85569229e-03, -2.15815436e-02, 3.00777871e-02,\n", 246 | " -9.35850013e-03, -1.59715936e-02, -9.89310350e-03,\n", 247 | " -6.48096018e-03, 8.47815443e-03, -1.39471488e-02,\n", 248 | " -1.54531682e-02, 1.39459819e-02, -3.64065021e-02,\n", 249 | " -1.36025399e-02, 1.38182156e-02, -6.87898695e-03,\n", 250 | " -1.50948400e-02, -2.98325270e-02, -6.52712537e-03,\n", 251 | " 1.41928094e-02, -1.52701763e-02, 9.46271932e-04,\n", 252 | " 3.33177461e-03, -7.86158908e-03, -1.73139188e-03,\n", 253 | " 2.13753339e-02, 1.40343681e-02, 6.07236812e-04,\n", 254 | " -3.50858620e-03, 3.21572740e-03, 1.88201424e-02,\n", 255 | " 5.38636697e-04, -1.99504918e-03, 8.32799729e-03,\n", 256 | " -9.34114214e-03, 1.05030613e-03, 2.49872357e-02,\n", 257 | " -1.37836263e-02, -5.93390130e-03, 3.22293714e-02,\n", 258 | " 5.87111758e-03, -3.06474995e-02, -2.33445037e-02,\n", 259 | " -2.20932271e-02, 1.08240033e-03, -1.84142999e-02,\n", 260 | " -1.48790656e-02, -1.64782442e-02, -7.43377954e-03,\n", 261 | " -1.01498319e-02, 1.35151120e-02, 1.11000063e-02,\n", 262 | " 2.05843598e-02, 1.21982545e-02, 2.34148884e-03,\n", 263 | " 1.59241911e-02, 2.14853045e-02, 1.02746207e-02,\n", 264 | " 2.46614888e-02, -9.97475255e-03, 2.22080369e-02,\n", 265 | " -2.92910635e-03, -1.70956121e-03, 8.48170649e-03,\n", 266 | " 2.91286502e-02, 7.95706734e-03, 1.02885272e-02,\n", 267 | " -7.11069396e-03, -9.60137043e-03, 3.87477353e-02,\n", 268 | " 5.41670388e-03, -7.04232231e-03, 6.28765486e-03,\n", 269 | " 6.94147125e-03, -1.28068291e-02, -3.23408772e-03,\n", 270 | " 1.30555267e-02, 2.38443818e-02, -1.96164623e-02,\n", 271 | " -1.58533361e-02, 9.44936834e-03, -1.23057445e-03,\n", 272 | " -1.55784115e-02, 1.23042492e-02, 2.84389127e-02,\n", 273 | " 1.14323832e-02, -2.29188725e-02, 2.06083413e-02,\n", 274 | " -1.55406876e-03, 7.49366404e-03, -2.04990674e-02,\n", 275 | " 5.44690294e-03, 1.76218394e-02, -2.92982757e-02,\n", 276 | " -1.70645968e-03, -1.92762853e-03, -3.71797127e-04,\n", 277 | " -2.82948818e-02, 2.10400019e-02, 4.07306617e-03,\n", 278 | " 2.54461095e-02, 3.17274220e-02, -3.35782184e-03,\n", 279 | " -4.31606658e-02, -1.00767994e-02, 4.30808449e-03,\n", 280 | " 3.92695935e-03, -1.54349012e-02, 8.19602143e-03,\n", 281 | " -1.41330715e-03, -2.46893838e-02, 1.46608660e-02,\n", 282 | " 2.21408587e-02, 6.10332601e-02, -8.76054820e-03,\n", 283 
| " 8.68958142e-03, -6.90774480e-03, -2.69666910e-02,\n", 284 | " 3.16602848e-02, -1.31279230e-02, -1.95450392e-02,\n", 285 | " -5.54493815e-03, -1.54730147e-02, 1.84348375e-02,\n", 286 | " 7.17827678e-03, -1.07575832e-02, -2.36013550e-02,\n", 287 | " 8.04622378e-03, -1.49186878e-02, -3.87179069e-02,\n", 288 | " -1.70071388e-03, 1.06324144e-02, -4.39525116e-04,\n", 289 | " 2.57094787e-03, 2.30304878e-02, 2.87001040e-02,\n", 290 | " 1.92963984e-02, -2.31728554e-02, -3.97634273e-03,\n", 291 | " 7.90031161e-03, 3.72355897e-03, -2.97002345e-02,\n", 292 | " 1.95561489e-03, -4.79521276e-03, -2.14669239e-02,\n", 293 | " 1.08616846e-02, 2.25015227e-02, 3.29043274e-03], dtype=float32), 'subreddit': 'politics', 'post_time_offset': 12.575833333333334, 'length': 26, 'score': 28, 'time': 196.05277777777778, 'type': 'comment'}\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "# this user made only one comment... but I think you get the picture\n", 299 | "# e.g., we could access the attributes for this comment \n", 300 | "print politics_net.node[user_out_nodes[0]]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "There is still lots of graph management stuff that is left unspecificed (e.g., what's the best way to get all nodes of a certain type), but I figure this is just networkx/bookkeeping stuff and doesn't need to be baked in to the representation." 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "## We can also extract networks for multiple subreddits...." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 1, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "from redditnetwork.network_extractor import extract_week_network_multisubreddits" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 2, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "Warning: Using week argument and ignoring month\n", 340 | "Warning: Using week argument and ignoring month\n", 341 | "Processed 57517 comments, of which 11175 were removed for missing post and 8704 for missing parent\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "multi_test = extract_week_network_multisubreddits([\"politics\", \"Libertarian\"], 2014, 2)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 4, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "11196" 360 | ] 361 | }, 362 | "execution_count": 4, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "len([node for node in multi_test.nodes(data=True) if node[1][\"type\"] == \"post\"])" 369 | ] 370 | } 371 | ], 372 | "metadata": { 373 | "kernelspec": { 374 | "display_name": "Python 2", 375 | "language": "python", 376 | "name": "python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.13" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 2 393 | } 394 | --------------------------------------------------------------------------------