├── .gitignore ├── .pylintrc ├── LICENSE ├── README.md ├── environment.yml ├── pysmap ├── __init__.py ├── mltools │ ├── __init__.py │ ├── crowd_model.py │ └── smapp_model.py ├── twitterutil │ ├── __init__.py │ ├── smapp_collection.py │ └── smapp_dataset.py └── viz │ ├── __init__.py │ ├── networks.py │ └── plots.py ├── setup.py └── test ├── __init__.py ├── data ├── invalid.bson ├── valid-single.bson.json ├── valid.bson ├── valid.bson.json └── valid.csv ├── test_crowd_model.py ├── test_networks.py ├── test_plots.py ├── test_smapp_collection.py └── test_smapp_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .DS_Store 65 | chart_tests/ 66 | 67 | config.py -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | 3 | # Enable the message, report, category or checker with the given id(s). You can 4 | # either give multiple identifier separated by comma (,) or put this option 5 | # multiple time. 6 | #enable= 7 | 8 | # Disable the message, report, category or checker with the given id(s). You 9 | # can either give multiple identifier separated by comma (,) or put this option 10 | # multiple time (only on the command line, not in the configuration file where 11 | # it should appear only once). 12 | disable=pointless-string-statement, too-many-branches, missing-docstring, too-many-arguments, invalid-name, line-too-long, duplicate-code, simplifiable-if-statement 13 | 14 | [FORMAT] 15 | indent-string=\t -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 smapp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | _ __ _ _ ___ _ __ ___ __ _ _ __ 3 | | '_ \| | | / __| '_ ` _ \ / _` | '_ \ 4 | | |_) | |_| \__ \ | | | | | (_| | |_) | 5 | | .__/ \__, |___/_| |_| |_|\__,_| .__/ 6 | |_| |___/ |_| 7 | ``` 8 | 9 | [![PyPI](https://img.shields.io/pypi/v/pysmap.svg)](https://pypi.python.org/pypi/pysmap) [![PyPI](https://img.shields.io/pypi/l/pysmap.svg)](https://github.com/SMAPPNYU/pysmap/blob/master/LICENSE) 10 | 11 | :snake: pysmap is a high level toolkit for dealing with twitter data it also has a higher level interface for [smappdragon](https://github.com/SMAPPNYU/smappdragon). it has functionality from the old toolkit and functionality from our old util library smappPy. 12 | - [twitterutil](#twitterutil) 13 | - [smapp_dataset](#smapp_dataset) 14 | - [smapp_collection](#smapp_collection) 15 | - [set_custom_filter](#set_custom_filter) 16 | - [get_tweets_containing](#get_tweets_containing) 17 | - [get_top_terms](#get_top_terms) 18 | - [get_tweet_texts](#get_tweet_texts) 19 | - [get_date_range](#get_date_range) 20 | - [get_geo_enabled](#get_geo_enabled) 21 | - [get_non_geo_enabled](#get_non_geo_enabled) 22 | - [get_top_entities](#get_top_entities) 23 | - [get_top_hashtags](#get_top_hashtags) 24 | - [get_top_urls](#get_top_urls) 25 | - [get_top_mentions](#get_top_mentions) 26 | - [get_top_media](#get_top_media) 27 | - [get_top_symbols](#get_top_symbols) 28 | - [find_date_range](#find_date_range) 29 | - [count_tweet_terms](#count_tweet_terms) 30 | - [count_tweets](#count_tweets) 31 | - [exclude_retweets](#exclude_retweets) 32 | - [get_retweets](#get_retweets) 33 | - [user_location_contains](#user_location_contains) 34 | - [user_description_contains](#user_description_contains) 35 | - [user_id_is](#user_id_is) 36 | - [place_name_contains_country](#place_name_contains_country) 37 | - [within_geobox](#within_geobox) 38 | - [limit_number_of_tweets](#limit_number_of_tweets) 39 | - [tweet_language_is](#tweet_language_is) 40 | - [detect_tweet_language](#detect_tweet_language) 41 | - [user_language_is](#user_language_is) 42 | - [sample](#sample) 43 | - [dump_to_bson](#dump_to_bson) 44 | - [dump_to_json](#dump_to_json) 45 | - [dump_to_csv](#dump_to_csv) 46 | - [dump_to_sqlite_db](#dump_to_sqlite_db) 47 | - [viz](#viz) 48 | - [plots](#plots) 49 | - [bar_graph_tweet_field_grouped_by_period](#bar_graph_tweet_field_grouped_by_period) 50 | - [bar_graph_languages](#bar_graph_languages) 51 | - [bar_graph_user_languages](#bar_graph_user_languages) 52 | - [bar_graph_tweets](#bar_graph_tweets) 53 | - [bar_graph_tweets_with_urls](#bar_graph_tweets_with_urls) 54 | - [bar_graph_tweets_with_media](#bar_graph_tweets_with_media) 55 | - [bar_graph_tweets_with_mentions](#bar_graph_tweets_with_mentions) 56 | - [bar_graph_tweets_with_hashtags](#bar_graph_tweets_with_hashtags) 57 | - [bar_graph_tweets_with_symbols](#bar_graph_tweets_with_symbols) 58 | - 
[bar_graph_tweets_with_retweets](#bar_graph_tweets_with_retweets) 59 | - [bar_graph_tweets_with_locations](#bar_graph_tweets_with_locations) 60 | - [networks](#networks) 61 | - [retweet_network](#retweet_network) 62 | - [models](#models) 63 | - [crowd_model](#crowd_model) 64 | 65 | # installation 66 | 67 | `pip install pysmap` 68 | 69 | `pip install pysmap --upgrade` 70 | 71 | # twitterutil 72 | 73 | the package with an array of twitter tools. 74 | 75 | # smapp_collection 76 | 77 | this is the smapp_collection class, an abstraction of smappdragon collections. 78 | 79 | abstract: 80 | ```python 81 | from pysmap import SmappCollection 82 | 83 | collection = SmappCollection(DATA_TYPE, OTHER_INPUTS) 84 | ``` 85 | 86 | practical: 87 | ```python 88 | from pysmap import SmappCollection 89 | 90 | collection = SmappCollection('bson', '/path/to/my/bson/file.bson') 91 | # or 92 | collection = SmappCollection('mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1') 93 | # or 94 | collection = SmappCollection('json', '/path/to/my/file.json') 95 | # or 96 | collection = SmappCollection('csv', '/path/to/my/csv/file.csv') 97 | ``` 98 | 99 | *returns* a collection object that you can use to call methods below on 100 | 101 | # smapp_dataset 102 | 103 | this is the dataset class, it can be used anywhere one might use a [SmappCollection](#smapp_collection) object. it lets you combine collections and other datasets at will. 104 | 105 | abstract: 106 | ```python 107 | 108 | # standard 109 | 110 | dataset = SmappDataset([TYPE_OF INPUT, FILE_PATH], [TYPE_OF_INPUT, MONGO_INPUTS]) 111 | 112 | # or with regex for matching mongo databases/collections 113 | # this is only for mongo and not for files 114 | 115 | dataset = SmappDataset(collection_regex=REGEX, database_regex=REGEX, [MONGO_INPUT, MONGO_INPUT, etc]) 116 | 117 | dataset = SmappDataset(collection_regex=REGEX, [MONGO_INPUT, MONGO_INPUT, etc]) 118 | 119 | # or with a unix style file pattern for matching file paths (this is not regex) 120 | # this is only for files and not for mongo 121 | 122 | dataset = SmappDataset([TYPE_OF_INPUT, 'file_pattern', FILE_PATTTERN], [TYPE_OF_INPUT, 'file_pattern', FILE_PATTTERN], etc) 123 | ``` 124 | 125 | practical: 126 | ```python 127 | # combine collections of the same type 128 | dataset = SmappDataset(['bson', '/path/to/my/bson/file1.bson'], ['bson', '/path/to/my/bson/file2.bson'], ['bson', '/path/to/my/bson/file3.bson']) 129 | 130 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_2']) 131 | 132 | # combine collections of different types 133 | 134 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['bson', '/path/to/my/bson/file1.bson'], ['json', '/path/to/my/bson/json_file.json']) 135 | 136 | # or combine collections and datasets 137 | 138 | collection = SmappCollection('csv', '/path/to/my/csv/file.csv') 139 | 140 | dataset_one = SmappDataset(['bson', '/path/to/my/bson/file1.bson'], ['bson', '/path/to/my/bson/file2.bson'], ['bson', '/path/to/my/bson/file3.bson']) 141 | 142 | dataset_two = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['mongo', 
'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_2']) 143 | 144 | final_dataset = SmappDataset(['json', '/path/to/my/bson/json_file.json'], dataset_one, dataset_two, collection) 145 | 146 | # or use regex to match for multiple collections/dbs 147 | 148 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler'], collection_regex='(^data$|^tweets$|^tweets_\d+$)') 149 | 150 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD'], collection_regex='(^tweets$|^tweets_\d+$)', database_regex='(^GERMANY_ELECTION_2015_Nagler_\d+$)') 151 | 152 | # or use a file pattern to match many files 153 | dataset_one = SmappDataset(['bson', 'file_pattern', '~/smappwork/data_*.bson']) 154 | 155 | dataset_two = SmappDataset(['json', 'file_pattern', '~/smappwork/data_*.json'], ['csv', 'file_pattern', '/Users/yvan/data/counts_*.csv']) 156 | 157 | dataset_three = SmappDataset(['json', '/non/pattern/path/to/my/bson/json_file.json'], dataset_one, dataset_two) 158 | ``` 159 | 160 | `regex` - regex stands for 'regular expression' its the way programmers pattern match on words, so regex inputs for SmappDataset allow you to pattern match data sources, you must use regex type input patterns or lists+collections+datasets as inputs you cannot use both 161 | 162 | `collection_regex` - this is required, to grab all collections named tweets_X (backwards compatiblilty) use `(^tweets$|^tweets_\d+$)` for new/regular collections use `(^data$)` or `(^data$|^tweets$|^tweets_\d+$)` for compatilibly backwards and forwards, if you have a different naming convention you can use a regex to match for that. 163 | 164 | `database_regex` - only required for mongo datasets, you can omit this variable if you are not using regex to try to match databases 165 | 166 | `file_pattern` - use to select multiple file paths based off a unix style pattern. pysmap smapp_dataset uses [glob](https://docs.python.org/2/library/glob.html#module-glob) under the hood to match the filepaths. pysmap also includes tilde `~` expansion which is not included by glob. so for example: 167 | ``` 168 | /scratch/smapp/test_dumps_dec1/dump_*.json 169 | #would match 170 | /scratch/smapp/test_dumps_dec1/dump_1.json 171 | /scratch/smapp/test_dumps_dec1/dump_blah_blah.json 172 | #and 173 | try_dump_dat_parallel_?.bson 174 | #would match 175 | try_dump_dat_parallel_0.bson 176 | try_dump_dat_parallel_1.bson 177 | #and 178 | try_dump_dat_parallel_[0-9].* 179 | #would match 180 | try_dump_dat_parallel_0.bson 181 | try_dump_dat_parallel_0.csv 182 | try_dump_dat_parallel_0.db 183 | try_dump_dat_parallel_0.json 184 | try_dump_dat_parallel_1.bson 185 | try_dump_dat_parallel_1.csv 186 | try_dump_dat_parallel_1.db 187 | try_dump_dat_parallel_1.json 188 | ``` 189 | read about [unix file patterns here](http://www.robelle.com/smugbook/wildcard.html). 190 | 191 | regex explanation example in the statement: 192 | 193 | ```python 194 | dataset = SmappDataset(collection_regex='(^tweets$|^tweets_\d+$)', database_regex='(^GERMANY_ELECTION_2015_Nagler_\d+$)', ['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD']) 195 | ``` 196 | 197 | the collection regex `(^tweets$|^tweets_\d+$)` means match every collection that is called tweets or tweets_\d where `\d` is some number. 
so tweets, tweets_1, tweets_2, etc 198 | 199 | the database regex `(^GERMANY_ELECTION_2015_Nagler_\d+$)` means match every database that has GERMANY_ELECTION_2015_Nagler_\d where `\d` is some number. so GERMANY_ELECTION_2015_Nagler_1, GERMANY_ELECTION_2015_Nagler_2, etc. the regex will not match 'GERMANY_ELECTION_2015_Nagler' in this case as it lacks the term '^GERMANY_ELECTION_2015_Nagler$'. 200 | 201 | *input* several `SmappDataset` objects and/or `SmappCollection` objects 202 | 203 | *output* a SmappDataset object that can be used the same way a [SmappCollection](#smapp_collection) can be 204 | 205 | # iterate through tweets 206 | 207 | iterate through the tweets in the collection you've made. 208 | 209 | abstract: 210 | ```python 211 | for tweet in collection: 212 | print(tweet) 213 | ``` 214 | 215 | practical: 216 | ```python 217 | for tweet in collection.get_tweets_containing('cat').tweet_language_is('fr'): 218 | print(tweet) 219 | ``` 220 | 221 | note: 222 | 223 | if on nyu hpc, print will not work, totally out of my control. you gotta change locale. 224 | 225 | to fix it, you need to reset the default bash encoding BEFORE opening/running python. just type in bash: 226 | ``` 227 | LANG=en_US.utf8 228 | ``` 229 | 230 | # set_custom_filter 231 | 232 | sets a user defined function to act as a filter 233 | 234 | abstract: 235 | ```python 236 | collection.set_custom_filter(FILTER_FUNCTION) 237 | ``` 238 | 239 | practical: 240 | ```python 241 | def my_cust_filter(tweet): 242 | if 'text' in tweet and 'cats' in tweet['text']: 243 | return True 244 | else: 245 | return False 246 | 247 | collection.set_custom_filter(my_cust_filter) 248 | ``` 249 | 250 | *returns* a collection or dataset where all tweets will be passed through the filter 251 | 252 | note: this is just a wrapper for smappdragon's [set_custom_filter](https://github.com/SMAPPNYU/smappdragon#set_custom_filter) function. 253 | 254 | # get_tweets_containing 255 | 256 | gets tweets containing the specified term. 257 | 258 | abstract: 259 | ```python 260 | collection.get_tweets_containing(TERM) 261 | ``` 262 | 263 | practical: 264 | ```python 265 | collection.get_tweets_containing('cats') 266 | ``` 267 | 268 | *returns* a collection which will filter out any tweets that do not have the specified term 269 | 270 | # count_tweet_terms 271 | 272 | counts the number of tweets that contain all of these terms 273 | 274 | abstract: 275 | ```python 276 | collection.count_tweet_terms(TERM_1, TERM_2, ETC) 277 | ``` 278 | 279 | practical: 280 | ```python 281 | count = collection.count_tweet_terms('cats', 'dogs') 282 | print(count) 283 | ``` 284 | 285 | *returns* an integer value that counts all the tweets containing the terms 286 | 287 | # count_tweets 288 | 289 | counts the number of tweets in a collection 290 | 291 | abstract: 292 | ```python 293 | collection.count_tweets() 294 | ``` 295 | 296 | practical: 297 | ```python 298 | count = collection.count_tweets() 299 | print(count) 300 | ``` 301 | 302 | *returns* an integer value that counts all the tweets in a collection 303 | 304 | # get_top_terms 305 | 306 | counts the top words in a collection, [english stop words](https://github.com/Alir3z4/stop-words/blob/25c6a0aea665871e887f155b883e950c3743ce50/english.txt) are automatically used as stopwords, otherwise you can specify your own set of stopwords with python stop-words.
the stopwords are words that get ignored and will not show up in the final counts 307 | 308 | abstract: 309 | ```python 310 | collection.get_top_terms(NUMBER_OF_TERMS, LIST_OF_STOP_WORDS) 311 | ``` 312 | 313 | practical: 314 | ```python 315 | count = collection.get_top_terms(5) 316 | #or 317 | count = collection.get_top_terms(5, ['blah', 'it', 'cat']) 318 | print(count) 319 | ``` 320 | 321 | *note* `LIST_OF_STOP_WORDS` is optional, it is set to english by default 322 | 323 | *returns* a dictionary that has all the top_X terms 324 | 325 | # get_tweet_texts 326 | 327 | returns just the text of each tweet in the collection. 328 | 329 | abstract: 330 | ```python 331 | for text in collection.get_tweet_texts(): 332 | print(text) 333 | ``` 334 | 335 | practical: 336 | ```python 337 | for text in collection.get_tweet_texts(): 338 | print(text) 339 | ``` 340 | 341 | *returns* an iterator that returns just the text of each tweet 342 | 343 | # get_date_range 344 | 345 | gets tweets in a date range specified by python datetime objects 346 | 347 | abstract: 348 | ```python 349 | collection.get_date_range(START, END) 350 | ``` 351 | 352 | practical: 353 | ```python 354 | from datetime import datetime 355 | collection.get_date_range(datetime(2014,1,30), datetime(2014,4,30)) 356 | ``` 357 | 358 | *returns* a collection that will only return tweets from the specified datetime range 359 | 360 | # find_date_range 361 | 362 | finds the date range (min/max date in a collection) 363 | 364 | abstract: 365 | ```python 366 | collection.find_date_range() 367 | ``` 368 | 369 | practical: 370 | ```python 371 | from datetime import datetime 372 | range = collection.find_date_range() 373 | print(range) 374 | # or compare to datetime objects 375 | if range['date_min'] > datetime.now(): 376 | print('greater') 377 | elif range['date_max'] < datetime.now(): 378 | print('less') 379 | print('whatever') 380 | ``` 381 | 382 | *output* 383 | ``` 384 | {"date_min":datetime(2016,5,23),"date_max":datetime(2016,5,24)} 385 | ``` 386 | 387 | *returns* a dictionary with two datetime objects 388 | 389 | # tweet_language_is 390 | 391 | only returns tweets where the tweet language is the specified one (differs from [detect_tweet_language](#detect_tweet_language), this just checks the language field reported by twitter on the tweet object, it does not detect) 392 | 393 | abstract: 394 | ```python 395 | collection.tweet_language_is(LANGUAGE_CODES) 396 | ``` 397 | 398 | practical: 399 | ```python 400 | #get tweets in english and french 401 | collection.tweet_language_is('en', 'fr') 402 | ``` 403 | 404 | *returns* a collection where all the tweets have their text language as the specified language 405 | 406 | # detect_tweet_language 407 | 408 | a filter that filters tweets based on language detection. (differs from [tweet_language_is](#tweet_language_is) because it actually detects the language, tweet_language_is just checks the field on the tweet object reported by twitter) 409 | 410 | abstract: 411 | ```python 412 | collection.detect_tweet_language(LANGUAGE_CODES) 413 | ``` 414 | 415 | practical: 416 | ```python 417 | #get tweets in english 418 | collection.detect_tweet_language('en') 419 | #get tweets in english and french 420 | collection.detect_tweet_language('en', 'fr') 421 | ``` 422 | 423 | *returns* a collection where all the tweets have their text language as the specified language 424 | 425 | note: uses [langdetect](https://pypi.python.org/pypi/langdetect?) under the hood. it is a python port of google's language detection tool.
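to make the difference between the two language filters concrete, here is a minimal sketch (both filters return collections, so they chain like the other examples; the language codes are just examples):

```python
# keeps tweets whose 'lang' field, as reported by twitter, is french (no detection performed)
reported_fr = collection.tweet_language_is('fr')

# re-detects the language from the tweet text with langdetect, then keeps french tweets
detected_fr = collection.detect_tweet_language('fr')

for tweet in detected_fr.limit_number_of_tweets(10):
    print(tweet['text'])
```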
426 | 427 | 428 | # user_language_is 429 | 430 | only returns tweets where the user's declared language is the specified one 431 | 432 | abstract: 433 | ```python 434 | collection.user_language_is(LANGUAGE_CODE) 435 | ``` 436 | 437 | practical: 438 | ```python 439 | collection.user_language_is('en') 440 | ``` 441 | 442 | *returns* a collection where all the tweets will come from users whose specified language matches the input 443 | 444 | # exclude_retweets 445 | 446 | excludes retweets from your collection 447 | 448 | abstract: 449 | ```python 450 | collection.exclude_retweets() 451 | ``` 452 | 453 | practical: 454 | ```python 455 | collection.exclude_retweets() 456 | ``` 457 | 458 | *returns* a collection where there are no retweets 459 | 460 | 461 | # get_retweets 462 | 463 | gets all tweets that are retweets from the collection 464 | 465 | abstract: 466 | ```python 467 | collection.get_retweets() 468 | ``` 469 | 470 | practical: 471 | ```python 472 | collection.get_retweets() 473 | ``` 474 | 475 | *returns* a collection where there are only retweets 476 | 477 | # user_location_contains 478 | 479 | returns tweets that have a user location that contains one of the listed terms 480 | 481 | abstract: 482 | ```python 483 | collection.user_location_contains(PLACE_TERM, ANOTHER_PLACE_TERM, ETC) 484 | ``` 485 | 486 | practical: 487 | ```python 488 | collection.user_location_contains('CA', 'FL', 'NY', 'palm springs') 489 | ``` 490 | 491 | *returns* a collection where the user location field of that tweet has any of the specified places 492 | 493 | # user_description_contains 494 | 495 | returns tweets where the user description (for the user tweeting) contains the requested terms 496 | 497 | abstract: 498 | ```python 499 | collection.user_description_contains(TERM, TERM, ETC) 500 | ``` 501 | 502 | practical: 503 | ```python 504 | collection.user_description_contains('dad', 'conservative', 'texas', 'mother') 505 | ``` 506 | 507 | *returns* a collection where the user description field of that tweet has any of the specified terms 508 | 509 | # user_id_is 510 | 511 | returns tweets that match one of the passed in user ids 512 | 513 | abstract: 514 | ```python 515 | collection.user_id_is(ID, ID, ETC) 516 | ``` 517 | 518 | practical: 519 | ```python 520 | collection.user_id_is(379851447, 149751818) 521 | ``` 522 | 523 | *returns* a collection where the user id field matches one of the passed in ids 524 | 525 | # place_name_contains_country 526 | 527 | returns tweets whose twitter place name contains one of the specified countries 528 | 529 | abstract: 530 | ```python 531 | collection.place_name_contains_country(PLACE_TERM, ANOTHER_PLACE_TERM, ETC) 532 | ``` 533 | 534 | practical: 535 | ```python 536 | collection.place_name_contains_country('United States', 'France', 'Spain') 537 | ``` 538 | 539 | *returns* a collection where the places field of that tweet has the specified place 540 | 541 | note: for more information about places see https://dev.twitter.com/overview/api/places 542 | 543 | # within_geobox 544 | 545 | returns tweets that are within a geobox 546 | 547 | abstract: 548 | ```python 549 | collection.within_geobox(sw_longitude, sw_latitude, ne_longitude, ne_latitude) 550 | ``` 551 | 552 | practical: 553 | ```python 554 | collection.within_geobox(-77.042484, 38.886323, -77.010384, 38.894006) 555 | ``` 556 | 557 | *returns* a collection where the tweets streaming through will be from the stated geobox 558 | 559 | note: 560 | sw_longitude, sw_latitude - the southwest corner 561 | ne_longitude, ne_latitude - the northeast corner
562 | geobox specified by points (longitude, latitude) 563 | 564 | # get_geo_enabled 565 | 566 | returns only geotagged tweets 567 | 568 | abstract: 569 | ```python 570 | collection.get_geo_enabled() 571 | ``` 572 | 573 | practical: 574 | ```python 575 | collection.get_geo_enabled() 576 | ``` 577 | 578 | *returns* a collection that only produces geo tagged tweets 579 | 580 | # get_non_geo_enabled 581 | 582 | returns only non geotagged tweets 583 | 584 | abstract: 585 | ```python 586 | collection.get_non_geo_enabled() 587 | ``` 588 | 589 | practical: 590 | ```python 591 | collection.get_non_geo_enabled() 592 | ``` 593 | 594 | *returns* a collection that only produces non geo tagged tweets 595 | 596 | # limit_number_of_tweets 597 | 598 | limits the number of tweets a collection can output 599 | 600 | abstract: 601 | ```python 602 | collection.limit_number_of_tweets(LIMIT_NUMBER) 603 | ``` 604 | 605 | practical: 606 | ```python 607 | collection.limit_number_of_tweets(145) 608 | 609 | for tweet in collection.limit_number_of_tweets(145): 610 | print(tweet) 611 | ``` 612 | 613 | *returns* a collection that is limited in terms of the number of tweets it can output 614 | 615 | note: works differently than expected on datasets, it will apply this limit to each sub collection/file in the dataset, so if you have 5 files in a dataset it would apply a limit of 145 to each file in the dataset, and 616 | you would end up with 145 x 5 = 725 tweets. 617 | 618 | # sample 619 | 620 | gets a sample of tweets from a collection using reservoir sampling 621 | 622 | abstract: 623 | ```python 624 | collection.sample(NUMBER_OF_TWEETS_TO_SAMPLE) 625 | ``` 626 | 627 | practical: 628 | ```python 629 | 630 | for tweet in collection.sample(10): 631 | print(tweet) 632 | ``` 633 | 634 | *returns* a collection that only returns a sample of tweets as big as the number of tweets you specified 635 | 636 | note: you can [read more about reservoir sampling here](http://www.geeksforgeeks.org/reservoir-sampling/) and [here](https://en.wikipedia.org/wiki/Reservoir_sampling). reservoir sampling allows us to sample a data set in one pass without knowing ahead of time how many things are in that dataset and still match the underlying distribution of the data. 637 | 638 | note: if you try to sample more tweets than are in a collection or dataset this method will throw an error. this is because reservoir sampling does not work in this scenario. count your datasets first if you are unsure how many data points are in them. 639 | 640 | # dump_to_bson 641 | 642 | abstract: 643 | ```python 644 | collection.dump_to_bson(output_file) 645 | ``` 646 | 647 | practical: 648 | ```python 649 | collection.dump_to_bson('/Users/blah/your_data.bson') 650 | # or with a dataset dumping to one file 651 | dataset.dump_to_bson('/Users/blah/your_data.bson') 652 | # or with a dataset dumping to one file for each input 653 | dataset.dump_to_bson('/Users/blah/your_data.bson', parallel=True) 654 | ``` 655 | 656 | `num_files` - (similar to the former parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. the dataset will try to write evenly to each file.
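for example, a minimal sketch of the `num_files` argument described above, assuming it is passed as a keyword argument to the dump methods (the path here is just a placeholder):

```python
# spread the dataset's output evenly across 3 output files instead of one
dataset.dump_to_bson('/Users/blah/your_data.bson', num_files=3)
```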
657 | 658 | *input* a path to a bson file 659 | 660 | *output* a bson file with the data from your SmappCollection 661 | 662 | note: if you use the [sample](#sample) method you can no longer use the 'parallel' argument to any dump methods, sample has to override the iterators for aech collection, essentially stripping us of the original iterators. 663 | 664 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 665 | 666 | # dump_to_json 667 | 668 | abstract: 669 | ```python 670 | collection.dump_to_json(output_file) 671 | ``` 672 | 673 | practical: 674 | ```python 675 | collection.dump_to_json('/Users/blah/your_data.json') 676 | # or with a dataset dumping to one file 677 | dataset.dump_to_json('/Users/blah/your_data.json') 678 | # or with a dataset dumping to one file for each input 679 | dataset.dump_to_json('/Users/blah/your_data.json', parallel=True) 680 | ``` 681 | 682 | `num files` - (similar to the former the parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. the data set will try to write evenly to each file. 683 | 684 | *input* a path to a json file 685 | 686 | *output* a json file with the data from your SmappCollection 687 | 688 | note: if you use the [sample](#sample) method you can no longer use the 'parallel' argument to any dump methods, sample has to override the iterators for aech collection, essentially stripping us of the original iterators. 689 | 690 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 691 | 692 | # dump_to_csv 693 | 694 | dumps a collection/dataset to a csv based on the fields you specify. can see the fields inside a tweet object [here](https://dev.twitter.com/overview/api/tweets). 695 | 696 | abstract: 697 | ```python 698 | collection.dump_to_csv('/PATH/TO/OUTPUT/FILE.csv', ['FIELD1', 'FIELD2', 'FIELD3.SUBFIELD', ETC]) 699 | ``` 700 | 701 | practical: 702 | ```python 703 | collection.dump_to_csv('~/smappstuff/file.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 704 | # or 705 | collection.limit_number_of_tweets(5).dump_to_csv('/Users/kevin/work/smappwork/file.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 706 | # or with a dataset dumping to one file 707 | dataset.dump_to_csv('/Users/blah/your_data.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 708 | # or with a dataset dumping to one file for each input 709 | dataset.dump_to_csv('/Users/blah/your_data.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], parallel=True) 710 | # or if you have '.' 
in input fields that you want interpreted literally 711 | collection.dump_to_csv('out_file.csv', ['id_str'], top_level=True) 712 | # or if you want to omit the header 713 | collection.dump_to_csv('out_file.csv', ['id_str'], top_level=False) 714 | ``` 715 | 716 | *input* a path to a csv file and fields to keep 717 | 718 | ```python 719 | import pysmap 720 | 721 | collection = pysmap.SmappCollection('json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json') 722 | # or dataset 723 | dataset = pysmap.SmappDataset( 724 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json'], 725 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_19_2016__00_00_00__23_59_59.json'], 726 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_20_2016__00_00_00__23_59_59.json'] 727 | ) 728 | 729 | field_list = ['id_str', 730 | 'coordinates.coordinates.0', 731 | 'coordinates.coordinates.1', 732 | 'user.id_str', 733 | 'user.lang', 734 | 'lang', 735 | 'text', 736 | 'user.screen_name', 737 | 'user.location', 738 | 'user.description', 739 | 'created_at', 740 | 'user.friends_count', 741 | 'user.followers_count', 742 | 'retweet_count', 743 | 'entities.urls.0.expanded_url', 744 | 'entities.urls.1.expanded_url', 745 | 'entities.urls.2.expanded_url', 746 | 'entities.urls.3.expanded_url', 747 | 'entities.urls.4.expanded_url'] 748 | 749 | dataset.dump_to_csv('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.csv', field_list) 750 | ``` 751 | 752 | *output* a csv file with the data from your SmappCollection, but only the fields you chose to keep 753 | 754 | ```csv 755 | id_str,coordinates.coordinates.0,coordinates.coordinates.1,user.id_str,user.lang,lang,text,user.screen_name,user.location,user.description,created_at,user.friends_count,user.followers_count,retweet_count,entities.urls.0.expanded_url,entities.urls.1.expanded_url,entities.urls.2.expanded_url,entities.urls.3.expanded_url,entities.urls.4.expanded_url 756 | 757 | 788556059375874048,50,50,2240756971,en,en,RT @dailypenn: The DP and @WellesleyNews are jointly endorsing Wellesley alum @HillaryClinton over Wharton ’68 @realDonaldTrump.… ,CorrectRecord,,Correct The Record is a strategic research and rapid response team designed to defend Hillary Clinton from baseless attacks.,Wed Oct 19 01:43:09 +0000 2016,224,23080,0,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/ 758 | 759 | 788556059317186560,,,4655522325,fr,fr,Barack Obama conseille à Donald Trump « d’arrêter de pleurnicher » - https://t.co/eEl1mOnIwp https://t.co/8EeOGya28r,metrodakar_net,Senegal,,Wed Oct 19 01:43:09 +0000 2016,110,657,0,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,,,, 760 | ``` 761 | 762 | `num files` - (similar to the former the parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. 
the data set will try to write evenly to each file. 763 | 764 | note: to get things inside a list you need to refer to their list index. its better to overshoot (so if you want to get 5 entites urls where there are 5) you would use `['entities.urls.0.expanded_url','entities.urls.1.expanded_url','entities.urls.2.expanded_url','entities.urls.3.expanded_url','entities.urls.4.expanded_url']`, for tweet objects with less than 5 `urls` entities this will fill out urls up to 5 urls, if there are less than 5 the extra ones will be empty `,,` fields 765 | 766 | note: empty lists `[]` will return nothing. you must specify fields. 767 | 768 | note: fields that have no value will appear empty `,,` 769 | 770 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 771 | 772 | # dump_to_sqlite_db 773 | 774 | dumps all tweets (only the fields you specify) to an sqlite database file 775 | 776 | abstract: 777 | ```python 778 | collection.dump_to_sqlite_db('/PATH/TO/OUTPUT/FILE.db', ['FIELD1', 'FIELD2', 'FIELD3.SUBFIELD', ETC]) 779 | ``` 780 | 781 | pratical: 782 | ```python 783 | import pysmap 784 | 785 | collection.dump_to_sqlite_db('~/smappstuff/file.db', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 786 | # or 787 | collection.limit_number_of_tweets(5).dump_to_sqlite_db('/Users/kevin/work/smappwork/file.db', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 788 | # or 789 | dataset = pysmap.SmappDataset( 790 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json'], 791 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_19_2016__00_00_00__23_59_59.json'], 792 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_20_2016__00_00_00__23_59_59.json'] 793 | ) 794 | 795 | field_list = ['id_str', 796 | 'coordinates.coordinates.0', 797 | 'coordinates.coordinates.1', 798 | 'user.id_str', 799 | 'user.lang', 800 | 'lang', 801 | 'text', 802 | 'user.screen_name', 803 | 'user.location', 804 | 'user.description', 805 | 'created_at', 806 | 'user.friends_count', 807 | 'user.followers_count', 808 | 'retweet_count', 809 | 'entities.urls.0.expanded_url', 810 | 'entities.urls.1.expanded_url', 811 | 'entities.urls.2.expanded_url', 812 | 'entities.urls.3.expanded_url', 813 | 'entities.urls.4.expanded_url'] 814 | 815 | dataset.dump_to_sqlite_db('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.db', field_list) 816 | # or with a dataset dumping to one file for each input 817 | dataset.dump_to_sqlite_db('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.db', field_list, parallel=True) 818 | ``` 819 | 820 | *input* a collection object and a list of fields/subfields 821 | ``` 822 | [ 823 | 'id_str', 824 | 'coordinates.coordinates.0', 825 | 'coordinates.coordinates.1', 826 | 'user.id_str', 827 | 'user.lang', 828 | 'lang', 829 | 'text', 830 | 'user.screen_name', 831 | 'user.location', 832 | 'user.description', 833 | 'created_at', 834 | 'user.friends_count', 835 | 'user.followers_count', 836 | 'retweet_count', 837 | 'entities.urls.0.expanded_url', 838 | 'entities.urls.1.expanded_url', 839 | 'entities.urls.2.expanded_url', 840 | 'entities.urls.3.expanded_url', 841 | 'entities.urls.4.expanded_url' 842 | ] 843 | ``` 844 | 845 | *output* an sqlite db that looks like so: 846 | ``` 847 | 
sqlite> .schema 848 | CREATE TABLE data (id_str,user__id_str,text,entities__urls__0__expanded_url,entities__urls__1__expanded_url,entities__media__0__expanded_url,entities__media__1__expanded_url); 849 | sqlite> .tables 850 | data 851 | sqlite> select * from data; 852 | 686799531875405824|491074580|@_tessr @ProductHunt No one has stolen me yet. Security through obscurity.|NULL|NULL|NULL|NULL 853 | 686661056115175425|491074580|Predictions of peach's demise already starting. Nice.|NULL|NULL|NULL|NULL 854 | 686956278099349506|491074580|When was the state of the union first started? Ok wow since the office has existed. https://t.co/Cqgjkhr3Aa|https://en.wikipedia.org/wiki/State_of_the_Union#History|NULL|NULL|NULL 855 | 687115788487122944|491074580|RT @lessig: Looks like the @citizenequality act got a supporter tonight. Thank you @POTUS|NULL|NULL|NULL|NULL 856 | 686661056115175425|491074580|Predictions of peach's demise already starting. Nice.|NULL|NULL|NULL|NULL 857 | 687008713039835136|491074580|#GOPDebate approaching. Can't wait to observer a trump in its natural habitat!|NULL|NULL|NULL|NULL 858 | 687208777561448448|18673945|@yvanscher hey! saw u upvoted Cubeit on ProductHunt. Any feedback on how we can make Cubeit better for you? :) Thanks!|NULL|NULL|NULL|NULL 859 | 686662539913084928|491074580|RT @PopSci: iOS 9.3 update will tint your screen at night, for your health https://t.co/zrDt4TsoXB https://t.co/yXCEGQPHWp|http://pops.ci/cJWqhM|NULL|http://twitter.com/PopSci/status/686661925267206144/photo/1|NULL 860 | ``` 861 | 862 | note: the dump to sqlite method does not have a num_files (used to paralel) argument because the performance is bad with the sample method. 863 | 864 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 865 | 866 | # get_top_entities 867 | 868 | returns the top twitter entites from a tweet object, you can [read about twitter entities here](https://dev.twitter.com/overview/api/entities-in-twitter-objects) 869 | 870 | abstract: 871 | ```python 872 | collection.top_entities({'ENTITY_FIELD':NUMBER_OF_TOP_TERMS, 'ENTITY_FIELD':NUMBER_OF_TOP_TERMS, 'ENTITY_FIELD':NUMBER_OF_TOP_TERMS}) 873 | ``` 874 | 875 | practical: 876 | ```python 877 | collection.top_entities({'user_mentions':5, 'media':3, 'hashtags':5, 'urls':0, 'user_mentions':2, 'symbols':2}) 878 | # or 879 | collection.top_entities({'hashtags':5}) 880 | ``` 881 | 882 | *returns* a dictionary containing tho requested entities and the counts for each entity 883 | 884 | input: 885 | ```python 886 | print collection.top_entities({'user_mentions':5, 'media':3, 'hashtags':5}) 887 | ``` 888 | 889 | output: 890 | ``` 891 | { 892 | "hashtags": { 893 | "JadeHelm": 118, 894 | "pjnet": 26, 895 | "jadehelm": 111, 896 | "falseflag": 32, 897 | "2a": 26 898 | }, 899 | "user_mentions": { 900 | "1619936671": 41, 901 | "27234909": 56, 902 | "733417892": 121, 903 | "10228272": 75, 904 | "233498836": 58 905 | }, 906 | "media": { 907 | "https://t.co/ORaTXOM2oX": 55, 908 | "https://t.co/pAfigDPcNc": 27, 909 | "https://t.co/TH8TmGuYww": 24 910 | } 911 | } 912 | ``` 913 | 914 | *returns* a dictionary filled with the top terms you requested 915 | 916 | note: passing 0 to a field like `'hashtags':0` returns all the hashtags 917 | 918 | note: no support for extended entities, retweet entities, user entites, or direct message entities. 
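since the counts come back as a plain dictionary (see the output above), you can work with the result directly; a small sketch, assuming `collection` is any SmappCollection or SmappDataset:

```python
top = collection.top_entities({'hashtags': 5, 'urls': 5})

# entries that could not be filled come back as null/None (see the note below), so treat them as 0
for hashtag, count in sorted(top['hashtags'].items(), key=lambda kv: kv[1] or 0, reverse=True):
    print(hashtag, count)
```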
919 | 920 | note: if not enough entity objects are returned they get filled into the dictionary with null like so: 921 | 922 | ``` 923 | { 924 | "symbols": { 925 | "0": null, 926 | "1": null, 927 | "hould": 1 928 | } 929 | } 930 | ``` 931 | 932 | # get_top_hashtags 933 | 934 | get the top hashtags from a collection 935 | 936 | abstract: 937 | ```python 938 | collection.get_top_hashtags(NUMBER_TOP) 939 | ``` 940 | 941 | practical: 942 | ```python 943 | hashtags = collection.get_top_hashtags(5) 944 | print(hashtags) 945 | ``` 946 | 947 | *returns* the top hashtags as a dictionary 948 | 949 | # get_top_urls 950 | 951 | get the top urls from a collection 952 | 953 | abstract: 954 | ```python 955 | collection.get_top_urls(NUMBER_TOP) 956 | ``` 957 | 958 | practical: 959 | ```python 960 | urls = collection.get_top_urls(6) 961 | print(urls) 962 | ``` 963 | 964 | *returns* the top urls from a collection 965 | 966 | # get_top_mentions 967 | 968 | get the top mentions from a collection (these are @ mentions) 969 | 970 | abstract: 971 | ```python 972 | collection.get_top_mentions(NUMBER_TOP) 973 | ``` 974 | 975 | practical: 976 | ```python 977 | mentions = collection.get_top_mentions(40) 978 | ``` 979 | 980 | *returns* the top @ mentions from a collection 981 | 982 | # get_top_media 983 | 984 | get the top media url references 985 | 986 | abstract: 987 | ```python 988 | collection.get_top_media(NUMBER_TOP) 989 | ``` 990 | 991 | practical: 992 | ```python 993 | media = collection.get_top_media(3) 994 | print(media) 995 | ``` 996 | 997 | *returns* the top media urls from a collection 998 | 999 | # get_top_symbols 1000 | 1001 | get the top symbols in a collection 1002 | 1003 | abstract: 1004 | ```python 1005 | collection.get_top_symbols(NUMBER_TOP) 1006 | ``` 1007 | 1008 | practical: 1009 | ```python 1010 | symbols = collection.get_top_symbols(10) 1011 | print(symbols) 1012 | ``` 1013 | 1014 | *returns* the top symbols from a collection, the number of top symbols returned depends on how many you specified as input 1015 | 1016 | # contributors 1017 | 1018 | you might ask the difference between pysmap and smappdragon. pysmap is easier to use but less flexible/more rigid in its implementation. smappdragon is a flexible tool for programmers to use, you can build arbitrary filters for data, pysmap is just a set of filters. 1019 | 1020 | methods on smappdragon are lower level and more general. whereas methods on pysmap would be specific and rigid. so for example on smappdragon, you could [get all the entities](https://github.com/SMAPPNYU/smappdragon#top_entities), on pysmap you would have to ask for hashtags, mentions, etc. (which are all entities). 1021 | 1022 | another example, something like [apply_labels](https://github.com/SMAPPNYU/smapp-toolkit#apply_labels) would go on smappdragon, not pysmap. 1023 | 1024 | # viz 1025 | 1026 | a set of visualization tools, basically ways to graph and visualize a [SmappCollection](#smapp_collection) 1027 | 1028 | # plots 1029 | 1030 | a set of graph tools 1031 | 1032 | # bar_graph_tweet_field_grouped_by_period 1033 | 1034 | a tool that can be used to create generalized bar graphs from a smapp collection and various tweet data.
1035 | 1036 | abstract: 1037 | ```python 1038 | bar_graph_tweet_field_grouped_by_period(SMAPP_COLLECTION, TWEET_FIELD, TWEET_FIELD_VALUES_TO_MATCH, CUSTOM_FILTER_FUNCTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1039 | ``` 1040 | 1041 | practical: 1042 | ```python 1043 | from pysmap import SmappCollection, plots 1044 | 1045 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1046 | output_path = 'doc/output_graph.html' 1047 | 1048 | def custom_filter(tweet): 1049 | return True 1050 | 1051 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'user.lang', ['en', 'fr', 'es'], custom_filter, 'weeks', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1052 | ``` 1053 | 1054 | *returns* an html graph file and opens the graph in the default browser of the user 1055 | 1056 | # bar_graph_languages 1057 | 1058 | make a bar graph of the number of tweets containing the specified languages 1059 | 1060 | abstract: 1061 | ```python 1062 | bar_graph_languages(SMAPP_COLLECTION, LANGUAGES_TO_MATCH, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1063 | ``` 1064 | 1065 | practical: 1066 | ```python 1067 | from pysmap import SmappCollection, plots 1068 | 1069 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1070 | output_path = 'doc/output_graph.html' 1071 | 1072 | plots.bar_graph_languages(collection, ['en', 'fr', 'es'], 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1073 | ``` 1074 | 1075 | *returns* an html graph file and opens the graph in the default browser of the user 1076 | 1077 | # bar_graph_user_languages 1078 | 1079 | graph all the tweets where the users who made the tweets have one of the specified languages 1080 | 1081 | abstract: 1082 | ```python 1083 | bar_graph_user_languages(SMAPP_COLLECTION, LANGUAGES_TO_MATCH, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1084 | ``` 1085 | 1086 | practical: 1087 | ```python 1088 | from pysmap import SmappCollection, plots 1089 | 1090 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1091 | output_path = 'doc/output_graph.html' 1092 | 1093 | plots.bar_graph_user_languages(collection, ['en', 'fr', 'es'], 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1094 | ``` 1095 | 1096 | *returns* an html graph file and opens the graph in the default browser of the user 1097 | 1098 | # bar_graph_tweets 1099 | 1100 | graph all tweets per time period 1101 | 1102 | abstract: 1103 | ```python 1104 | bar_graph_tweets(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1105 | ``` 1106 | 1107 | practical: 1108 | ```python 1109 | from pysmap import SmappCollection, plots 1110 | 1111 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1112 | output_path = 'doc/output_graph.html' 1113 | 1114 | bar_graph_tweets(collection, period_type, start, end, output_path, 'time', 'tweet count', 'tweet count v time') 1115 | ``` 1116 | 1117 | *returns* an html graph file and opens the graph in the default browser of the user 1118 | 1119 | # bar_graph_tweets_with_urls 1120 | 1121 | graph all tweets that contain urls by time period 1122 | 1123 | abstract: 1124 | ```python 1125 | bar_graph_tweets_with_urls(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, 
X_LABEL, Y_LABEL, GRAPH_TITLE) 1126 | ``` 1127 | 1128 | practical: 1129 | ```python 1130 | from pysmap import SmappCollection, plots 1131 | 1132 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1133 | output_path = 'doc/output_graph.html' 1134 | 1135 | plots.bar_graph_tweets_with_urls(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1136 | ``` 1137 | 1138 | *returns* an html graph file and opens the graph in the default browser of the user 1139 | 1140 | # bar_graph_tweets_with_media 1141 | 1142 | graph all tweets that contain media (like images) by time period 1143 | 1144 | abstract: 1145 | ```python 1146 | bar_graph_tweets_with_media(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1147 | ``` 1148 | 1149 | practical: 1150 | ```python 1151 | from pysmap import SmappCollection, plots 1152 | 1153 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1154 | output_path = 'doc/output_graph.html' 1155 | 1156 | plots.bar_graph_tweets_with_media(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1157 | ``` 1158 | 1159 | *returns* an html graph file and opens the graph in the default browser of the user 1160 | 1161 | # bar_graph_tweets_with_mentions 1162 | 1163 | graph all tweets that contain user mentions by time period 1164 | 1165 | abstract: 1166 | ```python 1167 | bar_graph_tweets_with_mentions(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1168 | ``` 1169 | 1170 | practical: 1171 | ```python 1172 | from pysmap import SmappCollection, plots 1173 | 1174 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1175 | output_path = 'doc/output_graph.html' 1176 | 1177 | plots.bar_graph_tweets_with_mentions(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1178 | ``` 1179 | 1180 | *returns* an html graph file and opens the graph in the default browser of the user 1181 | 1182 | # bar_graph_tweets_with_hashtags 1183 | 1184 | graph all tweets that contain hashtags by time period 1185 | 1186 | abstract: 1187 | ```python 1188 | bar_graph_tweets_with_hashtags(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1189 | ``` 1190 | 1191 | practical: 1192 | ```python 1193 | from pysmap import SmappCollection, plots 1194 | 1195 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1196 | output_path = 'doc/output_graph.html' 1197 | 1198 | plots.bar_graph_tweets_with_hashtags(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1199 | ``` 1200 | 1201 | *returns* an html graph file and opens the graph in the default browser of the user 1202 | 1203 | # bar_graph_tweets_with_symbols 1204 | 1205 | graph all tweets that contain symbols (like stock tickers, $AAPL, $GOOG, $TWTR) by time period 1206 | 1207 | abstract: 1208 | ```python 1209 | bar_graph_tweets_with_symbols(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1210 | ``` 1211 | 1212 | practical: 1213 | ```python 1214 | from pysmap import SmappCollection, plots 1215 | 1216 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1217 | output_path = 'doc/output_graph.html' 1218 | 1219 | 
plots.bar_graph_tweets_with_symbols(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1220 | ``` 1221 | 1222 | *returns* an html graph file and opens the graph in the default browser of the user 1223 | 1224 | # bar_graph_tweets_with_retweets 1225 | 1226 | graph all tweets that are retweets by time period 1227 | 1228 | abstract: 1229 | ```python 1230 | bar_graph_tweets_with_retweets(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1231 | ``` 1232 | 1233 | practical: 1234 | ```python 1235 | from pysmap import SmappCollection, plots 1236 | 1237 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1238 | output_path = 'doc/output_graph.html' 1239 | 1240 | plots.bar_graph_tweets_with_retweets(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1241 | ``` 1242 | 1243 | *returns* an html graph file and opens the graph in the default browser of the user 1244 | 1245 | # bar_graph_tweets_with_location 1246 | 1247 | graph all tweets that have a location field attached to them 1248 | 1249 | abstract: 1250 | ```python 1251 | bar_graph_tweets_with_location(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1252 | ``` 1253 | 1254 | practical: 1255 | ```python 1256 | from pysmap import SmappCollection, plots 1257 | 1258 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1259 | output_path = 'doc/output_graph.html' 1260 | 1261 | plots.bar_graph_tweets_with_location(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1262 | ``` 1263 | 1264 | *returns* an html graph file and opens the graph in the default browser of the user 1265 | 1266 | # networks 1267 | 1268 | code for making network graphs of twitter data 1269 | 1270 | # retweet_network 1271 | 1272 | export a retweet graph using the `networkx` library where users are nodes, retweets are directed edges. 1273 | 1274 | abstract: 1275 | ```python 1276 | import networkx as nx 1277 | from pysmap import networks 1278 | 1279 | digraph = networks.retweet_network(COLLECTION_OBJECT, TWEET_METADATA, USER_METADATA) 1280 | nx.write_graphml(digraph, '/path/where/you/want/your.graphml') 1281 | ``` 1282 | 1283 | practical: 1284 | ```python 1285 | import networkx as nx 1286 | from pysmap import networks 1287 | 1288 | tweet_fields = ['id_str', 'retweeted_status.id_str', 'timestamp', 'text', 'lang'] 1289 | user_fields = ['id_str', 'screen_name', 'location', 'description'] 1290 | 1291 | digraph = networks.retweet_network(collection, tweet_fields, user_fields) 1292 | nx.write_graphml(digraph, '~/smappdata/collection_retweets.graphml') 1293 | 1294 | # or omitting metadata (which saves space) 1295 | col = collection.get_tweets_containing('cats').get_retweets() 1296 | digraph = networks.retweet_network(col, [], []) 1297 | nx.write_graphml(digraph, '~/smappdata/collection_sparse_retweets.graphml') 1298 | ``` 1299 | 1300 | *input* 1301 | 1302 | `collection` - [smapp_dataset](#smapp_dataset) or [smapp_collection](#smapp_collection) 1303 | 1304 | `user_fields` - is a list of fields from the User object that will be included as attributes of the nodes. 1305 | 1306 | `tweet_fields` - is a list of the fields from the Tweet object that will be included as attributes of the edges. 
1307 | 1308 | *output* 1309 | 1310 | the resulting `.graphml` file can then be opened in graph analysis/visualization programs such as [Gephi](http://gephi.github.io/) or [Pajek](http://vlado.fmf.uni-lj.si/pub/networks/pajek/). 1311 | 1312 | note: if the collection result includes non-retweets as well, users with no retweets 1313 | will also appear in the graph as isolated nodes. only retweets are edges in the resulting graph. 1314 | 1315 | note: nodes and edges have attributes attached to them, which are customizable using the `user_fields` and `tweet_fields` arguments. 1316 | 1317 | note: for large graphs where the structure is interesting but the tweet text itself is not, it is advisable to omit most of the metadata. 1318 | 1319 | note: the `networkx` library also provides algorithms for [visualization](http://networkx.github.io/documentation/networkx-1.9.1/reference/drawing.html) and [analysis](http://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.html). 1320 | 1321 | note: there are no defaults, you have to specify the fields you want. 1322 | 1323 | 1324 | # models 1325 | 1326 | pretrained models for various tasks 1327 | 1328 | 1329 | # crowd_model 1330 | 1331 | a model for detecting crowds of people 1332 | 1333 | usage: 1334 | ```python 1335 | #downloads the model to this path and loads it 1336 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=True, talk=True) 1337 | # or just load the model from this path (default behavior) 1338 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=False, talk=False) 1339 | 1340 | # predict from filenames 1341 | files = ['img1.jpg', 'img2.jpg'] 1342 | preds = cm.predict_files(files) 1343 | 1344 | # or predict from image data (here i used opencv to read images) 1345 | imgs = np.zeros((len(files),224,224,3)) 1346 | for i, file in enumerate(files): 1347 | img = cv2.imread(file).astype('float64') 1348 | img = cv2.resize(img, (224,224)) 1349 | imgs[i] = img 1350 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=False, talk=False) 1351 | preds = cm.predict_imgs(imgs) 1352 | ``` 1353 | 1354 | `dl` - whether or not the model class should download the model file (set to False by default, if the model path you give doesn't exist it will try to download anyway) 1355 | 1356 | `talk` - the class prints out what it's doing, set to True by default. 1357 | 1358 | note: images on disk will be resized to 224x224, if you put in your own image data it should be sized 224x224x3, when in doubt check the function's docstring with ?predict_imgs 1359 | 1360 | *input* 1361 | 1362 | a model path to download or an already downloaded model path, 1363 | 1364 | image file names or image data in a numpy array 1365 | 1366 | *output* 1367 | 1368 | probability of each image being a crowd 1369 | 1370 | # developer note '.' field splitting: 1371 | 1372 | there was a habit at the lab of creating one helper function that would take a tweet and a '.' delimited list of fields, split on this character to traverse into a json and save lots of coding time and lines of code. i wanted to leave a few lines here to explain why this is a bad idea in the context of the smapp lab: 1373 | 1374 | 1 - it makes code difficult to understand for grad students, we want them to be able to see exactly what a function does without needing to be a python expert. 1375 | 1376 | 2 - it causes problems if you want to traverse into a json object but one of the fields you want 3 levels in has a '.' as part of its name.
now twitter doesn't do this, but sometimes people change their data to csv, data gets messed up, or people want to use slightly different data. the tools should work for whatever people throw at them, not exclusively for twitter data. 1377 | 1378 | 3 - the obvious solution is to offer a function where the user can define a splitting character, but that would be confusing to read. in the end it would only save a few lines of code while reducing readability drastically, so i concluded to go another route. 1379 | 1380 | if you want a way to declare nested traversals see: [https://github.com/SMAPPNYU/smappdragon#set_filter](https://github.com/SMAPPNYU/smappdragon#set_filter) 1381 | 1382 | # developer note publishing: 1383 | 1384 | 1 - make a ~/.pypirc file with: 1385 | 1386 | [distutils] 1387 | index-servers = pypi 1388 | 1389 | [pypi] 1390 | repository: https://pypi.python.org/pypi 1391 | username: YOUR_PYPI_USERNAME 1392 | password: YOUR_PASSWORD 1393 | 1394 | 2 - pip install twine 1395 | 1396 | 3 - python setup.py sdist 1397 | 1398 | 4 - twine upload dist/* 1399 | 1400 | # author 1401 | 1402 | [yvan](https://github.com/yvan) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pysmap 2 | dependencies: 3 | - python=3.6 4 | - bokeh 5 | - pytz 6 | - pandas 7 | - pip: 8 | - smappdragon>=0.0.41 9 | - langdetect>=1.0.6 10 | - stop-words>=2015.2.23.1 11 | - networkx>=1.11 12 | - pymongo>=3.2.2 13 | - matplotlib>=2.0.0 14 | - keras>=2.0.8 15 | - opencv-python>=3.3.0.9 16 | - tensorflow>=1.3.0 17 | - h5py>=2.7.0 -------------------------------------------------------------------------------- /pysmap/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from pysmap.twitterutil.smapp_collection import SmappCollection 6 | from pysmap.twitterutil.smapp_dataset import SmappDataset 7 | from pysmap.viz import plots 8 | from pysmap.viz import networks -------------------------------------------------------------------------------- /pysmap/mltools/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module indicator for mltools 3 | ''' 4 | 5 | from . 
import smapp_model, crowd_model 6 | __all__ = ['smapp_model', 'crowd_model'] -------------------------------------------------------------------------------- /pysmap/mltools/crowd_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gc, abc, cv2, requests, os, shutil, gzip 3 | 4 | from pysmap.mltools.smapp_model import SmappModel 5 | from keras.models import load_model 6 | from keras.applications.resnet50 import preprocess_input 7 | 8 | def download_file(url, local_url): 9 | r = requests.get(url, stream=True) 10 | with open(local_url, 'wb') as f: 11 | shutil.copyfileobj(r.raw, f) 12 | 13 | def unzip_file(local_url, model_path): 14 | with gzip.open(local_url, 'rb') as fin: 15 | with open(model_path, 'wb') as fout: 16 | shutil.copyfileobj(fin, fout) 17 | 18 | class CrowdModel(SmappModel): 19 | __metaclass__ = abc.ABCMeta 20 | 21 | def __init__(self, model_path, model_dl='http://165.227.83.131:82/', dl=False, talk=True): 22 | if dl or not os.path.exists(model_path): 23 | url = os.path.join(model_dl,'crowdv1.h5.gz') 24 | local_url = os.path.join('/'.join(model_path.split('/')[:-1]),'crowdv1.h5.gz') 25 | if talk: print('downloading model file to: {}'.format(local_url)) 26 | download_file(url, local_url) 27 | unzip_file(local_url, model_path) 28 | if talk: print('downloaded model file to: {}'.format(model_path)) 29 | if talk: print('loading model from from: {}'.format(model_path)) 30 | self.model = load_model(model_path) 31 | 32 | def predict_imgs(self, imgs): 33 | ''' 34 | takes an image input and predicts on it 35 | this expects an ndarray (heightxwidthxchannels) 36 | this model shouldbe a (Nx224x224x3) numpy array 37 | this method it noce if you want to do preprocessing 38 | then predict results on those preprocessed images 39 | this function expects the image array to be jpg 40 | ''' 41 | imgs = preprocess_input(imgs) 42 | return self.model.predict(imgs) 43 | 44 | def predict_files(self, files): 45 | ''' 46 | reads files off disk, resizes them 47 | and then predicts them, files should 48 | be a list or itrerable of file paths 49 | that lead to images, they are then 50 | loaded with opencv, resized, and predicted 51 | ''' 52 | imgs = [0]*len(files) 53 | for i, file in enumerate(files): 54 | img = cv2.imread(file).astype('float64') 55 | img = cv2.resize(img, (224,224)) 56 | img = preprocess_input(img) 57 | if img is None: 58 | print('failed to open: {}, continuing...'.format(file)) 59 | imgs[i] = img 60 | return self.model.predict(np.array(imgs)) 61 | 62 | def view_predictions(imgs, y_pred, y_true, start, end): 63 | ''' 64 | displays the images in a grid formation from the 65 | start index to the end index, y_true are the true 66 | labels for the images, y_pred should be your predictions 67 | imgs should be an (NxWxHx3) array of your input images 68 | 69 | ''' 70 | fig, ax = plt.subplots(16, 4, sharex='col', sharey='row', figsize=(25, 100)) 71 | for i, img in enumerate(imgs[start:end]): 72 | pred_label = y_pred[start:end][i] 73 | actual_label = y_true[i] 74 | ax[i//4][i%4].imshow(img) 75 | ax[i//4][i%4].annotate(pred_label[0], 76 | (0,0), (0, -32), xycoords='axes fraction', 77 | textcoords='offset points', va='top', size=20) 78 | ax[i//4][i%4].annotate(actual_label, 79 | (0,0), (200, -32), xycoords='axes fraction', 80 | textcoords='offset points', va='top', size=20) 81 | ax[i//4][i%4].axis('off') 82 | return ax 83 | -------------------------------------------------------------------------------- 
/pysmap/mltools/smapp_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this is the base loader class 3 | it will handle all the nitty gritty 4 | of dealing with models, while 5 | individual classes will just 6 | ''' 7 | 8 | import gc 9 | import abc 10 | 11 | class SmappModel(object): 12 | __metaclass__ = abc.ABCMeta 13 | 14 | @abc.abstractmethod 15 | def __init__(self, model_path): 16 | pass 17 | 18 | def delete(objs): 19 | ''' 20 | deallocates the memory occupide by this model 21 | added as convinienve function to make ti easier 22 | to avoid running out of ram when working with 23 | several models at once, usage: delete(model1, model2) 24 | ''' 25 | for obj in objs: del obj 26 | return gc.collect() 27 | 28 | -------------------------------------------------------------------------------- /pysmap/twitterutil/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from . import smapp_collection, smapp_dataset 6 | __all__ = ['smapp_collection', 'smapp_dataset'] -------------------------------------------------------------------------------- /pysmap/twitterutil/smapp_collection.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import abc 3 | import copy 4 | import random 5 | import sqlite3 6 | import operator 7 | import itertools 8 | import smappdragon 9 | 10 | from datetime import datetime 11 | from bson import BSON, json_util 12 | from stop_words import get_stop_words 13 | from langdetect import detect, lang_detect_exception, DetectorFactory 14 | 15 | class SmappCollection(object): 16 | def __init__(self, data_source_type, *args): 17 | if data_source_type == 'bson': 18 | self.collection = smappdragon.BsonCollection(args[0]) 19 | elif data_source_type == 'json': 20 | self.collection = smappdragon.JsonCollection(args[0]) 21 | elif data_source_type == 'csv': 22 | self.collection = smappdragon.CsvCollection(args[0]) 23 | elif data_source_type == 'mongo': 24 | self.collection = smappdragon.MongoCollection( 25 | args[0], 26 | args[1], 27 | args[2], 28 | args[3], 29 | args[4], 30 | args[5] 31 | ) 32 | else: 33 | raise IOError('Could not find your input, it\'s mispelled or doesn\'t exist.') 34 | 35 | def __iter__(self): 36 | for tweet in self.collection.get_iterator(): 37 | yield tweet 38 | 39 | def set_custom_filter(self, custom_filter): 40 | cp = copy.deepcopy(self) 41 | cp.collection.set_custom_filter(custom_filter) 42 | return cp 43 | 44 | def get_tweet_texts(self): 45 | for tweet in self.collection.get_iterator(): 46 | yield tweet['text'] 47 | 48 | def count_tweets(self): 49 | return sum(1 for tweet in self.collection.get_iterator()) 50 | 51 | def count_tweet_terms(self, *args): 52 | def tweet_contains_terms(tweet): 53 | return any([term in tweet['text'] for term in args]) 54 | cp = copy.deepcopy(self) 55 | return sum(1 for tweet in cp.collection.set_custom_filter(tweet_contains_terms).get_iterator()) 56 | 57 | def get_tweets_containing(self, *args): 58 | def tweet_contains_terms(tweet): 59 | return any([term in tweet['text'] for term in args]) 60 | cp = copy.deepcopy(self) 61 | cp.collection.set_custom_filter(tweet_contains_terms) 62 | return cp 63 | 64 | def get_date_range(self, start, end): 65 | if type(start) is not datetime or type(end) is not datetime: 66 | raise ValueError('inputs to date_range must be python datetime.date objects') 67 | def tweet_is_in_date_range(tweet): 68 | return 
(datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') >= start) and (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') < end) 69 | cp = copy.deepcopy(self) 70 | cp.collection.set_custom_filter(tweet_is_in_date_range) 71 | return cp 72 | 73 | def find_date_range(self): 74 | date_min = datetime.max 75 | date_max = datetime.min 76 | for tweet in self.collection.get_iterator(): 77 | date_to_process = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') 78 | if date_to_process <= date_min: 79 | date_min = date_to_process 80 | if date_to_process >= date_max: 81 | date_max = date_to_process 82 | return {"date_min":date_min,"date_max":date_max} 83 | 84 | def tweet_language_is(self, *args): 85 | def language_in_tweet(tweet): 86 | return any(['lang' in tweet and language_code in tweet['lang'] for language_code in args]) 87 | cp = copy.deepcopy(self) 88 | cp.collection.set_custom_filter(language_in_tweet) 89 | return cp 90 | 91 | def detect_tweet_language(self, *args): 92 | DetectorFactory.seed = 0 93 | def language_in_tweet(tweet): 94 | detected_lang = None 95 | try: 96 | detected_lang = detect(tweet['text']) 97 | except lang_detect_exception.LangDetectException: 98 | pass 99 | return any([detected_lang in args]) 100 | cp = copy.deepcopy(self) 101 | cp.collection.set_custom_filter(language_in_tweet) 102 | return cp 103 | 104 | def user_language_is(self, *args): 105 | def language_in_tweet(tweet): 106 | return any([language_code in tweet['user']['lang'] for language_code in args]) 107 | cp = copy.deepcopy(self) 108 | cp.collection.set_custom_filter(language_in_tweet) 109 | return cp 110 | 111 | def exclude_retweets(self): 112 | def tweet_is_not_retweet(tweet): 113 | return 'retweeted_status' not in tweet 114 | cp = copy.deepcopy(self) 115 | cp.collection.set_custom_filter(tweet_is_not_retweet) 116 | return cp 117 | 118 | def get_retweets(self): 119 | def tweet_is_retweet(tweet): 120 | return 'retweeted_status' in tweet 121 | cp = copy.deepcopy(self) 122 | cp.collection.set_custom_filter(tweet_is_retweet) 123 | return cp 124 | 125 | def user_location_contains(self, *args): 126 | def user_has_location(tweet): 127 | return tweet['user']['location'] and any([place_term in tweet['user']['location'] for place_term in args]) 128 | cp = copy.deepcopy(self) 129 | cp.collection.set_custom_filter(user_has_location) 130 | return cp 131 | 132 | def user_description_contains(self, *args): 133 | def user_description_contains_terms(tweet): 134 | return tweet['user']['description'] and any([d_term in tweet['user']['description'] for d_term in args]) 135 | cp = copy.deepcopy(self) 136 | cp.collection.set_custom_filter(user_description_contains_terms) 137 | return cp 138 | 139 | def user_id_is(self, *args): 140 | def user_id_created_tweet(tweet): 141 | return tweet['user']['id'] and any([u_id == tweet['user']['id'] for u_id in args]) 142 | cp = copy.deepcopy(self) 143 | cp.collection.set_custom_filter(user_id_created_tweet) 144 | return cp 145 | 146 | def get_geo_enabled(self): 147 | def geo_enabled_filter(tweet): 148 | return ("coordinates" in tweet 149 | and tweet["coordinates"] is not None 150 | and "coordinates" in tweet["coordinates"]) 151 | cp = copy.deepcopy(self) 152 | cp.collection.set_custom_filter(geo_enabled_filter) 153 | return cp 154 | 155 | def get_non_geo_enabled(self): 156 | def non_geo_enabled_filter(tweet): 157 | return ('coordinates' not in tweet or 158 | tweet['coordinates'] is None or 159 | 'coordinates' not in tweet['coordinates']) 160 | cp = 
copy.deepcopy(self) 161 | cp.collection.set_custom_filter(non_geo_enabled_filter) 162 | return cp 163 | 164 | def within_geobox(self, sw_lon, sw_lat, ne_lon, ne_lat): 165 | def tweet_is_in_geobox(tweet): 166 | if tweet['coordinates'] and tweet['coordinates']['coordinates']: 167 | coords = tweet['coordinates']['coordinates'] 168 | return coords[0] > float(sw_lon) and coords[0] < float(ne_lon) and coords[1] > float(sw_lat) and coords[1] < float(ne_lat) 169 | return False 170 | cp = copy.deepcopy(self) 171 | cp.collection.set_custom_filter(tweet_is_in_geobox) 172 | return cp 173 | 174 | def place_name_contains_country(self, *args): 175 | def place_name_contains_terms(tweet): 176 | return tweet['place'] and any([d_term in tweet['place']['country'] for d_term in args]) 177 | cp = copy.deepcopy(self) 178 | cp.collection.set_custom_filter(place_name_contains_terms) 179 | return cp 180 | 181 | def get_top_entities(self, requested_entities): 182 | returndict = {} 183 | returnstructure = {} 184 | tweet_parser = smappdragon.TweetParser() 185 | #init dempty dict for all entity types 186 | for entity_type in requested_entities: 187 | returndict[entity_type] = {} 188 | 189 | for tweet in self.collection.get_iterator(): 190 | for entity_type in requested_entities: 191 | for entity in tweet_parser.get_entity(entity_type, tweet): 192 | if entity_type == 'user_mentions': 193 | entity_value = tweet_parser.get_entity_field('id_str', entity) 194 | elif entity_type == 'hashtags' or entity_type == 'symbols': 195 | entity_value = tweet_parser.get_entity_field('text', entity) 196 | else: 197 | entity_value = tweet_parser.get_entity_field('url', entity) 198 | 199 | if entity_value in returndict[entity_type]: 200 | returndict[entity_type][entity_value] += 1 201 | else: 202 | returndict[entity_type][entity_value] = 1 203 | 204 | for entity_type in returndict: 205 | returnstructure[entity_type] = {} 206 | if len(returndict[entity_type]) > 0: 207 | sorted_list = sorted(returndict[entity_type].items(), key=operator.itemgetter(1), reverse=True) 208 | # if the user put in 0 return all entites 209 | # otherwise slice the array and return the 210 | # number of top things they asked for 211 | # if the list is too short throw in None 212 | if requested_entities[entity_type] == 0: 213 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 214 | elif len(sorted_list) < requested_entities[entity_type]: 215 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 216 | for i in range(0, requested_entities[entity_type]-len(sorted_list)): 217 | returnstructure[entity_type][i] = None 218 | else: 219 | returnstructure[entity_type] = { \ 220 | name: count for name, count in sorted_list[0:requested_entities[entity_type]] \ 221 | } 222 | return returnstructure 223 | 224 | def limit_number_of_tweets(self, limit): 225 | cp = copy.deepcopy(self) 226 | cp.collection.set_limit(limit) 227 | return cp 228 | 229 | def dump_to_bson(self, output_file): 230 | filehandle = open(output_file, 'ab+') 231 | for tweet in self.collection.get_iterator(): 232 | filehandle.write(BSON.encode(tweet)) 233 | filehandle.close() 234 | 235 | def dump_to_json(self, output_file): 236 | filehandle = open(output_file, 'a') 237 | for tweet in self.collection.get_iterator(): 238 | filehandle.write(json_util.dumps(tweet)+'\n') 239 | filehandle.close() 240 | 241 | def dump_to_csv(self, output_file, input_fields, write_header=True, top_level=False): 242 | filehandle = open(output_file, 'a', encoding='utf-8') 243 | writer = 
csv.writer(filehandle) 244 | if write_header: 245 | writer.writerow(input_fields) 246 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 247 | 248 | for tweet in self.collection.get_iterator(): 249 | if top_level: 250 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 251 | else: 252 | ret = tweet_parser.parse_columns_from_tweet(tweet,input_fields) 253 | ret_values = [col_val[1] for col_val in ret] 254 | writer.writerow(ret_values) 255 | filehandle.close() 256 | 257 | def dump_to_sqlite_db(self, output_file, input_fields, top_level=False): 258 | def replace_none(s): 259 | if s is None: 260 | return 'NULL' 261 | return s 262 | 263 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 264 | column_str = ','.join([column for column in input_fields]).replace('.','__') 265 | question_marks = ','.join(['?' for column in input_fields]) 266 | 267 | con = sqlite3.connect(output_file) 268 | cur = con.cursor() 269 | cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str)) 270 | 271 | insert_list = [] 272 | # batch insert if more than 10k tweets 273 | for tweet in self.collection.get_iterator(): 274 | if top_level: 275 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 276 | else: 277 | ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields) 278 | row = [replace_none(col_val[1]) for col_val in ret] 279 | insert_list.append(tuple(row)) 280 | if (len(insert_list) % 10000) == 0: 281 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 282 | con.commit() 283 | insert_list = [] 284 | if len(insert_list) < 10000: 285 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 286 | con.commit() 287 | con.close() 288 | 289 | def get_top_hashtags(self, num_top): 290 | return self.get_top_entities({'hashtags':num_top}) 291 | 292 | def get_top_urls(self, num_top): 293 | return self.get_top_entities({'urls':num_top}) 294 | 295 | def get_top_mentions(self, num_top): 296 | return self.get_top_entities({'user_mentions':num_top}) 297 | 298 | def get_top_media(self, num_top): 299 | return self.get_top_entities({'media':num_top}) 300 | 301 | def get_top_symbols(self, num_top): 302 | return self.get_top_entities({'symbols':num_top}) 303 | 304 | def get_top_terms(self, num_top, stop_words=None): 305 | term_counts = {} 306 | if not stop_words: 307 | stop_words = get_stop_words('en') 308 | for tweet in self.collection.get_iterator(): 309 | split_tweet = tweet['text'].split() 310 | for tweet_token in split_tweet: 311 | if tweet_token not in stop_words: 312 | term_counts[tweet_token] = 0 if tweet_token not in term_counts else term_counts[tweet_token]+1 313 | sorted_counts = sorted(term_counts.items(), key=operator.itemgetter(1), reverse=True)[:num_top] 314 | return_counts = {} 315 | for k, v in sorted_counts: 316 | return_counts[k] = v 317 | return return_counts 318 | 319 | def sample(self, k): 320 | ''' 321 | this method is especially troublesome 322 | i do not reccommend making any changes to it 323 | you may notice it uplicates code fro smappdragon 324 | there is no way around this as far as i can tell 325 | it really might screw up a lot of stuff, stip tweets 326 | has been purposely omitted as it isnt supported in pysmap 327 | ''' 328 | def new_get_iterator(): 329 | tweet_parser = smappdragon.TweetParser() 330 | it = iter(self.collection.get_iterator()) 331 | sample = list(itertools.islice(it, k)) 332 | random.shuffle(sample) 333 
| for i, item in enumerate(it, start=k+1): 334 | j = random.randrange(i) 335 | if j < k: 336 | sample[j] = item 337 | for tweet in sample: 338 | if self.collection.limit != 0 and self.collection.limit <= count: 339 | return 340 | elif tweet_parser.tweet_passes_filter(self.collection.filter, tweet) \ 341 | and tweet_parser.tweet_passes_custom_filter_list(self.collection.custom_filters, tweet): 342 | yield tweet 343 | cp = copy.deepcopy(self) 344 | cp.collection.get_iterator = new_get_iterator 345 | return cp 346 | 347 | ''' 348 | author @yvan 349 | for a lower level set of tools see: https://github.com/SMAPPNYU/smappdragon 350 | ''' -------------------------------------------------------------------------------- /pysmap/twitterutil/smapp_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import csv 4 | import abc 5 | import copy 6 | import glob 7 | import random 8 | import sqlite3 9 | import pymongo 10 | import operator 11 | import itertools 12 | import smappdragon 13 | 14 | from datetime import datetime 15 | from bson import BSON, json_util 16 | from pysmap.twitterutil.smapp_collection import SmappCollection 17 | from langdetect import detect, lang_detect_exception, DetectorFactory 18 | from stop_words import get_stop_words 19 | 20 | class SmappDataset(object): 21 | def __init__(self, *args, **kwargs): 22 | input_servers_ports = {} 23 | self.collections = [] 24 | for input_list_or_datasource in args: 25 | if type(input_list_or_datasource) is SmappCollection: 26 | self.collections.append(input_list_or_datasource.collection) 27 | elif type(input_list_or_datasource) is type(self): 28 | self.collections.extend(input_list_or_datasource.collections) 29 | else: 30 | if input_list_or_datasource[0] == 'bson': 31 | if 'file_pattern' == input_list_or_datasource[1]: 32 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 33 | self.collections.append(smappdragon.BsonCollection(path)) 34 | else: 35 | self.collections.append(smappdragon.BsonCollection(input_list_or_datasource[1])) 36 | elif input_list_or_datasource[0] == 'json': 37 | if 'file_pattern' == input_list_or_datasource[1]: 38 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 39 | self.collections.append(smappdragon.JsonCollection(path)) 40 | else: 41 | self.collections.append(smappdragon.JsonCollection(input_list_or_datasource[1])) 42 | elif input_list_or_datasource[0] == 'csv': 43 | if 'file_pattern' == input_list_or_datasource[1]: 44 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 45 | self.collections.append(smappdragon.CsvCollection(path)) 46 | else: 47 | self.collections.append(smappdragon.CsvCollection(input_list_or_datasource[1])) 48 | elif input_list_or_datasource[0] == 'mongo': 49 | host_port_key = input_list_or_datasource[1]+str(input_list_or_datasource[2]) 50 | if host_port_key not in input_servers_ports: 51 | new_connection = pymongo.MongoClient(input_list_or_datasource[1], int(input_list_or_datasource[2])) 52 | input_servers_ports[host_port_key] = new_connection 53 | if 'database_regex' in kwargs or 'collection_regex' in kwargs: 54 | mongo = pymongo.MongoClient(input_list_or_datasource[1], int(input_list_or_datasource[2])) 55 | if 'database_regex' in kwargs: 56 | db_regex = re.compile(kwargs['database_regex']) 57 | matched_dbs = [match.group(1) for db_name in mongo.database_names() for match in [db_regex.search(db_name)] if match] 58 | else: 59 | matched_dbs = [input_list_or_datasource[5]] 
60 | 61 | for matched_db in matched_dbs: 62 | if 'collection_regex' in kwargs: 63 | collection_regex = re.compile(kwargs['collection_regex']) 64 | matched_collections = [match.group(1) for collection_name in mongo[matched_db].collection_names() for match in [collection_regex.search(collection_name)] if match] 65 | else: 66 | if len(input_list_or_datasource) > 6: 67 | matched_collections = [input_list_or_datasource[6]] 68 | else: 69 | matched_collections = [input_list_or_datasource[5]] 70 | for matched_collection in matched_collections: 71 | self.collections.append(smappdragon.MongoCollection( 72 | input_list_or_datasource[3], 73 | input_list_or_datasource[4], 74 | matched_db, 75 | matched_collection, 76 | passed_mongo=input_servers_ports[input_list_or_datasource[1]+str(input_list_or_datasource[2])] 77 | )) 78 | else: 79 | self.collections.append(smappdragon.MongoCollection( 80 | input_list_or_datasource[3], 81 | input_list_or_datasource[4], 82 | input_list_or_datasource[5], 83 | input_list_or_datasource[6], 84 | passed_mongo=input_servers_ports[input_list_or_datasource[1]+str(input_list_or_datasource[2])] 85 | )) 86 | else: 87 | raise IOError('Could not find your input: {}, it\'s mispelled or doesn\'t exist.'.format(input_list_or_datasource)) 88 | 89 | # simple helper method for getting the iterators out 90 | # of all collections in a SmappDataset, sample overrides 91 | # this method 92 | def get_collection_iterators(self): 93 | return itertools.chain(*[collection.get_iterator() for collection in self.collections]) 94 | 95 | # helper applies filters to all collections in dataset 96 | def apply_filter_to_collections(self, filter_to_set): 97 | self.collections = [collection.set_custom_filter(filter_to_set) for collection in self.collections] 98 | 99 | def __iter__(self): 100 | for tweet in self.get_collection_iterators(): 101 | yield tweet 102 | 103 | def set_custom_filter(self, custom_filter): 104 | cp = copy.deepcopy(self) 105 | cp.apply_filter_to_collections(custom_filter) 106 | return cp 107 | 108 | def get_tweet_texts(self): 109 | for tweet in self.get_collection_iterators(): 110 | yield tweet['text'] 111 | 112 | def count_tweets(self): 113 | return sum(1 for tweet in self.get_collection_iterators()) 114 | 115 | def count_tweet_terms(self, *args): 116 | def tweet_contains_terms(tweet): 117 | return any([term in tweet['text'] for term in args]) 118 | cp = copy.deepcopy(self) 119 | cp.apply_filter_to_collections(tweet_contains_terms) 120 | return sum(1 for tweet in cp.get_collection_iterators()) 121 | 122 | def get_tweets_containing(self, *args): 123 | def tweet_contains_terms(tweet): 124 | return any([term in tweet['text'] for term in args]) 125 | cp = copy.deepcopy(self) 126 | cp.apply_filter_to_collections(tweet_contains_terms) 127 | return cp 128 | 129 | def get_date_range(self, start, end): 130 | if type(start) is not datetime or type(end) is not datetime: 131 | raise ValueError('inputs to date_range must be python datetime.date objects') 132 | def tweet_is_in_date_range(tweet): 133 | return (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') >= start) and (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') < end) 134 | cp = copy.deepcopy(self) 135 | cp.apply_filter_to_collections(tweet_is_in_date_range) 136 | return cp 137 | 138 | def find_date_range(self): 139 | date_min = datetime.max 140 | date_max = datetime.min 141 | for tweet in self.get_collection_iterators(): 142 | date_to_process = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S 
+0000 %Y') 143 | if date_to_process <= date_min: 144 | date_min = date_to_process 145 | if date_to_process >= date_max: 146 | date_max = date_to_process 147 | return {"date_min":date_min,"date_max":date_max} 148 | 149 | def tweet_language_is(self, *args): 150 | def language_in_tweet(tweet): 151 | return any([language_code in tweet['lang'] for language_code in args]) 152 | cp = copy.deepcopy(self) 153 | cp.apply_filter_to_collections(language_in_tweet) 154 | return cp 155 | 156 | def detect_tweet_language(self, *args): 157 | DetectorFactory.seed = 0 158 | def language_in_tweet(tweet): 159 | detected_lang = None 160 | try: 161 | detected_lang = detect(tweet['text']) 162 | except lang_detect_exception.LangDetectException: 163 | pass 164 | return any([detected_lang in args]) 165 | cp = copy.deepcopy(self) 166 | cp.apply_filter_to_collections(language_in_tweet) 167 | return cp 168 | 169 | def user_language_is(self, *args): 170 | def language_in_tweet(tweet): 171 | return any([language_code in tweet['user']['lang'] for language_code in args]) 172 | cp = copy.deepcopy(self) 173 | cp.apply_filter_to_collections(language_in_tweet) 174 | return cp 175 | 176 | def exclude_retweets(self): 177 | def tweet_is_not_retweet(tweet): 178 | return 'retweeted_status' not in tweet 179 | cp = copy.deepcopy(self) 180 | cp.apply_filter_to_collections(tweet_is_not_retweet) 181 | return cp 182 | 183 | def get_retweets(self): 184 | def tweet_is_retweet(tweet): 185 | return 'retweeted_status' in tweet 186 | cp = copy.deepcopy(self) 187 | cp.apply_filter_to_collections(tweet_is_retweet) 188 | return cp 189 | 190 | def user_location_contains(self, *args): 191 | def user_has_location(tweet): 192 | return tweet['user']['location'] and any([place_term in tweet['user']['location'] for place_term in args]) 193 | cp = copy.deepcopy(self) 194 | cp.apply_filter_to_collections(user_has_location) 195 | return cp 196 | 197 | def user_description_contains(self, *args): 198 | def user_description_contains_terms(tweet): 199 | return tweet['user']['description'] and any([d_term in tweet['user']['description'] for d_term in args]) 200 | cp = copy.deepcopy(self) 201 | cp.apply_filter_to_collections(user_description_contains_terms) 202 | return cp 203 | 204 | def user_id_is(self, *args): 205 | def user_id_created_tweet(tweet): 206 | return tweet['user']['id'] and any([u_id == tweet['user']['id'] for u_id in args]) 207 | cp = copy.deepcopy(self) 208 | cp.apply_filter_to_collections(user_id_created_tweet) 209 | return cp 210 | 211 | def get_geo_enabled(self): 212 | def geo_enabled_filter(tweet): 213 | return ("coordinates" in tweet 214 | and tweet["coordinates"] is not None 215 | and "coordinates" in tweet["coordinates"]) 216 | cp = copy.deepcopy(self) 217 | cp.apply_filter_to_collections(geo_enabled_filter) 218 | return cp 219 | 220 | def get_non_geo_enabled(self): 221 | def non_geo_enabled_filter(tweet): 222 | return ('coordinates' not in tweet or 223 | tweet['coordinates'] is None or 224 | 'coordinates' not in tweet['coordinates']) 225 | cp = copy.deepcopy(self) 226 | cp.apply_filter_to_collections(non_geo_enabled_filter) 227 | return cp 228 | 229 | def within_geobox(self, sw_lon, sw_lat, ne_lon, ne_lat): 230 | def tweet_is_in_geobox(tweet): 231 | if tweet['coordinates'] and tweet['coordinates']['coordinates']: 232 | coords = tweet['coordinates']['coordinates'] 233 | return coords[0] > float(sw_lon) and coords[0] < float(ne_lon) and coords[1] > float(sw_lat) and coords[1] < float(ne_lat) 234 | return False 235 | cp = copy.deepcopy(self) 
236 | cp.apply_filter_to_collections(tweet_is_in_geobox) 237 | return cp 238 | 239 | def place_name_contains_country(self, *args): 240 | def place_name_contains_terms(tweet): 241 | return tweet['place'] and any([d_term in tweet['place']['country'] for d_term in args]) 242 | cp = copy.deepcopy(self) 243 | cp.apply_filter_to_collections(place_name_contains_terms) 244 | return cp 245 | 246 | 247 | def get_top_entities(self, requested_entities): 248 | returndict = {} 249 | returnstructure = {} 250 | tweet_parser = smappdragon.TweetParser() 251 | #init dempty dict for all entity types 252 | for entity_type in requested_entities: 253 | returndict[entity_type] = {} 254 | 255 | for tweet in self.get_collection_iterators(): 256 | for entity_type in requested_entities: 257 | for entity in tweet_parser.get_entity(entity_type, tweet): 258 | if entity_type == 'user_mentions': 259 | entity_value = tweet_parser.get_entity_field('id_str', entity) 260 | elif entity_type == 'hashtags' or entity_type == 'symbols': 261 | entity_value = tweet_parser.get_entity_field('text', entity) 262 | else: 263 | entity_value = tweet_parser.get_entity_field('url', entity) 264 | 265 | if entity_value in returndict[entity_type]: 266 | returndict[entity_type][entity_value] += 1 267 | else: 268 | returndict[entity_type][entity_value] = 1 269 | 270 | for entity_type in returndict: 271 | returnstructure[entity_type] = {} 272 | if len(returndict[entity_type]) > 0: 273 | sorted_list = sorted(returndict[entity_type].items(), key=operator.itemgetter(1), reverse=True) 274 | # if the user put in 0 return all entites 275 | # otherwise slice the array and return the 276 | # number of top things they asked for 277 | # if the list is too short throw in None 278 | if requested_entities[entity_type] == 0: 279 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 280 | elif len(sorted_list) < requested_entities[entity_type]: 281 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 282 | for i in range(0, requested_entities[entity_type]-len(sorted_list)): 283 | returnstructure[entity_type][i] = None 284 | else: 285 | returnstructure[entity_type] = { \ 286 | name: count for name, count in sorted_list[0:requested_entities[entity_type]] \ 287 | } 288 | return returnstructure 289 | 290 | def limit_number_of_tweets(self, limit): 291 | cp = copy.deepcopy(self) 292 | cp.collections = [collection.set_limit(limit) for collection in cp.collections] 293 | return cp 294 | 295 | def dump_to_bson(self, output_file, num_files=1): 296 | filehandles = [None]*num_files 297 | filename, file_extension = output_file.split(os.extsep, 1) 298 | 299 | # open all filehandles 300 | if num_files == 1: 301 | filehandles[0] = open(output_file, 'ab+') 302 | else: 303 | # open all filehandles 304 | filename, file_extension = output_file.split(os.extsep, 1) 305 | for i in range(num_files): 306 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'ab+') 307 | 308 | # write the tweets as evenly 309 | # as possible in each file 310 | tracker = 0 311 | for tweet in self.get_collection_iterators(): 312 | filehandles[tracker].write(BSON.encode(tweet)) 313 | if tracker == num_files-1: 314 | tracker = 0 315 | else: 316 | tracker += 1 317 | 318 | # close all filehandles 319 | for fh in filehandles: 320 | fh.close() 321 | 322 | def dump_to_json(self, output_file, num_files=1): 323 | filehandles = [None]*num_files 324 | 325 | if num_files == 1: 326 | filehandles[0] = open(output_file, 'a') 327 | else: 328 | # open all 
filehandles 329 | filename, file_extension = output_file.split(os.extsep, 1) 330 | for i in range(num_files): 331 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'a') 332 | 333 | # write the tweets as evenly 334 | # as possible in each file 335 | tracker = 0 336 | for tweet in self.get_collection_iterators(): 337 | filehandles[tracker].write(json_util.dumps(tweet)+'\n') 338 | if tracker == num_files-1: 339 | tracker = 0 340 | else: 341 | tracker += 1 342 | 343 | # close all filehandles 344 | for fh in filehandles: 345 | fh.close() 346 | 347 | def dump_to_csv(self, output_file, input_fields, write_header=True, top_level=False, num_files=1): 348 | filehandles = [None]*num_files 349 | writers = [None]*num_files 350 | 351 | if num_files == 1: 352 | filehandles[0] = open(output_file, 'a', encoding='utf-8') 353 | writers[0] = csv.writer(filehandles[0]) 354 | if write_header: 355 | writers[0].writerow(input_fields) 356 | else: 357 | # open all filehandles 358 | filename, file_extension = output_file.split(os.extsep, 1) 359 | for i in range(num_files): 360 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'a') 361 | writers[i] = csv.writer(filehandles[i]) 362 | if write_header: 363 | writers[i].writerow(input_fields) 364 | 365 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 366 | 367 | # write the tweets as evenly 368 | # as possible in each file 369 | tracker = 0 370 | for tweet in self.get_collection_iterators(): 371 | if top_level: 372 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 373 | else: 374 | ret = tweet_parser.parse_columns_from_tweet(tweet,input_fields) 375 | ret_values = [col_val[1] for col_val in ret] 376 | writers[tracker].writerow(ret_values) 377 | 378 | if tracker == num_files-1: 379 | tracker = 0 380 | else: 381 | tracker += 1 382 | 383 | # close all filehandles 384 | for fh in filehandles: 385 | fh.close() 386 | 387 | def dump_to_sqlite_db(self, output_file, input_fields, top_level=False, num_files=1): 388 | def replace_none(s): 389 | if s is None: 390 | return 'NULL' 391 | return s 392 | cons = [None]*num_files 393 | cursors = [None]*num_files 394 | 395 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 396 | column_str = ','.join([column for column in input_fields]).replace('.','__') 397 | question_marks = ','.join(['?' 
for column in input_fields]) 398 | 399 | con = sqlite3.connect(output_file) 400 | cur = con.cursor() 401 | cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str)) 402 | 403 | insert_list = [] 404 | # batch insert if more than 10k tweets 405 | for tweet in self.get_collection_iterators(): 406 | if top_level: 407 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 408 | else: 409 | ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields) 410 | row = [replace_none(col_val[1]) for col_val in ret] 411 | insert_list.append(tuple(row)) 412 | if (len(insert_list) % 10000) == 0: 413 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 414 | con.commit() 415 | insert_list = [] 416 | if len(insert_list) < 10000: 417 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 418 | con.commit() 419 | con.close() 420 | 421 | def get_top_hashtags(self, num_top): 422 | return self.get_top_entities({'hashtags':num_top}) 423 | 424 | def get_top_urls(self, num_top): 425 | return self.get_top_entities({'urls':num_top}) 426 | 427 | def get_top_mentions(self, num_top): 428 | return self.get_top_entities({'user_mentions':num_top}) 429 | 430 | def get_top_media(self, num_top): 431 | return self.get_top_entities({'media':num_top}) 432 | 433 | def get_top_symbols(self, num_top): 434 | return self.get_top_entities({'symbols':num_top}) 435 | 436 | def get_top_terms(self, num_top, stop_words=None): 437 | term_counts = {} 438 | if not stop_words: 439 | stop_words = get_stop_words('en') 440 | for tweet in self.get_collection_iterators(): 441 | split_tweet = tweet['text'].split() 442 | for tweet_token in split_tweet: 443 | if tweet_token not in stop_words: 444 | term_counts[tweet_token] = 0 if tweet_token not in term_counts else term_counts[tweet_token]+1 445 | sorted_counts = sorted(term_counts.items(), key=operator.itemgetter(1), reverse=True)[:num_top] 446 | return_counts = {} 447 | for k, v in sorted_counts: 448 | return_counts[k] = v 449 | return return_counts 450 | 451 | def sample(self, k): 452 | ''' 453 | this method is especially troublesome 454 | i do not reccommend making any changes to it 455 | you may notice it uplicates code fro smappdragon 456 | there is no way around this as far as i can tell 457 | it really might screw up a lot of stuff, stip tweets 458 | has been purposely omitted as it isnt supported in pysmap 459 | ''' 460 | def new_get_iterators(): 461 | tweet_parser = smappdragon.TweetParser() 462 | it = iter(self.get_collection_iterators()) 463 | sample = list(itertools.islice(it, k)) 464 | random.shuffle(sample) 465 | for i, item in enumerate(it, start=k+1): 466 | j = random.randrange(i) 467 | if j < k: 468 | sample[j] = item 469 | for tweet in sample: 470 | if all([collection.limit != 0 and collection.limit <= count for collection in self.collections]): 471 | return 472 | elif all([tweet_parser.tweet_passes_filter(collection.filter, tweet) \ 473 | and tweet_parser.tweet_passes_custom_filter_list(collection.custom_filters, tweet) for collection in self.collections]): 474 | yield tweet 475 | 476 | cp = copy.deepcopy(self) 477 | cp.get_collection_iterators = new_get_iterators 478 | return cp 479 | 480 | ''' 481 | author @yvan 482 | for a lower level set of tools see: https://github.com/SMAPPNYU/smappdragon 483 | ''' -------------------------------------------------------------------------------- /pysmap/viz/__init__.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from . import plots 6 | __all__ = ['plots'] -------------------------------------------------------------------------------- /pysmap/viz/networks.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from smappdragon import TweetParser 3 | 4 | ''' 5 | generate a retweet graph from the selection of tweets. 6 | ''' 7 | def retweet_network(collection, tweet_fields, user_fields): 8 | def replace_none(s): 9 | if s is None: 10 | return 'NULL' 11 | return s 12 | 13 | tp = TweetParser() 14 | dg = nx.DiGraph(name="retweet graph") 15 | 16 | for tweet in collection: 17 | 18 | um_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet['user'], user_fields)} 19 | t_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet, tweet_fields)} 20 | 21 | if tweet['user']['id_str'] not in dg: 22 | dg.add_node(tweet['user']['id_str'], attr_dict=um_dict) 23 | if 'retweeted_status' in tweet: 24 | rtu_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet['retweeted_status']['user'], user_fields)} 25 | dg.add_node(tweet['retweeted_status']['user']['id_str'], attr_dict=rtu_dict) 26 | dg.add_edge(tweet['user']['id_str'], tweet['retweeted_status']['user']['id_str'], attr_dict=t_dict) 27 | return dg -------------------------------------------------------------------------------- /pysmap/viz/plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | plt.style.use('seaborn') 3 | 4 | from datetime import datetime, timedelta 5 | from collections import OrderedDict 6 | from smappdragon import TweetParser 7 | 8 | ''' 9 | this gets tweets by timeslice 10 | collection is a SmappCollection 11 | field is the field in a tweet on which you want to compare 12 | values_to_match are the values you want that field to match 13 | filter can be any extra filter like a custom smappdragon filter 14 | that can be applied to a tweet to make your graph 15 | period_type is the grouping you want, by day, by week, by month, etc 16 | start and end is the total date range you want to be queried, 17 | later ell do multipe fields 18 | output_path 19 | ''' 20 | def bar_graph_tweet_field_grouped_by_period(collection, field, values_to_match, custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title): 21 | if period_type == 'hours': 22 | time_delta = timedelta(hours=1) 23 | elif period_type == 'days': 24 | time_delta = timedelta(days=1) 25 | elif period_type == 'weeks': 26 | time_delta = timedelta(weeks=1) 27 | elif period_type == 'months': 28 | time_delta = timedelta(weeks=4) 29 | elif period_type == 'years': 30 | time_delta = timedelta(weeks=52) 31 | 32 | # calculate how many periods we need 33 | duration = end - start 34 | periods = round(duration // time_delta) 35 | 36 | # setup a dictionary 37 | # avoid having an empty dict 38 | field_counts = {} 39 | if periods <= 0: 40 | field_counts[0] = 0 41 | else: 42 | for period in range(periods): 43 | field_counts[period] = 0 44 | 45 | # split the input field for compound fields 46 | split_field = field.split('.') 47 | tweet_parser = TweetParser() 48 | 49 | for tweet in collection.get_date_range(start, end): 50 | flattened_tweet = tweet_parser.flatten_dict(tweet) 51 | 52 | for tweet_tuple in flattened_tweet: 53 | if tweet_tuple[0] == split_field: 
54 | value = tweet_tuple[1] 55 | break 56 | 57 | # empty fild value matches all tweets, then only custom filter can be used to count 58 | if ((field == '') or (value in values_to_match)) and custom_filter(tweet): 59 | tweet_time = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') 60 | period = round((tweet_time - start) // time_delta) 61 | field_counts[period if period > 0 else 0] += 1 62 | 63 | data = { 64 | 'period':[key for key in field_counts.keys()], 65 | 'tweets':[val for val in field_counts.values()] 66 | } 67 | 68 | plt.plot(data['period'], data['tweets']) 69 | plt.xlabel(x_label) 70 | plt.ylabel(y_label) 71 | plt.title(graph_title) 72 | plt.savefig(output_path) 73 | 74 | def bar_graph_languages(collection, langs_to_match, period_type, start, end, output_path, x_label, y_label, graph_title): 75 | bar_graph_tweet_field_grouped_by_period(collection, 'lang', langs_to_match, lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 76 | 77 | def bar_graph_user_languages(collection, langs_to_match, period_type, start, end, output_path, x_label, y_label, graph_title): 78 | bar_graph_tweet_field_grouped_by_period(collection, 'user.lang', langs_to_match, lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 79 | 80 | def bar_graph_tweets(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 81 | bar_graph_tweet_field_grouped_by_period(collection, '', [], lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 82 | 83 | def bar_graph_tweets_with_urls(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 84 | def custom_filter(tweet): 85 | if len(tweet['entities']['urls']) > 0: 86 | return True 87 | return False 88 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 89 | 90 | def bar_graph_tweets_with_media(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 91 | def custom_filter(tweet): 92 | if len(tweet['entities']['media']) > 0: 93 | return True 94 | return False 95 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 96 | 97 | def bar_graph_tweets_with_mentions(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 98 | def custom_filter(tweet): 99 | if len(tweet['entities']['user_mentions']) > 0: 100 | return True 101 | return False 102 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 103 | 104 | def bar_graph_tweets_with_hashtags(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 105 | def custom_filter(tweet): 106 | if len(tweet['entities']['hashtags']) > 0: 107 | return True 108 | return False 109 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 110 | 111 | def bar_graph_tweets_with_symbols(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 112 | def custom_filter(tweet): 113 | if len(tweet['entities']['symbols']) > 0: 114 | return True 115 | return False 116 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 117 | 118 | def 
bar_graph_tweets_with_retweets(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 119 | def custom_filter(tweet): 120 | if 'retweeted_status' in tweet: 121 | return True 122 | return False 123 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 124 | 125 | def bar_graph_tweets_with_locations(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 126 | def custom_filter(tweet): # keep tweets that carry location info 127 | if tweet.get('coordinates') is not None or tweet.get('place') is not None: 128 | return True 129 | return False 130 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 131 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup 3 | 4 | setup(name='pysmap', 5 | packages=['pysmap', 'pysmap.twitterutil', 'pysmap.viz', 'pysmap.mltools'], 6 | version='0.0.42', 7 | description='pysmap is a set of tools for working with twitter data', 8 | author='yvan', 9 | author_email='yns207@nyu.edu', 10 | url='https://github.com/SMAPPNYU/pysmap', 11 | keywords='twitter data tools pysmap', 12 | license='MIT', 13 | install_requires=[ 14 | 'smappdragon==0.0.43', 15 | 'stop-words>=2015.2.23.1', 16 | 'langdetect>=1.0.6', 17 | 'matplotlib>=2.0.0', 18 | 'pandas>=0.18.1', 19 | 'pymongo>=3.2.2', 20 | 'pytz>=2016.4', 21 | 'networkx>=1.11', 22 | 'keras>=2.0.8', 23 | 'opencv-python>=3.3.0.9', 24 | 'tensorflow>=1.3.0', 25 | 'h5py>=2.7.0' 26 | ] 27 | ) -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | from . 
import test_smapp_collection 5 | __all__ = ['test_tweet_parser', 'test_mongo_collection'] -------------------------------------------------------------------------------- /test/data/invalid.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SMAPPNYU/pysmap/eb871992f40c53125129535e871525d5623c8c2d/test/data/invalid.bson -------------------------------------------------------------------------------- /test/data/valid-single.bson.json: -------------------------------------------------------------------------------- 1 | {"_id":{"$oid":"5637c49e0651ef2dda8b5dfd"},"contributors":null,"truncated":false,"text":"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat","is_quote_status":false,"in_reply_to_status_id":null,"random_number":0.0009388446238663972,"id":{"$numberLong":"661275583813431296"},"favorite_count":0,"source":"\u003ca href=\"https://twitter.com/Col_Connaughton\" rel=\"nofollow\"\u003eColin's Autotweeterpro5.3\u003c/a\u003e","retweeted":false,"coordinates":null,"timestamp_ms":"1446495359744","entities":{"user_mentions":[],"symbols":[],"hashtags":[{"indices":[74,83],"text":"jadehelm"},{"indices":[84,98],"text":"newworldorder"},{"indices":[99,103],"text":"usa"},{"indices":[104,112],"text":"tyranny"},{"indices":[113,120],"text":"threat"}],"urls":[{"url":"https://t.co/VA4bQRudLt","indices":[50,73],"expanded_url":"https://www.youtube.com/watch?v=0nJqymxVpwc","display_url":"youtube.com/watch?v=0nJqym…"}]},"in_reply_to_screen_name":null,"id_str":"661275583813431296","retweet_count":0,"in_reply_to_user_id":null,"favorited":false,"timestamp":{"$date":"2015-11-02T20:15:59.000Z"},"user":{"follow_request_sent":null,"profile_use_background_image":true,"default_profile_image":false,"id":379851447,"verified":false,"profile_image_url_https":"https://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","followers_count":3159,"profile_sidebar_border_color":"C0DEED","id_str":"379851447","profile_background_color":"C0DEED","listed_count":401,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","utc_offset":0,"statuses_count":477638,"description":"#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. 
To stop my tweets, BLOCK or MUTE me","friends_count":2019,"location":"London UK","profile_link_color":"0084B4","profile_image_url":"http://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","following":null,"geo_enabled":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/379851447/1416509762","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"ISRAEL BOMBS BABIES","lang":"en","profile_background_tile":false,"favourites_count":15917,"screen_name":"Col_Connaughton","notifications":null,"url":null,"created_at":"Sun Sep 25 17:29:09 +0000 2011","contributors_enabled":false,"time_zone":"London","protected":false,"default_profile":true,"is_translator":false},"geo":null,"in_reply_to_user_id_str":null,"possibly_sensitive":true,"lang":"de","created_at":"Mon Nov 02 20:15:59 +0000 2015","filter_level":"low","in_reply_to_status_id_str":null,"place":null} 2 | -------------------------------------------------------------------------------- /test/data/valid.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SMAPPNYU/pysmap/eb871992f40c53125129535e871525d5623c8c2d/test/data/valid.bson -------------------------------------------------------------------------------- /test/data/valid.csv: -------------------------------------------------------------------------------- 1 | id_str,entities.hashtags.0,entities.hashtags.1,source,user.id,timestamp.$date,text 2 | 661275583813431296,"{""indices"": [74, 83], ""text"": ""jadehelm""}","{""indices"": [84, 98], ""text"": ""newworldorder""}","Colin's Autotweeterpro5.3",379851447,1446495359000,"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat" 3 | -------------------------------------------------------------------------------- /test/test_crowd_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os, sys, unittest, cv2, warnings 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap.mltools.crowd_model import CrowdModel 7 | from keras.applications.resnet50 import preprocess_input 8 | # from matplotlib.testing.decorators import image_comparison 9 | 10 | class TestCrowdModel(unittest.TestCase): 11 | def test_control(self): 12 | ''' 13 | a control test to make sure everything 14 | in the unittest framework is working 15 | ''' 16 | self.assertTrue(True) 17 | 18 | def test_model_loads_file(self): 19 | ''' 20 | test that the model loads a model 21 | file properly and produces a model object 22 | ''' 23 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 24 | assert type(cm) is CrowdModel 25 | 26 | def test_model_dl_file(self): 27 | ''' 28 | test that the model class can download a file 29 | from the server where we store model files 30 | ''' 31 | cm = CrowdModel(config['crowd']['dl_path'], dl=True, talk=True) 32 | statinfo = os.stat(config['crowd']['dl_path']) 33 | assert os.path.exists(config['crowd']['dl_path']) 34 | assert statinfo.st_size == 295489952 35 | assert type(cm) is CrowdModel 36 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 37 | preds = cm.predict_files(files) 38 | assert len(preds) > 0 39 | assert preds[0][0] > 0.9 # should be 0.997 40 | assert preds[1][0] < 0.1 # sould be 0.0034 41 | 42 | def test_model_predicts_imgs(self): 43 | ''' 44 | test that a model can make predictions from 45 | an array of imgs 
that have already been loaded 46 | ''' 47 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 48 | imgs = np.zeros((len(files),224,224,3)) 49 | for i, file in enumerate(files): 50 | img = cv2.imread(file).astype('float64') 51 | img = cv2.resize(img, (224,224)) 52 | imgs[i] = img 53 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 54 | preds = cm.predict_imgs(imgs) 55 | assert len(preds) > 0 56 | assert preds[0][0] > 0.9 # should be 0.997 57 | assert preds[1][0] < 0.1 # sould be 0.0034 58 | 59 | def test_model_predicts_files(self): 60 | ''' 61 | test that the model class cna predict from image files 62 | this loads the images, prepocesses then predicts 63 | ''' 64 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 65 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 66 | preds = cm.predict_files(files) 67 | assert len(preds) > 0 68 | assert preds[0][0] > 0.9 # should be 0.997 69 | assert preds[1][0] < 0.1 # sould be 0.00 70 | 71 | if __name__ == '__main__': 72 | warnings.filterwarnings("ignore") 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /test/test_networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import networkx as nx 4 | 5 | from datetime import datetime 6 | from test.config import config 7 | from pysmap import SmappCollection 8 | from pysmap import networks 9 | 10 | class TestNetworks(unittest.TestCase): 11 | def test_control(self): 12 | self.assertTrue(True) 13 | 14 | def test_make_retweet_network_graph(self): 15 | output_path = '{}/chart_tests/network-{}-retweet.graphml'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 16 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 17 | collection = SmappCollection('json', file_path) 18 | digraph = networks.retweet_network(collection, ['id_str', 'retweeted_status.id_str', 'created_at', 'text', 'lang'], ['id_str', 'screen_name', 'location', 'description']) 19 | nx.write_graphml(digraph, output_path) 20 | 21 | def test_empty_make_retweet_network_graph(self): 22 | output_path = '{}/chart_tests/network-{}-retweet-empty.graphml'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 23 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 24 | collection = SmappCollection('json', file_path) 25 | digraph = networks.retweet_network(collection, [], []) 26 | nx.write_graphml(digraph, output_path) -------------------------------------------------------------------------------- /test/test_plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappCollection 7 | from pysmap import plots 8 | 9 | class TestPlots(unittest.TestCase): 10 | def test_control(self): 11 | self.assertTrue(True) 12 | 13 | def test_tweet_field_grouped_by_timeslice_hours(self): 14 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 15 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 16 | collection = SmappCollection('json', file_path) 17 | def custom_filter(tweet): 18 | if '#JadeHelm' in tweet['text']: 19 | return True 20 | return False 21 | 
plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 22 | 23 | def test_tweet_field_grouped_by_timeslice_days(self): 24 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 25 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 26 | collection = SmappCollection('json', file_path) 27 | def custom_filter(tweet): 28 | return True 29 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 30 | 31 | def test_tweet_field_grouped_by_timeslice_weeks(self): 32 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 33 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 34 | collection = SmappCollection('json', file_path) 35 | def custom_filter(tweet): 36 | return True 37 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'weeks', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 38 | 39 | def test_tweet_field_grouped_by_timeslice_months(self): 40 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 41 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 42 | collection = SmappCollection('json', file_path) 43 | def custom_filter(tweet): 44 | return True 45 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 46 | 47 | def test_tweet_field_grouped_by_timeslice_years(self): 48 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 49 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 50 | collection = SmappCollection('json', file_path) 51 | def custom_filter(tweet): 52 | return True 53 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'years', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 54 | 55 | def test_tweet_field_grouped_by_timeslice_custom_filter(self): 56 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 57 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 58 | collection = SmappCollection('json', file_path) 59 | def custom_filter(tweet): 60 | if '#JadeHelm' in tweet['text']: 61 | return True 62 | return False 63 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 64 | 65 | def test_tweet_field_grouped_by_timeslice_single_level_field(self): 66 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 67 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 68 | collection = SmappCollection('json', file_path) 69 
| def custom_filter(tweet): 70 | return True 71 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'id_str', ['661283295670493185'], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 72 | 73 | def test_tweet_field_grouped_by_timeslice_compound_field(self): 74 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 75 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 76 | collection = SmappCollection('json', file_path) 77 | def custom_filter(tweet): 78 | return True 79 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'user.time_zone', ['Pacific Time (US & Canada)'], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 80 | -------------------------------------------------------------------------------- /test/test_smapp_collection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappCollection 7 | 8 | class TestSmappCollection(unittest.TestCase): 9 | def test_control(self): 10 | self.assertTrue(True) 11 | 12 | def test_smapp_bson_collection_iterates(self): 13 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 14 | collection = SmappCollection('bson', file_path) 15 | self.assertTrue(len(list(collection)) > 0) 16 | 17 | def test_smapp_json_collection_iterates(self): 18 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 19 | collection = SmappCollection('json', file_path) 20 | self.assertTrue(len(list(collection)) > 0) 21 | 22 | def test_smapp_csv_collection_iterates(self): 23 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 24 | collection = SmappCollection('csv', file_path) 25 | self.assertTrue(len(list(collection)) > 0) 26 | 27 | def test_limit_number_of_tweets(self): 28 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 29 | collection = SmappCollection('bson', file_path) 30 | self.assertTrue(len(list(collection.limit_number_of_tweets(100))) > 0) 31 | 32 | # def test_smapp_mongo_collection_iterates(self): 33 | # collection = SmappCollection('mongo', 34 | # config['mongo']['host'], 35 | # config['mongo']['port'], 36 | # config['mongo']['user'], 37 | # config['mongo']['password'], 38 | # config['mongo']['database'], 39 | # config['mongo']['collection']) 40 | # self.assertTrue(len(list(collection.limit_number_of_tweets(100))) > 0) 41 | 42 | def test_get_tweet_texts(self): 43 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 44 | collection = SmappCollection('bson', file_path) 45 | texts = [text for text in collection.limit_number_of_tweets(1).get_tweet_texts()] 46 | self.assertEqual(str, type(texts[0])) 47 | 48 | def test_count_tweet_terms(self): 49 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 50 | collection = SmappCollection('bson', file_path) 51 | count = collection.count_tweet_terms('jade') 52 | self.assertEqual(167, count) 53 | 54 | def test_count_tweet_terms_multiple(self): 55 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 
config['bson']['valid']) 56 | collection = SmappCollection('bson', file_path) 57 | count = collection.count_tweet_terms('jade', 'helm') 58 | self.assertEqual(176, count) 59 | 60 | def test_count_tweets(self): 61 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 62 | collection = SmappCollection('bson', file_path) 63 | count = collection.count_tweets() 64 | self.assertEqual(1187, count) 65 | 66 | def test_get_tweets_containing(self): 67 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 68 | collection = SmappCollection('bson', file_path) 69 | count = len([tweet for tweet in collection.get_tweets_containing('jade')]) 70 | self.assertEqual(167, count) 71 | 72 | def test_get_tweets_containing_multiple(self): 73 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 74 | collection = SmappCollection('bson', file_path) 75 | count = len([tweet for tweet in collection.get_tweets_containing('jade', 'helm')]) 76 | self.assertEqual(176, count) 77 | 78 | def test_get_date_range(self): 79 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 80 | collection = SmappCollection('bson', file_path) 81 | count = len([tweet for tweet in collection.get_date_range(datetime(2015,11,2), datetime(2015,11,3))]) 82 | self.assertEqual(26, count) 83 | 84 | def test_find_date_range(self): 85 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 86 | collection = SmappCollection('bson', file_path) 87 | range_obj = collection.find_date_range() 88 | self.assertEqual(datetime(2015, 11, 2, 19, 56, 33), range_obj['date_min']) 89 | self.assertEqual(datetime(2015, 11, 6, 21, 35, 54), range_obj['date_max']) 90 | 91 | def test_tweet_language_is(self): 92 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 93 | collection = SmappCollection('bson', file_path) 94 | count = len([tweet for tweet in collection.tweet_language_is('en')]) 95 | self.assertEqual(825, count) 96 | 97 | def test_detect_tweet_language(self): 98 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 99 | collection = SmappCollection('bson', file_path) 100 | count = len([tweet for tweet in collection.detect_tweet_language('en')]) 101 | self.assertEqual(907, count) 102 | 103 | def test_user_language_is(self): 104 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 105 | collection = SmappCollection('bson', file_path) 106 | count = len([tweet for tweet in collection.user_language_is('en')]) 107 | self.assertEqual(801, count) 108 | 109 | def test_exclude_retweets(self): 110 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 111 | collection = SmappCollection('bson', file_path) 112 | count = len([tweet for tweet in collection.exclude_retweets()]) 113 | self.assertEqual(682, count) 114 | 115 | def test_get_retweets(self): 116 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 117 | collection = SmappCollection('bson', file_path) 118 | count = len([tweet for tweet in collection.get_retweets()]) 119 | self.assertEqual(505, count) 120 | 121 | def test_user_location_contains(self): 122 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 123 | collection = 
SmappCollection('bson', file_path) 124 | count = len([tweet for tweet in collection.user_location_contains('TX')]) 125 | self.assertEqual(10, count) 126 | 127 | def test_user_description_contains(self): 128 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 129 | collection = SmappCollection('json', file_path) 130 | count = len([tweet for tweet in collection.user_description_contains('JESUS')]) 131 | self.assertEqual(15, count) 132 | 133 | def test_user_id_is(self): 134 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 135 | collection = SmappCollection('json', file_path) 136 | count = len([tweet for tweet in collection.user_id_is(379851447, 149751818)]) 137 | self.assertEqual(77, count) 138 | 139 | def test_place_name_contains_country(self): 140 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 141 | collection = SmappCollection('json', file_path) 142 | count = len([tweet for tweet in collection.place_name_contains_country('United States')]) 143 | self.assertEqual(6, count) 144 | 145 | def test_within_geobox(self): 146 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 147 | collection = SmappCollection('json', file_path) 148 | # geobox here is for us mountain time 149 | # i created a coordinate in our data file on the last object [-105.29, 40.33] 150 | # i also added one to the json that is outside of us mountain time [-123.007053, 44.824997] 151 | count = len([tweet for tweet in collection.within_geobox(-113.95, 28.81, -100.05, 48.87)]) 152 | self.assertEqual(1, count) 153 | 154 | def test_get_geo_enabled(self): 155 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 156 | collection = SmappCollection('bson', file_path) 157 | count = len([tweet for tweet in collection.get_geo_enabled()]) 158 | self.assertEqual(1, count) 159 | 160 | def test_get_non_geo_enabled(self): 161 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 162 | collection = SmappCollection('bson', file_path) 163 | count = len([tweet for tweet in collection.get_non_geo_enabled()]) 164 | self.assertEqual(1186, count) 165 | 166 | def test_dump_to_bson(self): 167 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 168 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 169 | 170 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 171 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 172 | collection.dump_to_bson(output_path) 173 | self.assertTrue(os.path.getsize(output_path) > 0) 174 | 175 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 176 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 177 | 178 | def test_dump_to_json(self): 179 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 180 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 181 | 182 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 183 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 184 | collection.dump_to_json(output_path) 185 | self.assertTrue(os.path.getsize(output_path) > 0) 
186 | 187 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 188 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 189 | 190 | def test_dump_to_csv(self): 191 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 192 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 193 | 194 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 195 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 196 | collection.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 197 | self.assertTrue(os.path.getsize(output_path) > 0) 198 | 199 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 200 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 201 | 202 | def test_dump_to_sqlite_db(self): 203 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 204 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 205 | 206 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db' 207 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 208 | collection.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 209 | self.assertTrue(os.path.getsize(output_path) > 0) 210 | 211 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 212 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 213 | 214 | def test_get_top_hashtags(self): 215 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 216 | collection = SmappCollection('bson', file_path) 217 | base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}} 218 | hashtags = collection.get_top_hashtags(5) 219 | self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys())) 220 | 221 | def test_get_top_urls(self): 222 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 223 | collection = SmappCollection('bson', file_path) 224 | urls = collection.get_top_urls(5) 225 | base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39, 'https://t.co/l9OEuvRlt8': 24, 'https://t.co/nkc4hnukLX': 21, 'https://t.co/rsNUItS48U': 60}} 226 | self.assertTrue(set(urls.keys()) == set(base_urls.keys())) 227 | 228 | def test_get_top_mentions(self): 229 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 230 | collection = SmappCollection('bson', file_path) 231 | top_mentions = collection.get_top_mentions(5) 232 | base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75, '1619936671': 41, '733417892': 121}} 233 | self.assertTrue(set(top_mentions.keys()) == set(base_top_mentions.keys())) 234 | 235 | def test_get_top_media(self): 236 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 237 | collection = SmappCollection('bson', file_path) 238 | top_media = collection.get_top_media(5) 239 | base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17, 'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14, 'https://t.co/ORaTXOM2oX': 55}} 240 | 
self.assertTrue(set(top_media.keys()) == set(base_top_media.keys())) 241 | 242 | def test_get_top_symbols(self): 243 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 244 | collection = SmappCollection('bson', file_path) 245 | top_symbols = collection.get_top_symbols(5) 246 | base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}} 247 | self.assertTrue(set(top_symbols.keys()) == set(base_top_symbols.keys())) 248 | 249 | def test_get_top_terms(self): 250 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 251 | collection = SmappCollection('bson', file_path) 252 | top_counts = collection.get_top_terms(10) 253 | base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173, 'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193} 254 | self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys())) 255 | 256 | def test_base_top_entities_returns_dict(self): 257 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 258 | collection = SmappCollection('bson', file_path) 259 | returndict = collection.get_top_entities({'hashtags':5}) 260 | self.assertTrue(isinstance(returndict, dict)) 261 | 262 | def test_base_top_entities_returns_hashtags(self): 263 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 264 | collection = SmappCollection('bson', file_path) 265 | returndict = collection.get_top_entities({'hashtags':5}) 266 | self.assertTrue('hashtags' in returndict) 267 | 268 | def test_base_top_entities_returns_hashtags_and_media(self): 269 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 270 | collection = SmappCollection('bson', file_path) 271 | returndict = collection.get_top_entities({'user_mentions':5, 'media':3}) 272 | self.assertTrue('user_mentions' in returndict and 'media' in returndict) 273 | 274 | def test_base_top_entities_returns_counts(self): 275 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 276 | collection = SmappCollection('bson', file_path) 277 | returndict = collection.get_top_entities({'urls':5, 'symbols':3}) 278 | if len(returndict['urls']) > 0: 279 | self.assertTrue(len(returndict['urls']) == 5) 280 | if len(returndict['symbols']) > 0: 281 | self.assertTrue(len(returndict['symbols']) == 3) 282 | 283 | def test_sample_returns_right_number_of_items(self): 284 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 285 | collection = SmappCollection('bson', file_path) 286 | sample_collection = collection.sample(10) 287 | self.assertEqual(10, len(list(sample_collection))) 288 | 289 | def test_sample_returns_dif_tweets_than_fist_10_tweets(self): 290 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 291 | collection_one = SmappCollection('bson', file_path) 292 | sample_tweets = list(collection_one.sample(10)) 293 | collection_two = SmappCollection('bson', file_path) 294 | first_ten_tweets = list(collection_two.limit_number_of_tweets(10)) 295 | self.assertNotEqual(sample_tweets, first_ten_tweets) 296 | 297 | def test_sample_chains_and_dumps(self): 298 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 299 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 300 | 301 | output_path 
= '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 302 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 303 | sample_tweets = collection.sample(10) 304 | sample_tweets.dump_to_json(output_path) 305 | self.assertTrue(os.path.getsize(output_path) > 0) 306 | with open(output_path) as f: 307 | self.assertEqual(10, len([line for line in f])) 308 | 309 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 310 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 311 | 312 | def test_set_custom_filter_properly_filters(self): 313 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 314 | collection_one = SmappCollection('bson', file_path) 315 | full_collection_len = len(list(collection_one)) 316 | def is_tweet_a_retweet(tweet): 317 | if 'retweeted' in tweet and tweet['retweeted']: 318 | return True 319 | else: 320 | return False 321 | num_retweets = len(list(collection_one.set_custom_filter(is_tweet_a_retweet))) 322 | 323 | collection_two = SmappCollection('bson', file_path) 324 | def is_not_a_retweet(tweet): 325 | if 'retweeted' in tweet and tweet['retweeted']: 326 | return False 327 | else: 328 | return True 329 | num_non_retweets = len(list(collection_two.set_custom_filter(is_not_a_retweet))) 330 | self.assertEqual(num_retweets + num_non_retweets, full_collection_len) 331 | 332 | if __name__ == '__main__': 333 | unittest.main() 334 | -------------------------------------------------------------------------------- /test/test_smapp_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappDataset, SmappCollection 7 | from smappdragon import BsonCollection 8 | 9 | class TestSmappDataset(unittest.TestCase): 10 | def test_control(self): 11 | self.assertTrue(True) 12 | 13 | def test_smapp_dataset_takes_base_input_types(self): 14 | file_path_bson = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 15 | file_path_json = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 16 | file_path_csv = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 17 | collection = SmappDataset(['bson', file_path_bson], ['json', file_path_json], ['csv', file_path_csv]) 18 | self.assertTrue(len(list(collection)) > 0) 19 | 20 | def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self): 21 | file_path_bson = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 22 | file_path_bson_2 = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 23 | file_path_json = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 24 | file_path_csv = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 25 | collection = SmappCollection('bson', file_path_bson_2) 26 | dataset_1 = SmappDataset(['bson', file_path_bson], ['csv', file_path_csv]) 27 | dataset_2 = SmappDataset(dataset_1, ['json', file_path_json], collection) 28 | self.assertTrue(len(list(dataset_2)) > 0) 29 | 30 | # def test_smapp_dataset_takes_collection_regex(self): 31 | # dataset = SmappDataset(['mongo', 32 | # config['mongo']['host'], 33 | # config['mongo']['port'], 34 | 
# config['mongo']['user'], 35 | # config['mongo']['password'], 36 | # config['mongo']['database']], collection_regex='(^data$|^tweets$|^tweets_\d+$)') 37 | # self.assertTrue(len(list(dataset)) > 0) 38 | 39 | # def test_smapp_dataset_takes_database_regex(self): 40 | # dataset = SmappDataset(['mongo', 41 | # config['mongo']['host'], 42 | # config['mongo']['port'], 43 | # config['mongo']['user'], 44 | # config['mongo']['password'], 45 | # config['mongo']['collection']], database_regex='(^47Traitors$)') 46 | # self.assertTrue(len(list(dataset)) > 0) 47 | 48 | # def test_smapp_dataset_takes_database_regex_and_collection_regex(self): 49 | # dataset = SmappDataset(['mongo', 50 | # config['mongo']['host'], 51 | # config['mongo']['port'], 52 | # config['mongo']['user'], 53 | # config['mongo']['password']], database_regex='(^47Traitors$)', collection_regex='(^data$|^tweets$|^tweets_\d+$)') 54 | # self.assertTrue(len(list(dataset)) > 0) 55 | 56 | def test_smapp_dataset_file_pattern_takes_a_unix_pattern(self): 57 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 58 | dataset = SmappDataset(['bson', 'file_pattern', file_path]) 59 | self.assertTrue(len(list(dataset)) > 0) 60 | 61 | def test_smapp_dataset_file_pattern_takes_home_path(self): 62 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 63 | file_path = file_path.replace('/Users/yvanscher', '~') 64 | dataset = SmappDataset(['bson','file_pattern',file_path]) 65 | self.assertTrue(len(list(dataset)) > 0) 66 | 67 | def test_smapp_dataset_file_pattern_returns_two_collections(self): 68 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 69 | dataset = SmappDataset(['bson','file_pattern',file_path]) 70 | self.assertTrue(all([type(collection) == BsonCollection for collection in dataset.collections])) 71 | 72 | def test_smapp_bson_collection_iterates(self): 73 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 74 | dataset = SmappDataset(['bson', file_path]) 75 | self.assertTrue(len(list(dataset)) > 0) 76 | 77 | def test_smapp_json_collection_iterates(self): 78 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 79 | dataset = SmappDataset(['json', file_path]) 80 | self.assertTrue(len(list(dataset)) > 0) 81 | 82 | def test_smapp_csv_collection_iterates(self): 83 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 84 | dataset = SmappDataset(['csv', file_path]) 85 | self.assertTrue(len(list(dataset)) > 0) 86 | 87 | # limit before mongo because mongo should be limited or it takes too long 88 | def test_limit_number_of_tweets(self): 89 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 90 | dataset = SmappDataset(['bson', file_path]) 91 | self.assertTrue(len(list(dataset.limit_number_of_tweets(100))) > 0) 92 | 93 | # def test_smapp_mongo_collection_iterates(self): 94 | # dataset = SmappDataset(['mongo', 95 | # config['mongo']['host'], 96 | # config['mongo']['port'], 97 | # config['mongo']['user'], 98 | # config['mongo']['password'], 99 | # config['mongo']['database'], 100 | # config['mongo']['collection']]) 101 | # self.assertTrue(len(list(dataset.limit_number_of_tweets(100))) > 0) 102 | 103 | def test_get_tweet_texts(self): 104 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 105 | dataset = 
SmappDataset(['bson', file_path]) 106 | texts = [text for text in dataset.limit_number_of_tweets(1).get_tweet_texts()] 107 | self.assertEqual(str, type(texts[0])) 108 | 109 | def test_count_tweet_terms(self): 110 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 111 | dataset = SmappDataset(['bson', file_path]) 112 | count = dataset.count_tweet_terms('jade') 113 | self.assertEqual(167, count) 114 | 115 | def test_count_tweet_terms_multiple(self): 116 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 117 | dataset = SmappDataset(['bson', file_path]) 118 | count = dataset.count_tweet_terms('jade', 'helm') 119 | self.assertEqual(176, count) 120 | 121 | def test_count_tweets(self): 122 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 123 | dataset = SmappDataset(['bson', file_path]) 124 | count = dataset.count_tweets() 125 | self.assertEqual(1187, count) 126 | 127 | def test_get_tweets_containing(self): 128 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 129 | dataset = SmappDataset(['bson', file_path]) 130 | count = len([tweet for tweet in dataset.get_tweets_containing('jade')]) 131 | self.assertEqual(167, count) 132 | 133 | def test_get_tweets_containing_multiple(self): 134 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 135 | dataset = SmappDataset(['bson', file_path]) 136 | count = len([tweet for tweet in dataset.get_tweets_containing('jade', 'helm')]) 137 | self.assertEqual(176, count) 138 | 139 | def test_get_date_range(self): 140 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 141 | dataset = SmappDataset(['bson', file_path]) 142 | count = len([tweet for tweet in dataset.get_date_range(datetime(2015,11,2), datetime(2015,11,3))]) 143 | self.assertEqual(26, count) 144 | 145 | def test_find_date_range(self): 146 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 147 | dataset = SmappDataset(['bson', file_path]) 148 | range_obj = dataset.find_date_range() 149 | self.assertEqual(datetime(2015, 11, 2, 19, 56, 33), range_obj['date_min']) 150 | self.assertEqual(datetime(2015, 11, 6, 21, 35, 54), range_obj['date_max']) 151 | 152 | def test_tweet_language_is(self): 153 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 154 | dataset = SmappDataset(['bson', file_path]) 155 | count = len([tweet for tweet in dataset.tweet_language_is('en')]) 156 | self.assertEqual(825, count) 157 | 158 | def test_detect_tweet_language(self): 159 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 160 | dataset = SmappDataset(['bson', file_path]) 161 | count = len([tweet for tweet in dataset.detect_tweet_language('en')]) 162 | self.assertEqual(907, count) 163 | 164 | def test_user_language_is(self): 165 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 166 | dataset = SmappDataset(['bson', file_path]) 167 | count = len([tweet for tweet in dataset.user_language_is('en')]) 168 | self.assertEqual(801, count) 169 | 170 | def test_exclude_retweets(self): 171 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 172 | dataset = SmappDataset(['bson', file_path]) 173 | count = len([tweet for tweet 
in dataset.exclude_retweets()]) 174 | self.assertEqual(682, count) 175 | 176 | def test_get_retweets(self): 177 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 178 | dataset = SmappDataset(['bson', file_path]) 179 | count = len([tweet for tweet in dataset.get_retweets()]) 180 | self.assertEqual(505, count) 181 | 182 | def test_user_location_contains(self): 183 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 184 | dataset = SmappDataset(['bson', file_path]) 185 | count = len([tweet for tweet in dataset.user_location_contains('TX')]) 186 | self.assertEqual(10, count) 187 | 188 | def test_user_description_contains(self): 189 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 190 | dataset = SmappDataset(['json', file_path]) 191 | count = len([tweet for tweet in dataset.user_description_contains('JESUS')]) 192 | self.assertEqual(15, count) 193 | 194 | def test_user_id_is(self): 195 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 196 | dataset = SmappDataset(['json', file_path]) 197 | count = len([tweet for tweet in dataset.user_id_is(379851447, 149751818)]) 198 | self.assertEqual(77, count) 199 | 200 | def test_place_name_contains_country(self): 201 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 202 | dataset = SmappDataset(['json', file_path]) 203 | count = len([tweet for tweet in dataset.place_name_contains_country('United States')]) 204 | self.assertEqual(6, count) 205 | 206 | def test_within_geobox(self): 207 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 208 | dataset = SmappDataset(['json', file_path]) 209 | # geobox here is for us mountain time 210 | # i created a coordinate in our data file on the last object [-105.29, 40.33] 211 | # i also added one to the json that is outside of us mountain time [-123.007053, 44.824997] 212 | count = len([tweet for tweet in dataset.within_geobox(-113.95, 28.81, -100.05, 48.87)]) 213 | self.assertEqual(1, count) 214 | 215 | def test_get_geo_enabled(self): 216 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 217 | dataset = SmappDataset(['bson', file_path]) 218 | count = len([tweet for tweet in dataset.get_geo_enabled()]) 219 | self.assertEqual(1, count) 220 | 221 | def test_get_non_geo_enabled(self): 222 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 223 | dataset = SmappDataset(['bson', file_path]) 224 | count = len([tweet for tweet in dataset.get_non_geo_enabled()]) 225 | self.assertEqual(1186, count) 226 | 227 | 228 | def test_dump_to_bson(self): 229 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 230 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 231 | 232 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 233 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 234 | dataset.dump_to_bson(output_path) 235 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson') > 0) 236 | 237 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 238 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 239 | 
240 | def test_dump_to_json(self): 241 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 242 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 243 | 244 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 245 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 246 | dataset.dump_to_json(output_path) 247 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json')) > 0) 248 | 249 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 250 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 251 | 252 | def test_dump_to_csv(self): 253 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 254 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 255 | 256 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 257 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 258 | dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 259 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv') > 0) 260 | 261 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 262 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 263 | 264 | def test_dump_to_sqlite_db(self): 265 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 266 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 267 | 268 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db' 269 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 270 | dataset.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 271 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db') > 0) 272 | 273 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 274 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 275 | 276 | def test_dump_to_bson_parallel(self): 277 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson'): 278 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson') 279 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson'): 280 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson') 281 | 282 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 283 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 284 | dataset.dump_to_bson(output_path, num_files=2) 285 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_0.bson') > 0) 286 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_1.bson') > 0) 287 | 288 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson'): 289 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson') 290 | if 
os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson'): 291 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson') 292 | 293 | def test_dump_to_json_parallel(self): 294 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json'): 295 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json') 296 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json'): 297 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json') 298 | 299 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 300 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 301 | dataset.dump_to_json(output_path, num_files=2) 302 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output_0.bson.json')) > 0) 303 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output_1.bson.json')) > 0) 304 | 305 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json'): 306 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json') 307 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json'): 308 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json') 309 | 310 | def test_dump_to_csv_parallel(self): 311 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv'): 312 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv') 313 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv'): 314 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv') 315 | 316 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 317 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 318 | dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], num_files=2) 319 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_0.csv') > 0) 320 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_1.csv') > 0) 321 | 322 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv'): 323 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv') 324 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv'): 325 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv') 326 | 327 | def test_get_top_hashtags(self): 328 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 329 | dataset = SmappDataset(['bson', file_path]) 330 | base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}} 331 | hashtags = dataset.get_top_hashtags(5) 332 | self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys())) 333 | 334 | def test_get_top_urls(self): 335 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 336 | dataset = SmappDataset(['bson', file_path]) 337 | urls = dataset.get_top_urls(5) 338 | base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39, 'https://t.co/l9OEuvRlt8': 
24, 'https://t.co/nkc4hnukLX': 21, 'https://t.co/rsNUItS48U': 60}} 339 | self.assertTrue(set(urls.keys()) == set(base_urls.keys())) 340 | 341 | def test_get_top_mentions(self): 342 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 343 | dataset = SmappDataset(['bson', file_path]) 344 | top_mentions = dataset.get_top_mentions(5) 345 | base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75, '1619936671': 41, '733417892': 121}} 346 | self.assertTrue(set(top_mentions.keys()) == set(base_top_mentions.keys())) 347 | 348 | def test_get_top_media(self): 349 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 350 | dataset = SmappDataset(['bson', file_path]) 351 | top_media = dataset.get_top_media(5) 352 | base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17, 'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14, 'https://t.co/ORaTXOM2oX': 55}} 353 | self.assertTrue(set(top_media.keys()) == set(base_top_media.keys())) 354 | 355 | def test_get_top_symbols(self): 356 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 357 | dataset = SmappDataset(['bson', file_path]) 358 | top_symbols = dataset.get_top_symbols(5) 359 | base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}} 360 | self.assertTrue(set(top_symbols.keys()) == set(base_top_symbols.keys())) 361 | 362 | def test_get_top_terms(self): 363 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 364 | dataset = SmappDataset(['bson', file_path]) 365 | top_counts = dataset.get_top_terms(10) 366 | base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173, 'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193} 367 | self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys())) 368 | 369 | def test_base_top_entities_returns_dict(self): 370 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 371 | dataset = SmappDataset(['bson', file_path]) 372 | returndict = dataset.get_top_entities({'hashtags':5}) 373 | self.assertTrue(isinstance(returndict, dict)) 374 | 375 | def test_base_top_entities_returns_hashtags(self): 376 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 377 | dataset = SmappDataset(['bson', file_path]) 378 | returndict = dataset.get_top_entities({'hashtags':5}) 379 | self.assertTrue('hashtags' in returndict) 380 | 381 | def test_base_top_entities_returns_hashtags_and_media(self): 382 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 383 | dataset = SmappDataset(['bson', file_path]) 384 | returndict = dataset.get_top_entities({'user_mentions':5, 'media':3}) 385 | self.assertTrue('user_mentions' in returndict and 'media' in returndict) 386 | 387 | def test_base_top_entities_returns_counts(self): 388 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 389 | dataset = SmappDataset(['bson', file_path]) 390 | returndict = dataset.get_top_entities({'urls':5, 'symbols':3}) 391 | if len(returndict['urls']) > 0: 392 | self.assertTrue(len(returndict['urls']) == 5) 393 | if len(returndict['symbols']) > 0: 394 | self.assertTrue(len(returndict['symbols']) == 3) 395 | 396 | def test_sample_returns_right_number_of_items(self): 397 | 
file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 398 | dataset = SmappDataset(['bson', file_path]) 399 | sample_collection = dataset.sample(10) 400 | self.assertEqual(10, len(list(sample_collection))) 401 | 402 | def test_sample_returns_dif_tweets_than_fist_10_tweets(self): 403 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 404 | dataset = SmappDataset(['bson', file_path]) 405 | sample_tweets = list(dataset.sample(10)) 406 | dataset_two = SmappDataset(['bson', file_path]) 407 | first_ten_tweets = list(dataset_two.limit_number_of_tweets(10)) 408 | self.assertNotEqual(sample_tweets, first_ten_tweets) 409 | 410 | def test_sample_chains_and_dumps(self): 411 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 412 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 413 | 414 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 415 | collection = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 416 | sample_tweets = collection.sample(10) 417 | sample_tweets.dump_to_json(output_path) 418 | self.assertTrue(os.path.getsize(output_path) > 0) 419 | with open(output_path) as f: 420 | self.assertEqual(10, len([line for line in f])) 421 | 422 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 423 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 424 | 425 | def test_set_custom_filter_properly_filters(self): 426 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 427 | dataset_one = SmappDataset(['bson', file_path]) 428 | full_collection_len = len(list(dataset_one)) 429 | def is_tweet_a_retweet(tweet): 430 | if 'retweeted' in tweet and tweet['retweeted']: 431 | return True 432 | else: 433 | return False 434 | num_retweets = len(list(dataset_one.set_custom_filter(is_tweet_a_retweet))) 435 | 436 | dataset_two = SmappDataset(['bson', file_path]) 437 | def is_not_a_retweet(tweet): 438 | if 'retweeted' in tweet and tweet['retweeted']: 439 | return False 440 | else: 441 | return True 442 | num_non_retweets = len(list(dataset_two.set_custom_filter(is_not_a_retweet))) 443 | self.assertEqual(num_retweets + num_non_retweets, full_collection_len) 444 | 445 | 446 | if __name__ == '__main__': 447 | unittest.main() 448 | --------------------------------------------------------------------------------
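
Note on running these tests: every test module imports `test.config`, a local configuration module that is deliberately not checked in (`config.py` appears in `.gitignore`). The sketch below is a hypothetical example of such a module, not the project's actual file: the key names are the ones the tests look up, the relative data paths mirror the `test/data/` files in this listing, and everything else (model paths, sample images, mongo credentials) is a placeholder assumption you would replace locally.

```python
# test/config.py -- hypothetical example config for running the pysmap test suite.
# Only the key names are taken from the tests in this repository; every value
# marked "placeholder" below is an assumption, not part of the project.
config = {
    'bson': {'valid': 'data/valid.bson'},
    'json': {'valid': 'data/valid.bson.json'},
    'csv': {'valid': 'data/valid.csv'},
    'crowd': {
        'resnet50': '/path/to/crowd_model.h5',      # placeholder: local crowd-model weights
        'dl_path': '/path/to/downloaded_model.h5',  # placeholder: where the download test writes
        'crowd_img': '/path/to/crowd.jpg',          # placeholder: sample image with a crowd
        'noncrowd_img': '/path/to/noncrowd.jpg',    # placeholder: sample image without a crowd
    },
    'mongo': {
        'host': 'localhost',       # placeholder connection details used only by the
        'port': 27017,             # commented-out mongo tests above
        'user': 'smapp_user',
        'password': 'changeme',
        'database': 'test_db',
        'collection': 'tweets',
    },
}
```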