├── .gitignore ├── .pylintrc ├── LICENSE ├── README.md ├── environment.yml ├── pysmap ├── __init__.py ├── mltools │ ├── __init__.py │ ├── crowd_model.py │ └── smapp_model.py ├── twitterutil │ ├── __init__.py │ ├── smapp_collection.py │ └── smapp_dataset.py └── viz │ ├── __init__.py │ ├── networks.py │ └── plots.py ├── setup.py └── test ├── __init__.py ├── data ├── invalid.bson ├── valid-single.bson.json ├── valid.bson ├── valid.bson.json └── valid.csv ├── test_crowd_model.py ├── test_networks.py ├── test_plots.py ├── test_smapp_collection.py └── test_smapp_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .DS_Store 65 | chart_tests/ 66 | 67 | config.py -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | 3 | # Enable the message, report, category or checker with the given id(s). You can 4 | # either give multiple identifier separated by comma (,) or put this option 5 | # multiple time. 6 | #enable= 7 | 8 | # Disable the message, report, category or checker with the given id(s). You 9 | # can either give multiple identifier separated by comma (,) or put this option 10 | # multiple time (only on the command line, not in the configuration file where 11 | # it should appear only once). 12 | disable=pointless-string-statement, too-many-branches, missing-docstring, too-many-arguments, invalid-name, line-too-long, duplicate-code, simplifiable-if-statement 13 | 14 | [FORMAT] 15 | indent-string=\t -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 smapp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | _ __ _ _ ___ _ __ ___ __ _ _ __ 3 | | '_ \| | | / __| '_ ` _ \ / _` | '_ \ 4 | | |_) | |_| \__ \ | | | | | (_| | |_) | 5 | | .__/ \__, |___/_| |_| |_|\__,_| .__/ 6 | |_| |___/ |_| 7 | ``` 8 | 9 | [![PyPI](https://img.shields.io/pypi/v/pysmap.svg)](https://pypi.python.org/pypi/pysmap) [![PyPI](https://img.shields.io/pypi/l/pysmap.svg)](https://github.com/SMAPPNYU/pysmap/blob/master/LICENSE) 10 | 11 | :snake: pysmap is a high level toolkit for dealing with twitter data it also has a higher level interface for [smappdragon](https://github.com/SMAPPNYU/smappdragon). it has functionality from the old toolkit and functionality from our old util library smappPy. 12 | - [twitterutil](#twitterutil) 13 | - [smapp_dataset](#smapp_dataset) 14 | - [smapp_collection](#smapp_collection) 15 | - [set_custom_filter](#set_custom_filter) 16 | - [get_tweets_containing](#get_tweets_containing) 17 | - [get_top_terms](#get_top_terms) 18 | - [get_tweet_texts](#get_tweet_texts) 19 | - [get_date_range](#get_date_range) 20 | - [get_geo_enabled](#get_geo_enabled) 21 | - [get_non_geo_enabled](#get_non_geo_enabled) 22 | - [get_top_entities](#get_top_entities) 23 | - [get_top_hashtags](#get_top_hashtags) 24 | - [get_top_urls](#get_top_urls) 25 | - [get_top_mentions](#get_top_mentions) 26 | - [get_top_media](#get_top_media) 27 | - [get_top_symbols](#get_top_symbols) 28 | - [find_date_range](#find_date_range) 29 | - [count_tweet_terms](#count_tweet_terms) 30 | - [count_tweets](#count_tweets) 31 | - [exclude_retweets](#exclude_retweets) 32 | - [get_retweets](#get_retweets) 33 | - [user_location_contains](#user_location_contains) 34 | - [user_description_contains](#user_description_contains) 35 | - [user_id_is](#user_id_is) 36 | - [place_name_contains_country](#place_name_contains_country) 37 | - [within_geobox](#within_geobox) 38 | - [limit_number_of_tweets](#limit_number_of_tweets) 39 | - [tweet_language_is](#tweet_language_is) 40 | - [detect_tweet_language](#detect_tweet_language) 41 | - [user_language_is](#user_language_is) 42 | - [sample](#sample) 43 | - [dump_to_bson](#dump_to_bson) 44 | - [dump_to_json](#dump_to_json) 45 | - [dump_to_csv](#dump_to_csv) 46 | - [dump_to_sqlite_db](#dump_to_sqlite_db) 47 | - [viz](#viz) 48 | - [plots](#plots) 49 | - [bar_graph_tweet_field_grouped_by_period](#bar_graph_tweet_field_grouped_by_period) 50 | - [bar_graph_languages](#bar_graph_languages) 51 | - [bar_graph_user_languages](#bar_graph_user_languages) 52 | - [bar_graph_tweets](#bar_graph_tweets) 53 | - [bar_graph_tweets_with_urls](#bar_graph_tweets_with_urls) 54 | - [bar_graph_tweets_with_media](#bar_graph_tweets_with_media) 55 | - [bar_graph_tweets_with_mentions](#bar_graph_tweets_with_mentions) 56 | - [bar_graph_tweets_with_hashtags](#bar_graph_tweets_with_hashtags) 57 | - [bar_graph_tweets_with_symbols](#bar_graph_tweets_with_symbols) 58 | - 
[bar_graph_tweets_with_retweets](#bar_graph_tweets_with_retweets) 59 | - [bar_graph_tweets_with_locations](#bar_graph_tweets_with_locations) 60 | - [networks](#networks) 61 | - [retweet_network](#retweet_network) 62 | - [models](#models) 63 | - [crowd_model](#crowd_model) 64 | 65 | # installation 66 | 67 | `pip install pysmap` 68 | 69 | `pip install pysmap --upgrade` 70 | 71 | # twitterutil 72 | 73 | the package with an array of twitter tools. 74 | 75 | # smapp_collection 76 | 77 | this is the smapp_collection class, an abstraction of smappdragon collections. 78 | 79 | abstract: 80 | ```python 81 | from pysmap import SmappCollection 82 | 83 | collection = SmappCollection(DATA_TYPE, OTHER_INPUTS) 84 | ``` 85 | 86 | practical: 87 | ```python 88 | from pysmap import SmappCollection 89 | 90 | collection = SmappCollection('bson', '/path/to/my/bson/file.bson') 91 | # or 92 | collection = SmappCollection('mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1') 93 | # or 94 | collection = SmappCollection('json', '/path/to/my/file.json') 95 | # or 96 | collection = SmappCollection('csv', '/path/to/my/csv/file.csv') 97 | ``` 98 | 99 | *returns* a collection object that you can use to call methods below on 100 | 101 | # smapp_dataset 102 | 103 | this is the dataset class, it can be used anywhere one might use a [SmappCollection](#smapp_collection) object. it lets you combine collections and other datasets at will. 104 | 105 | abstract: 106 | ```python 107 | 108 | # standard 109 | 110 | dataset = SmappDataset([TYPE_OF INPUT, FILE_PATH], [TYPE_OF_INPUT, MONGO_INPUTS]) 111 | 112 | # or with regex for matching mongo databases/collections 113 | # this is only for mongo and not for files 114 | 115 | dataset = SmappDataset(collection_regex=REGEX, database_regex=REGEX, [MONGO_INPUT, MONGO_INPUT, etc]) 116 | 117 | dataset = SmappDataset(collection_regex=REGEX, [MONGO_INPUT, MONGO_INPUT, etc]) 118 | 119 | # or with a unix style file pattern for matching file paths (this is not regex) 120 | # this is only for files and not for mongo 121 | 122 | dataset = SmappDataset([TYPE_OF_INPUT, 'file_pattern', FILE_PATTTERN], [TYPE_OF_INPUT, 'file_pattern', FILE_PATTTERN], etc) 123 | ``` 124 | 125 | practical: 126 | ```python 127 | # combine collections of the same type 128 | dataset = SmappDataset(['bson', '/path/to/my/bson/file1.bson'], ['bson', '/path/to/my/bson/file2.bson'], ['bson', '/path/to/my/bson/file3.bson']) 129 | 130 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_2']) 131 | 132 | # combine collections of different types 133 | 134 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['bson', '/path/to/my/bson/file1.bson'], ['json', '/path/to/my/bson/json_file.json']) 135 | 136 | # or combine collections and datasets 137 | 138 | collection = SmappCollection('csv', '/path/to/my/csv/file.csv') 139 | 140 | dataset_one = SmappDataset(['bson', '/path/to/my/bson/file1.bson'], ['bson', '/path/to/my/bson/file2.bson'], ['bson', '/path/to/my/bson/file3.bson']) 141 | 142 | dataset_two = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_1'], ['mongo', 
'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler', 'tweets_2']) 143 | 144 | final_dataset = SmappDataset(['json', '/path/to/my/bson/json_file.json'], dataset_one, dataset_two, collection) 145 | 146 | # or use regex to match for multiple collections/dbs 147 | 148 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD', 'GERMANY_ELECTION_2015_Nagler'], collection_regex='(^data$|^tweets$|^tweets_\d+$)') 149 | 150 | dataset = SmappDataset(['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD'], collection_regex='(^tweets$|^tweets_\d+$)', database_regex='(^GERMANY_ELECTION_2015_Nagler_\d+$)') 151 | 152 | # or use a file pattern to match many files 153 | dataset_one = SmappDataset(['bson', 'file_pattern', '~/smappwork/data_*.bson']) 154 | 155 | dataset_two = SmappDataset(['json', 'file_pattern', '~/smappwork/data_*.json'], ['csv', 'file_pattern', '/Users/yvan/data/counts_*.csv']) 156 | 157 | dataset_three = SmappDataset(['json', '/non/pattern/path/to/my/bson/json_file.json'], dataset_one, dataset_two) 158 | ``` 159 | 160 | `regex` - regex stands for 'regular expression' its the way programmers pattern match on words, so regex inputs for SmappDataset allow you to pattern match data sources, you must use regex type input patterns or lists+collections+datasets as inputs you cannot use both 161 | 162 | `collection_regex` - this is required, to grab all collections named tweets_X (backwards compatiblilty) use `(^tweets$|^tweets_\d+$)` for new/regular collections use `(^data$)` or `(^data$|^tweets$|^tweets_\d+$)` for compatilibly backwards and forwards, if you have a different naming convention you can use a regex to match for that. 163 | 164 | `database_regex` - only required for mongo datasets, you can omit this variable if you are not using regex to try to match databases 165 | 166 | `file_pattern` - use to select multiple file paths based off a unix style pattern. pysmap smapp_dataset uses [glob](https://docs.python.org/2/library/glob.html#module-glob) under the hood to match the filepaths. pysmap also includes tilde `~` expansion which is not included by glob. so for example: 167 | ``` 168 | /scratch/smapp/test_dumps_dec1/dump_*.json 169 | #would match 170 | /scratch/smapp/test_dumps_dec1/dump_1.json 171 | /scratch/smapp/test_dumps_dec1/dump_blah_blah.json 172 | #and 173 | try_dump_dat_parallel_?.bson 174 | #would match 175 | try_dump_dat_parallel_0.bson 176 | try_dump_dat_parallel_1.bson 177 | #and 178 | try_dump_dat_parallel_[0-9].* 179 | #would match 180 | try_dump_dat_parallel_0.bson 181 | try_dump_dat_parallel_0.csv 182 | try_dump_dat_parallel_0.db 183 | try_dump_dat_parallel_0.json 184 | try_dump_dat_parallel_1.bson 185 | try_dump_dat_parallel_1.csv 186 | try_dump_dat_parallel_1.db 187 | try_dump_dat_parallel_1.json 188 | ``` 189 | read about [unix file patterns here](http://www.robelle.com/smugbook/wildcard.html). 190 | 191 | regex explanation example in the statement: 192 | 193 | ```python 194 | dataset = SmappDataset(collection_regex='(^tweets$|^tweets_\d+$)', database_regex='(^GERMANY_ELECTION_2015_Nagler_\d+$)', ['mongo', 'superhost.bio.nyu.edu', 27574, smappReadWriteUserName, 'PASSWORD']) 195 | ``` 196 | 197 | the collection regex `(^tweets$|^tweets_\d+$)` means match every collection that is called tweets or tweets_\d where `\d` is some number. 
so tweets, tweets_1, tweets_2, etc 198 | 199 | the database regex `(^GERMANY_ELECTION_2015_Nagler_\d+$)` means match every database that has GERMANY_ELECTION_2015_Nagler_\d where `\d` is some number. so GERMANY_ELECTION_2015_Nagler_1, GERMANY_ELECTION_2015_Nagler_2, etc. the regex will not match 'GERMANY_ELECTION_2015_Nagler' in this case as it lacks the term '^GERMANY_ELECTION_2015_Nagler$'. 200 | 201 | *input* several `SmappDataset` objects and/or `SmappCollection` objects 202 | 203 | *output* a SmappDataset object that can be used the same way a [SmappCollection](#smapp_collection) can be 204 | 205 | # iterate through tweets 206 | 207 | iterate through the tweets in the collection you've made. 208 | 209 | abstract: 210 | ```python 211 | for tweet in collection: 212 | print(tweet) 213 | ``` 214 | 215 | practical: 216 | ```python 217 | for tweet in collection.get_tweets_containing('cat').tweet_language_is('fr'): 218 | print(tweet) 219 | ``` 220 | 221 | note: 222 | 223 | if on nyu hpc, print will not work, totally out of my control. you gotta change locale. 224 | 225 | to fix it, you need to reset the default bash encoding BEFORE opening/running python. just type in bash: 226 | ``` 227 | LANG=en_US.utf8 228 | ``` 229 | 230 | # set_custom_filter 231 | 232 | sets a user defined function to act as a filter 233 | 234 | abstract: 235 | ```python 236 | collection.set_custom_filter(FILTER_FUNCTION) 237 | ``` 238 | 239 | practical: 240 | ```python 241 | def my_cust_filter(tweet): 242 | if 'text' in tweet and 'cats' in tweet['text']: 243 | return True 244 | else: 245 | return False 246 | 247 | collection.set_custom_filter(my_cust_filter) 248 | ``` 249 | 250 | *returns* a collection or dataset where all tweets will be passed through the filter 251 | 252 | note: this is just a wrapper for smappdragon's [set_custom_filter](https://github.com/SMAPPNYU/smappdragon#set_custom_filter) function. 253 | 254 | # get_tweets_containing 255 | 256 | gets tweets containing the specified term. 257 | 258 | abstract: 259 | ```python 260 | collection.get_tweets_containing(TERM) 261 | ``` 262 | 263 | practical: 264 | ```python 265 | collection.get_tweets_containing('cats') 266 | ``` 267 | 268 | *returns* a collection which will filter out any tweets that do not have the specified term 269 | 270 | # count_tweet_terms 271 | 272 | counts the number of tweets that contain all of these terms 273 | 274 | abstract: 275 | ```python 276 | collection.count_tweet_terms(TERM_1, TERM_2, ETC) 277 | ``` 278 | 279 | practical: 280 | ```python 281 | count = collection.count_tweet_terms('cats', 'dogs') 282 | print(count) 283 | ``` 284 | 285 | *returns* an integer value that counts all the tweets containing the terms 286 | 287 | # count_tweets 288 | 289 | counts the number of tweets in a collection 290 | 291 | abstract: 292 | ```python 293 | collection.count_tweets() 294 | ``` 295 | 296 | practical: 297 | ```python 298 | count = collection.count_tweets() 299 | print(count) 300 | ``` 301 | 302 | *returns* an integer value that counts all the tweets in a collection 303 | 304 | # get_top_terms 305 | 306 | counts the top words in a collection, [english stop words](https://github.com/Alir3z4/stop-words/blob/25c6a0aea665871e887f155b883e950c3743ce50/english.txt) are automatically used as stopwords, otherwise you can specify your own set of stopwords with python stop-words.
the stopwords are words that get ignored and will not show up in the final counts 307 | 308 | abstract: 309 | ```python 310 | collection.get_top_terms(NUMBER_OF_TERMS, LIST_OF_STOP_WORDS) 311 | ``` 312 | 313 | practical: 314 | ```python 315 | count = collection.get_top_terms(5) 316 | #or 317 | count = collection.get_top_terms(5, ['blah', 'it', 'cat']) 318 | print(count) 319 | ``` 320 | 321 | *note* `LIST_OF_STOP_WORDS` is optional, it is set to english by default 322 | 323 | *returns* a dictionary that has all the top_X terms 324 | 325 | # get_tweet_texts 326 | 327 | returns just the text of each tweet in the collection. 328 | 329 | abstract: 330 | ```python 331 | for text in collection.get_tweet_texts(): 332 | print(text) 333 | ``` 334 | 335 | practical: 336 | ```python 337 | for text in collection.get_tweet_texts(): 338 | print(text) 339 | ``` 340 | 341 | *returns* an iterator that returns just the text of each tweet 342 | 343 | # get_date_range 344 | 345 | gets tweets in a date range specified by python datetime objects 346 | 347 | abstract: 348 | ```python 349 | collection.get_date_range(START, END) 350 | ``` 351 | 352 | practical: 353 | ```python 354 | from datetime import datetime 355 | collection.get_date_range(datetime(2014,1,30), datetime(2014,4,30)) 356 | ``` 357 | 358 | *returns* a collection that will only return tweets from the specified datetime range 359 | 360 | # find_date_range 361 | 362 | finds the date range (min/max date in a collection) 363 | 364 | abstract: 365 | ```python 366 | collection.find_date_range() 367 | ``` 368 | 369 | practical: 370 | ```python 371 | from datetime import datetime 372 | range = collection.find_date_range() 373 | print(range) 374 | # or compare to datetime objects 375 | if range['date_min'] > datetime.now(): 376 | print('greater') 377 | elif range['date_max'] < datetime.now(): 378 | print('less') 379 | print('whatever') 380 | ``` 381 | 382 | *output* 383 | ``` 384 | {"date_min":datetime(2016,5,23),"date_max":datetime(2016,5,24)} 385 | ``` 386 | 387 | *returns* a dictionary with two datetime objects 388 | 389 | # tweet_language_is 390 | 391 | only returns tweets where the tweet language is the specified one (differs from [detect_tweet_language](#detect_tweet_language), this just checks the language field reported by twitter on the tweet object, it does not detect) 392 | 393 | abstract: 394 | ```python 395 | collection.tweet_language_is(LANGUAGE_CODES) 396 | ``` 397 | 398 | practical: 399 | ```python 400 | #get tweets in english and french 401 | collection.tweet_language_is('en', 'fr') 402 | ``` 403 | 404 | *returns* a collection where all the tweets have their text language as the specified language 405 | 406 | # detect_tweet_language 407 | 408 | a filter that filters tweets based on language detection. (differs from [tweet_language_is](#tweet_language_is) because it actually detects the language, tweet_language_is just checks the field on the tweet object reported by twitter) 409 | 410 | abstract: 411 | ```python 412 | collection.detect_tweet_language(LANGUAGE_CODES) 413 | ``` 414 | 415 | practical: 416 | ```python 417 | #get tweets in english 418 | collection.detect_tweet_language('en') 419 | #get tweets in english and french 420 | collection.detect_tweet_language('en', 'fr') 421 | ``` 422 | 423 | *returns* a collection where all the tweets have their text language as the specified language 424 | 425 | note: uses [langdetect](https://pypi.python.org/pypi/langdetect?) under the hood. it is a python port of google's language detection tool.
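to make the difference between the two language filters concrete, here is a minimal sketch (both filters return collections, so they chain like the other examples; the language codes are just examples):

```python
# keeps tweets whose 'lang' field, as reported by twitter, is french (no detection performed)
reported_fr = collection.tweet_language_is('fr')

# re-detects the language from the tweet text with langdetect, then keeps french tweets
detected_fr = collection.detect_tweet_language('fr')

for tweet in detected_fr.limit_number_of_tweets(10):
    print(tweet['text'])
```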
426 | 427 | 428 | # user_language_is 429 | 430 | only returns tweets where the user's declared language is the specified one 431 | 432 | abstract: 433 | ```python 434 | collection.user_language_is(LANGUAGE_CODE) 435 | ``` 436 | 437 | practical: 438 | ```python 439 | collection.user_language_is('en') 440 | ``` 441 | 442 | *returns* a collection where all the tweets will come from users whose specified language matches the input 443 | 444 | # exclude_retweets 445 | 446 | excludes retweets from your collection 447 | 448 | abstract: 449 | ```python 450 | collection.exclude_retweets() 451 | ``` 452 | 453 | practical: 454 | ```python 455 | collection.exclude_retweets() 456 | ``` 457 | 458 | *returns* a collection where there are no retweets 459 | 460 | 461 | # get_retweets 462 | 463 | gets all tweets that are retweets from the collection 464 | 465 | abstract: 466 | ```python 467 | collection.get_retweets() 468 | ``` 469 | 470 | practical: 471 | ```python 472 | collection.get_retweets() 473 | ``` 474 | 475 | *returns* a collection where there are only retweets 476 | 477 | # user_location_contains 478 | 479 | returns tweets that have a user location that contains one of the listed terms 480 | 481 | abstract: 482 | ```python 483 | collection.user_location_contains(PLACE_TERM, ANOTHER_PLACE_TERM, ETC) 484 | ``` 485 | 486 | practical: 487 | ```python 488 | collection.user_location_contains('CA', 'FL', 'NY', 'palm springs') 489 | ``` 490 | 491 | *returns* a collection where the user location field of that tweet has any of the specified places 492 | 493 | # user_description_contains 494 | 495 | returns tweets where the user description (for the user tweeting) contains the requested terms 496 | 497 | abstract: 498 | ```python 499 | collection.user_description_contains(TERM, TERM, ETC) 500 | ``` 501 | 502 | practical: 503 | ```python 504 | collection.user_description_contains('dad', 'conservative', 'texas', 'mother') 505 | ``` 506 | 507 | *returns* a collection where the user description field of that tweet has any of the specified terms 508 | 509 | # user_id_is 510 | 511 | returns tweets that match one of the passed in user ids 512 | 513 | abstract: 514 | ```python 515 | collection.user_id_is(ID, ID, ETC) 516 | ``` 517 | 518 | practical: 519 | ```python 520 | collection.user_id_is(379851447, 149751818) 521 | ``` 522 | 523 | *returns* a collection where the user id field matches one of the passed in ids 524 | 525 | # place_name_contains_country 526 | 527 | returns tweets whose twitter place name contains one of the specified countries 528 | 529 | abstract: 530 | ```python 531 | collection.place_name_contains_country(PLACE_TERM, ANOTHER_PLACE_TERM, ETC) 532 | ``` 533 | 534 | practical: 535 | ```python 536 | collection.place_name_contains_country('United States', 'France', 'Spain') 537 | ``` 538 | 539 | *returns* a collection where the places field of that tweet has the specified place 540 | 541 | note: for more information about places see https://dev.twitter.com/overview/api/places 542 | 543 | # within_geobox 544 | 545 | returns tweets that are within a geobox 546 | 547 | abstract: 548 | ```python 549 | collection.within_geobox(sw_longitude, sw_latitude, ne_longitude, ne_latitude) 550 | ``` 551 | 552 | practical: 553 | ```python 554 | collection.within_geobox(-77.042484, 38.886323, -77.010384, 38.894006) 555 | ``` 556 | 557 | *returns* a collection where the tweets streaming through will be from the stated geobox 558 | 559 | note: 560 | sw_longitude, sw_latitude - the southwest corner 561 | ne_longitude, ne_latitude - the northeast corner
562 | geobox specified by points (longitude, latitude) 563 | 564 | # get_geo_enabled 565 | 566 | returns only geotagged tweets 567 | 568 | abstract: 569 | ```python 570 | collection.get_geo_enabled() 571 | ``` 572 | 573 | practical: 574 | ```python 575 | collection.get_geo_enabled() 576 | ``` 577 | 578 | *returns* a collection that only produces geo tagged tweets 579 | 580 | # get_non_geo_enabled 581 | 582 | returns only non geotagged tweets 583 | 584 | abstract: 585 | ```python 586 | collection.get_non_geo_enabled() 587 | ``` 588 | 589 | practical: 590 | ```python 591 | collection.get_non_geo_enabled() 592 | ``` 593 | 594 | *returns* a collection that only produces non geo tagged tweets 595 | 596 | # limit_number_of_tweets 597 | 598 | limits the number of tweets a collection can output 599 | 600 | abstract: 601 | ```python 602 | collection.limit_number_of_tweets(LIMIT_NUMBER) 603 | ``` 604 | 605 | practical: 606 | ```python 607 | collection.limit_number_of_tweets(145) 608 | 609 | for tweet in collection.limit_number_of_tweets(145): 610 | print(tweet) 611 | ``` 612 | 613 | *returns* a collection that is limited in terms of the number of tweets it can output 614 | 615 | note: works differently than expected on datasets, it will apply this limit to each sub collection/file in the dataset, so if you have 5 files in a dataset it would apply a limit of 145 to each file in the dataset, and 616 | you would end up with 145 x 5 = 725 tweets. 617 | 618 | # sample 619 | 620 | gets a sample of tweets from a collection using reservoir sampling 621 | 622 | abstract: 623 | ```python 624 | collection.sample(NUMBER_OF_TWEETS_TO_SAMPLE) 625 | ``` 626 | 627 | practical: 628 | ```python 629 | 630 | for tweet in collection.sample(10): 631 | print(tweet) 632 | ``` 633 | 634 | *returns* a collection that only returns a sample of tweets as big as the number of tweets you specified 635 | 636 | note: you can [read more about reservoir sampling here](http://www.geeksforgeeks.org/reservoir-sampling/) and [here](https://en.wikipedia.org/wiki/Reservoir_sampling). reservoir sampling allows us to sample a data set in one pass without knowing ahead of time how many things are in that dataset and still match the underlying distribution of the data. 637 | 638 | note: if you try to sample more tweets than are in a collection or dataset this method will throw an error. this is because reservoir sampling does not work in this scenario. count your datasets first if you are unsure how many data points are in them. 639 | 640 | # dump_to_bson 641 | 642 | abstract: 643 | ```python 644 | collection.dump_to_bson(output_file) 645 | ``` 646 | 647 | practical: 648 | ```python 649 | collection.dump_to_bson('/Users/blah/your_data.bson') 650 | # or with a dataset dumping to one file 651 | dataset.dump_to_bson('/Users/blah/your_data.bson') 652 | # or with a dataset dumping to one file for each input 653 | dataset.dump_to_bson('/Users/blah/your_data.bson', parallel=True) 654 | ``` 655 | 656 | `num_files` - (similar to the former parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. the dataset will try to write evenly to each file.
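for example, a minimal sketch of the `num_files` argument described above, assuming it is passed as a keyword argument to the dump methods (the path here is just a placeholder):

```python
# spread the dataset's output evenly across 3 output files instead of one
dataset.dump_to_bson('/Users/blah/your_data.bson', num_files=3)
```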
657 | 658 | *input* a path to a bson file 659 | 660 | *output* a bson file with the data from your SmappCollection 661 | 662 | note: if you use the [sample](#sample) method you can no longer use the 'parallel' argument to any dump methods, sample has to override the iterators for aech collection, essentially stripping us of the original iterators. 663 | 664 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 665 | 666 | # dump_to_json 667 | 668 | abstract: 669 | ```python 670 | collection.dump_to_json(output_file) 671 | ``` 672 | 673 | practical: 674 | ```python 675 | collection.dump_to_json('/Users/blah/your_data.json') 676 | # or with a dataset dumping to one file 677 | dataset.dump_to_json('/Users/blah/your_data.json') 678 | # or with a dataset dumping to one file for each input 679 | dataset.dump_to_json('/Users/blah/your_data.json', parallel=True) 680 | ``` 681 | 682 | `num files` - (similar to the former the parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. the data set will try to write evenly to each file. 683 | 684 | *input* a path to a json file 685 | 686 | *output* a json file with the data from your SmappCollection 687 | 688 | note: if you use the [sample](#sample) method you can no longer use the 'parallel' argument to any dump methods, sample has to override the iterators for aech collection, essentially stripping us of the original iterators. 689 | 690 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 691 | 692 | # dump_to_csv 693 | 694 | dumps a collection/dataset to a csv based on the fields you specify. can see the fields inside a tweet object [here](https://dev.twitter.com/overview/api/tweets). 695 | 696 | abstract: 697 | ```python 698 | collection.dump_to_csv('/PATH/TO/OUTPUT/FILE.csv', ['FIELD1', 'FIELD2', 'FIELD3.SUBFIELD', ETC]) 699 | ``` 700 | 701 | practical: 702 | ```python 703 | collection.dump_to_csv('~/smappstuff/file.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 704 | # or 705 | collection.limit_number_of_tweets(5).dump_to_csv('/Users/kevin/work/smappwork/file.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 706 | # or with a dataset dumping to one file 707 | dataset.dump_to_csv('/Users/blah/your_data.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 708 | # or with a dataset dumping to one file for each input 709 | dataset.dump_to_csv('/Users/blah/your_data.csv', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], parallel=True) 710 | # or if you have '.' 
in input fields that you want interpreted literally 711 | collection.dump_to_csv('out_file.csv', ['id_str'], top_level=True) 712 | # or if you want to omit the header 713 | collection.dump_to_csv('out_file.csv', ['id_str'], top_level=False) 714 | ``` 715 | 716 | *input* a path to a csv file and fields to keep 717 | 718 | ```python 719 | import pysmap 720 | 721 | collection = pysmap.SmappCollection('json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json') 722 | # or dataset 723 | dataset = pysmap.SmappDataset( 724 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json'], 725 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_19_2016__00_00_00__23_59_59.json'], 726 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_20_2016__00_00_00__23_59_59.json'] 727 | ) 728 | 729 | field_list = ['id_str', 730 | 'coordinates.coordinates.0', 731 | 'coordinates.coordinates.1', 732 | 'user.id_str', 733 | 'user.lang', 734 | 'lang', 735 | 'text', 736 | 'user.screen_name', 737 | 'user.location', 738 | 'user.description', 739 | 'created_at', 740 | 'user.friends_count', 741 | 'user.followers_count', 742 | 'retweet_count', 743 | 'entities.urls.0.expanded_url', 744 | 'entities.urls.1.expanded_url', 745 | 'entities.urls.2.expanded_url', 746 | 'entities.urls.3.expanded_url', 747 | 'entities.urls.4.expanded_url'] 748 | 749 | dataset.dump_to_csv('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.csv', field_list) 750 | ``` 751 | 752 | *output* a csv file with the data from your SmappCollection, but only the fields you chose to keep 753 | 754 | ```csv 755 | id_str,coordinates.coordinates.0,coordinates.coordinates.1,user.id_str,user.lang,lang,text,user.screen_name,user.location,user.description,created_at,user.friends_count,user.followers_count,retweet_count,entities.urls.0.expanded_url,entities.urls.1.expanded_url,entities.urls.2.expanded_url,entities.urls.3.expanded_url,entities.urls.4.expanded_url 756 | 757 | 788556059375874048,50,50,2240756971,en,en,RT @dailypenn: The DP and @WellesleyNews are jointly endorsing Wellesley alum @HillaryClinton over Wharton ’68 @realDonaldTrump.… ,CorrectRecord,,Correct The Record is a strategic research and rapid response team designed to defend Hillary Clinton from baseless attacks.,Wed Oct 19 01:43:09 +0000 2016,224,23080,0,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/ 758 | 759 | 788556059317186560,,,4655522325,fr,fr,Barack Obama conseille à Donald Trump « d’arrêter de pleurnicher » - https://t.co/eEl1mOnIwp https://t.co/8EeOGya28r,metrodakar_net,Senegal,,Wed Oct 19 01:43:09 +0000 2016,110,657,0,http://www.metrodakar.net/barack-obama-conseille-a-donald-trump-darreter-de-pleurnicher/,,,, 760 | ``` 761 | 762 | `num files` - (similar to the former the parallel argument) with the 'num_files' argument you can tell your dataset to write to a specific number of files. the method functionality had to be changed to fix the sample method. 
the data set will try to write evenly to each file. 763 | 764 | note: to get things inside a list you need to refer to their list index. its better to overshoot (so if you want to get 5 entites urls where there are 5) you would use `['entities.urls.0.expanded_url','entities.urls.1.expanded_url','entities.urls.2.expanded_url','entities.urls.3.expanded_url','entities.urls.4.expanded_url']`, for tweet objects with less than 5 `urls` entities this will fill out urls up to 5 urls, if there are less than 5 the extra ones will be empty `,,` fields 765 | 766 | note: empty lists `[]` will return nothing. you must specify fields. 767 | 768 | note: fields that have no value will appear empty `,,` 769 | 770 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 771 | 772 | # dump_to_sqlite_db 773 | 774 | dumps all tweets (only the fields you specify) to an sqlite database file 775 | 776 | abstract: 777 | ```python 778 | collection.dump_to_sqlite_db('/PATH/TO/OUTPUT/FILE.db', ['FIELD1', 'FIELD2', 'FIELD3.SUBFIELD', ETC]) 779 | ``` 780 | 781 | pratical: 782 | ```python 783 | import pysmap 784 | 785 | collection.dump_to_sqlite_db('~/smappstuff/file.db', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 786 | # or 787 | collection.limit_number_of_tweets(5).dump_to_sqlite_db('/Users/kevin/work/smappwork/file.db', ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 788 | # or 789 | dataset = pysmap.SmappDataset( 790 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_18_2016__00_00_00__23_59_59.json'], 791 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_19_2016__00_00_00__23_59_59.json'], 792 | ['json','/scratch/smapp/us_election_hillary_2016/data/us_election_hillary_2016_data__10_20_2016__00_00_00__23_59_59.json'] 793 | ) 794 | 795 | field_list = ['id_str', 796 | 'coordinates.coordinates.0', 797 | 'coordinates.coordinates.1', 798 | 'user.id_str', 799 | 'user.lang', 800 | 'lang', 801 | 'text', 802 | 'user.screen_name', 803 | 'user.location', 804 | 'user.description', 805 | 'created_at', 806 | 'user.friends_count', 807 | 'user.followers_count', 808 | 'retweet_count', 809 | 'entities.urls.0.expanded_url', 810 | 'entities.urls.1.expanded_url', 811 | 'entities.urls.2.expanded_url', 812 | 'entities.urls.3.expanded_url', 813 | 'entities.urls.4.expanded_url'] 814 | 815 | dataset.dump_to_sqlite_db('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.db', field_list) 816 | # or with a dataset dumping to one file for each input 817 | dataset.dump_to_sqlite_db('/scratch/smapp/compile_trump_hillary_csvs/us_election_hillary_2016_data.db', field_list, parallel=True) 818 | ``` 819 | 820 | *input* a collection object and a list of fields/subfields 821 | ``` 822 | [ 823 | 'id_str', 824 | 'coordinates.coordinates.0', 825 | 'coordinates.coordinates.1', 826 | 'user.id_str', 827 | 'user.lang', 828 | 'lang', 829 | 'text', 830 | 'user.screen_name', 831 | 'user.location', 832 | 'user.description', 833 | 'created_at', 834 | 'user.friends_count', 835 | 'user.followers_count', 836 | 'retweet_count', 837 | 'entities.urls.0.expanded_url', 838 | 'entities.urls.1.expanded_url', 839 | 'entities.urls.2.expanded_url', 840 | 'entities.urls.3.expanded_url', 841 | 'entities.urls.4.expanded_url' 842 | ] 843 | ``` 844 | 845 | *output* an sqlite db that looks like so: 846 | ``` 847 | 
sqlite> .schema 848 | CREATE TABLE data (id_str,user__id_str,text,entities__urls__0__expanded_url,entities__urls__1__expanded_url,entities__media__0__expanded_url,entities__media__1__expanded_url); 849 | sqlite> .tables 850 | data 851 | sqlite> select * from data; 852 | 686799531875405824|491074580|@_tessr @ProductHunt No one has stolen me yet. Security through obscurity.|NULL|NULL|NULL|NULL 853 | 686661056115175425|491074580|Predictions of peach's demise already starting. Nice.|NULL|NULL|NULL|NULL 854 | 686956278099349506|491074580|When was the state of the union first started? Ok wow since the office has existed. https://t.co/Cqgjkhr3Aa|https://en.wikipedia.org/wiki/State_of_the_Union#History|NULL|NULL|NULL 855 | 687115788487122944|491074580|RT @lessig: Looks like the @citizenequality act got a supporter tonight. Thank you @POTUS|NULL|NULL|NULL|NULL 856 | 686661056115175425|491074580|Predictions of peach's demise already starting. Nice.|NULL|NULL|NULL|NULL 857 | 687008713039835136|491074580|#GOPDebate approaching. Can't wait to observer a trump in its natural habitat!|NULL|NULL|NULL|NULL 858 | 687208777561448448|18673945|@yvanscher hey! saw u upvoted Cubeit on ProductHunt. Any feedback on how we can make Cubeit better for you? :) Thanks!|NULL|NULL|NULL|NULL 859 | 686662539913084928|491074580|RT @PopSci: iOS 9.3 update will tint your screen at night, for your health https://t.co/zrDt4TsoXB https://t.co/yXCEGQPHWp|http://pops.ci/cJWqhM|NULL|http://twitter.com/PopSci/status/686661925267206144/photo/1|NULL 860 | ``` 861 | 862 | note: the dump to sqlite method does not have a num_files (used to paralel) argument because the performance is bad with the sample method. 863 | 864 | note: all file dumps happen in append mode. This means that if the file you are trying to dump to already exists it will append data into this file. So we recommend dumping to new files when you run dumps. 865 | 866 | # get_top_entities 867 | 868 | returns the top twitter entites from a tweet object, you can [read about twitter entities here](https://dev.twitter.com/overview/api/entities-in-twitter-objects) 869 | 870 | abstract: 871 | ```python 872 | collection.top_entities({'ENTITY_FIELD':NUMBER_OF_TOP_TERMS, 'ENTITY_FIELD':NUMBER_OF_TOP_TERMS, 'ENTITY_FIELD':NUMBER_OF_TOP_TERMS}) 873 | ``` 874 | 875 | practical: 876 | ```python 877 | collection.top_entities({'user_mentions':5, 'media':3, 'hashtags':5, 'urls':0, 'user_mentions':2, 'symbols':2}) 878 | # or 879 | collection.top_entities({'hashtags':5}) 880 | ``` 881 | 882 | *returns* a dictionary containing tho requested entities and the counts for each entity 883 | 884 | input: 885 | ```python 886 | print collection.top_entities({'user_mentions':5, 'media':3, 'hashtags':5}) 887 | ``` 888 | 889 | output: 890 | ``` 891 | { 892 | "hashtags": { 893 | "JadeHelm": 118, 894 | "pjnet": 26, 895 | "jadehelm": 111, 896 | "falseflag": 32, 897 | "2a": 26 898 | }, 899 | "user_mentions": { 900 | "1619936671": 41, 901 | "27234909": 56, 902 | "733417892": 121, 903 | "10228272": 75, 904 | "233498836": 58 905 | }, 906 | "media": { 907 | "https://t.co/ORaTXOM2oX": 55, 908 | "https://t.co/pAfigDPcNc": 27, 909 | "https://t.co/TH8TmGuYww": 24 910 | } 911 | } 912 | ``` 913 | 914 | *returns* a dictionary filled with the top terms you requested 915 | 916 | note: passing 0 to a field like `'hashtags':0` returns all the hashtags 917 | 918 | note: no support for extended entities, retweet entities, user entites, or direct message entities. 
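since the counts come back as a plain dictionary (see the output above), you can work with the result directly; a small sketch, assuming `collection` is any SmappCollection or SmappDataset:

```python
top = collection.top_entities({'hashtags': 5, 'urls': 5})

# entries that could not be filled come back as null/None (see the note below), so treat them as 0
for hashtag, count in sorted(top['hashtags'].items(), key=lambda kv: kv[1] or 0, reverse=True):
    print(hashtag, count)
```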
919 | 920 | note: if not enough entity objects are returned they get filled into the dictionary with null like so: 921 | 922 | ``` 923 | { 924 | "symbols": { 925 | "0": null, 926 | "1": null, 927 | "hould": 1 928 | } 929 | } 930 | ``` 931 | 932 | # get_top_hashtags 933 | 934 | get the top hashtags from a collection 935 | 936 | abstract: 937 | ```python 938 | collection.get_top_hashtags(NUMBER_TOP) 939 | ``` 940 | 941 | practical: 942 | ```python 943 | hashtags = collection.get_top_hashtags(5) 944 | print(hashtags) 945 | ``` 946 | 947 | *returns* the top hashtags as a dictionary 948 | 949 | # get_top_urls 950 | 951 | get the top urls from a collection 952 | 953 | abstract: 954 | ```python 955 | collection.get_top_urls(NUMBER_TOP) 956 | ``` 957 | 958 | practical: 959 | ```python 960 | urls = collection.get_top_urls(6) 961 | print(urls) 962 | ``` 963 | 964 | *returns* the top urls from a collection 965 | 966 | # get_top_mentions 967 | 968 | get the top mentions from a collection (these are @ mentions) 969 | 970 | abstract: 971 | ```python 972 | collection.get_top_mentions(NUMBER_TOP) 973 | ``` 974 | 975 | practical: 976 | ```python 977 | mentions = collection.get_top_mentions(40) 978 | ``` 979 | 980 | *returns* the top @ mentions from a collection 981 | 982 | # get_top_media 983 | 984 | get the top media url references 985 | 986 | abstract: 987 | ```python 988 | collection.get_top_media(NUMBER_TOP) 989 | ``` 990 | 991 | practical: 992 | ```python 993 | media = collection.get_top_media(3) 994 | print(media) 995 | ``` 996 | 997 | *returns* the top media urls from a collection 998 | 999 | # get_top_symbols 1000 | 1001 | get the top symbols in a collection 1002 | 1003 | abstract: 1004 | ```python 1005 | collection.get_top_symbols(NUMBER_TOP) 1006 | ``` 1007 | 1008 | practical: 1009 | ```python 1010 | symbols = collection.get_top_symbols(10) 1011 | print(symbols) 1012 | ``` 1013 | 1014 | *returns* the top symbols from a collection, the number of top symbols returned depends on how many you specified as input 1015 | 1016 | # contributors 1017 | 1018 | you might ask the difference between pysmap and smappdragon. pysmap is easier to use but less flexible/more rigid in its implementation. smappdragon is a flexible tool for programmers to use, you can build arbitrary filters for data, pysmap is just a set of filters. 1019 | 1020 | methods on smappdragon are lower level and more general. whereas methods on pysmap would be specific and rigid. so for example on smappdragon, you could [get all the entities](https://github.com/SMAPPNYU/smappdragon#top_entities), on pysmap you would have to ask for hashtags, mentions, etc. (which are all entities). 1021 | 1022 | another example, something like [apply_labels](https://github.com/SMAPPNYU/smapp-toolkit#apply_labels) would go on smappdragon, not pysmap. 1023 | 1024 | # viz 1025 | 1026 | a set of visualization tools, basically ways to graph and visualize a [SmappCollection](#smapp_collection) 1027 | 1028 | # plots 1029 | 1030 | a set of graph tools 1031 | 1032 | # bar_graph_tweet_field_grouped_by_period 1033 | 1034 | a tool that can be used to create generalized bar graphs from a smapp collection and various tweet data.
1035 | 1036 | abstract: 1037 | ```python 1038 | bar_graph_tweet_field_grouped_by_period(SMAPP_COLLECTION, TWEET_FIELD, TWEET_FIELD_VALUES_TO_MATCH, CUSTOM_FILTER_FUNCTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1039 | ``` 1040 | 1041 | practical: 1042 | ```python 1043 | from pysmap import SmappCollection, plots 1044 | 1045 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1046 | output_path = 'doc/output_graph.html' 1047 | 1048 | def custom_filter(tweet): 1049 | return True 1050 | 1051 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'user.lang', ['en', 'fr', 'es'], custom_filter, 'weeks', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1052 | ``` 1053 | 1054 | *returns* an html graph file and opens the graph in the default browser of the user 1055 | 1056 | # bar_graph_languages 1057 | 1058 | make a bar graph of the number of tweets containing the specified languages 1059 | 1060 | abstract: 1061 | ```python 1062 | bar_graph_languages(SMAPP_COLLECTION, LANGUAGES_TO_MATCH, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1063 | ``` 1064 | 1065 | practical: 1066 | ```python 1067 | from pysmap import SmappCollection, plots 1068 | 1069 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1070 | output_path = 'doc/output_graph.html' 1071 | 1072 | plots.bar_graph_languages(collection, ['en', 'fr', 'es'], 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1073 | ``` 1074 | 1075 | *returns* an html graph file and opens the graph in the default browser of the user 1076 | 1077 | # bar_graph_user_languages 1078 | 1079 | graph all the tweets where the users who made the tweets have one of the specified languages 1080 | 1081 | abstract: 1082 | ```python 1083 | bar_graph_user_languages(SMAPP_COLLECTION, LANGUAGES_TO_MATCH, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1084 | ``` 1085 | 1086 | practical: 1087 | ```python 1088 | from pysmap import SmappCollection, plots 1089 | 1090 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1091 | output_path = 'doc/output_graph.html' 1092 | 1093 | plots.bar_graph_user_languages(collection, ['en', 'fr', 'es'], 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1094 | ``` 1095 | 1096 | *returns* an html graph file and opens the graph in the default browser of the user 1097 | 1098 | # bar_graph_tweets 1099 | 1100 | graph all tweets per time period 1101 | 1102 | abstract: 1103 | ```python 1104 | bar_graph_tweets(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1105 | ``` 1106 | 1107 | practical: 1108 | ```python 1109 | from pysmap import SmappCollection, plots 1110 | 1111 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1112 | output_path = 'doc/output_graph.html' 1113 | 1114 | bar_graph_tweets(collection, period_type, start, end, output_path, 'time', 'tweet count', 'tweet count v time') 1115 | ``` 1116 | 1117 | *returns* an html graph file and opens the graph in the default browser of the user 1118 | 1119 | # bar_graph_tweets_with_urls 1120 | 1121 | graph all tweets that contain urls by time period 1122 | 1123 | abstract: 1124 | ```python 1125 | bar_graph_tweets_with_urls(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, 
X_LABEL, Y_LABEL, GRAPH_TITLE) 1126 | ``` 1127 | 1128 | practical: 1129 | ```python 1130 | from pysmap import SmappCollection, plots 1131 | 1132 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1133 | output_path = 'doc/output_graph.html' 1134 | 1135 | plots.bar_graph_tweets_with_urls(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1136 | ``` 1137 | 1138 | *returns* an html graph file and opens the graph in the default browser of the user 1139 | 1140 | # bar_graph_tweets_with_media 1141 | 1142 | graph all tweets that contain media (like images) by time period 1143 | 1144 | abstract: 1145 | ```python 1146 | bar_graph_tweets_with_media(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1147 | ``` 1148 | 1149 | practical: 1150 | ```python 1151 | from pysmap import SmappCollection, plots 1152 | 1153 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1154 | output_path = 'doc/output_graph.html' 1155 | 1156 | plots.bar_graph_tweets_with_media(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1157 | ``` 1158 | 1159 | *returns* an html graph file and opens the graph in the default browser of the user 1160 | 1161 | # bar_graph_tweets_with_mentions 1162 | 1163 | graph all tweets that contain user mentions by time period 1164 | 1165 | abstract: 1166 | ```python 1167 | bar_graph_tweets_with_mentions(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1168 | ``` 1169 | 1170 | practical: 1171 | ```python 1172 | from pysmap import SmappCollection, plots 1173 | 1174 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1175 | output_path = 'doc/output_graph.html' 1176 | 1177 | plots.bar_graph_tweets_with_mentions(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1178 | ``` 1179 | 1180 | *returns* an html graph file and opens the graph in the default browser of the user 1181 | 1182 | # bar_graph_tweets_with_hashtags 1183 | 1184 | graph all tweets that contain hashtags by time period 1185 | 1186 | abstract: 1187 | ```python 1188 | bar_graph_tweets_with_hashtags(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1189 | ``` 1190 | 1191 | practical: 1192 | ```python 1193 | from pysmap import SmappCollection, plots 1194 | 1195 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1196 | output_path = 'doc/output_graph.html' 1197 | 1198 | plots.bar_graph_tweets_with_hashtags(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1199 | ``` 1200 | 1201 | *returns* an html graph file and opens the graph in the default browser of the user 1202 | 1203 | # bar_graph_tweets_with_symbols 1204 | 1205 | graph all tweets that contain symbols (like stock tickers, $AAPL, $GOOG, $TWTR) by time period 1206 | 1207 | abstract: 1208 | ```python 1209 | bar_graph_tweets_with_symbols(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1210 | ``` 1211 | 1212 | practical: 1213 | ```python 1214 | from pysmap import SmappCollection, plots 1215 | 1216 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1217 | output_path = 'doc/output_graph.html' 1218 | 1219 | 
plots.bar_graph_tweets_with_symbols(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1220 | ``` 1221 | 1222 | *returns* an html graph file and opens the graph in the default browser of the user 1223 | 1224 | # bar_graph_tweets_with_retweets 1225 | 1226 | graph all tweets that are retweets by time period 1227 | 1228 | abstract: 1229 | ```python 1230 | bar_graph_tweets_with_retweets(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1231 | ``` 1232 | 1233 | practical: 1234 | ```python 1235 | from pysmap import SmappCollection, plots 1236 | 1237 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1238 | output_path = 'doc/output_graph.html' 1239 | 1240 | plots.bar_graph_tweets_with_retweets(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1241 | ``` 1242 | 1243 | *returns* an html graph file and opens the graph in the default browser of the user 1244 | 1245 | # bar_graph_tweets_with_location 1246 | 1247 | graph all tweets that have a location field attached to them 1248 | 1249 | abstract: 1250 | ```python 1251 | bar_graph_tweets_with_location(SMAPP_COLLECTION, SLICE_PERIOD, START_DATE, END_DATE, OUTPUT_FILE_PATH, X_LABEL, Y_LABEL, GRAPH_TITLE) 1252 | ``` 1253 | 1254 | practical: 1255 | ```python 1256 | from pysmap import SmappCollection, plots 1257 | 1258 | collection = SmappCollection('json', 'docs/tweet_collection.json') 1259 | output_path = 'doc/output_graph.html' 1260 | 1261 | plots.bar_graph_tweets_with_location(collection, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'time', 'tweet count', 'tweet count v time') 1262 | ``` 1263 | 1264 | *returns* an html graph file and opens the graph in the default browser of the user 1265 | 1266 | # networks 1267 | 1268 | code for making network graphs of twitter data 1269 | 1270 | # retweet_network 1271 | 1272 | export a retweet graph using the `networkx` library where users are nodes, retweets are directed edges. 1273 | 1274 | abstract: 1275 | ```python 1276 | import networkx as nx 1277 | from pysmap import networks 1278 | 1279 | digraph = networks.retweet_network(COLLECTION_OBJECT, TWEET_METADATA, USER_METADATA) 1280 | nx.write_graphml(digraph, '/path/where/you/want/your.graphml') 1281 | ``` 1282 | 1283 | practical: 1284 | ```python 1285 | import networkx as nx 1286 | from pysmap import networks 1287 | 1288 | tweet_fields = ['id_str', 'retweeted_status.id_str', 'timestamp', 'text', 'lang'] 1289 | user_fields = ['id_str', 'screen_name', 'location', 'description'] 1290 | 1291 | digraph = networks.retweet_network(collection, tweet_fields, user_fields) 1292 | nx.write_graphml(digraph, '~/smappdata/collection_retweets.graphml') 1293 | 1294 | # or omitting metadata (which saves space) 1295 | col = collection.get_tweets_containing('cats').get_retweets() 1296 | digraph = networks.retweet_network(col, [], []) 1297 | nx.write_graphml(digraph, '~/smappdata/collection_sparse_retweets.graphml') 1298 | ``` 1299 | 1300 | *input* 1301 | 1302 | `collection` - [smapp_dataset](#smapp_dataset) or [smapp_collection](#smapp_collection) 1303 | 1304 | `user_fields` - is a list of fields from the User object that will be included as attributes of the nodes. 1305 | 1306 | `tweet_fields` - is a list of the fields from the Tweet object that will be included as attributes of the edges. 
1307 | 1308 | *output* 1309 | 1310 | the resulting `.graphml` file can then be opened in graph analysis/visualization programs such as [Gephi](http://gephi.github.io/) or [Pajek](http://vlado.fmf.uni-lj.si/pub/networks/pajek/). 1311 | 1312 | note: if the collection result includes non-retweets as well, users with no retweets 1313 | will also appear in the graph as isolated nodes. only retweets are edges in the resulting graph. 1314 | 1315 | note: nodes and edges have attributes attached to them, which are customizable using the `user_fields` and `tweet_fields` arguments. 1316 | 1317 | note: for large graphs where the structure is interesting but the tweet text itself is not, it is advisable to omit most of the metadata. 1318 | 1319 | note: the `networkx` library also provides algorithms for [visualization](http://networkx.github.io/documentation/networkx-1.9.1/reference/drawing.html) and [analysis](http://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.html). 1320 | 1321 | note: there are no defaults, you have to specify the fields you want. 1322 | 1323 | 1324 | # models 1325 | 1326 | pretrained models for various tasks 1327 | 1328 | 1329 | # crowd_model 1330 | 1331 | a model for detecting crowds of people 1332 | 1333 | usage: 1334 | ```python 1335 | #downloads the model to this path and loads it 1336 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=True, talk=True) 1337 | # or just load the model from this path (default behavior) 1338 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=False, talk=False) 1339 | 1340 | # predict from filenames 1341 | files = ['img1.jpg', 'img2.jpg'] 1342 | preds = cm.predict_files(files) 1343 | 1344 | # or predict from image data (here i used opencv to read images) 1345 | imgs = np.zeros((len(files),224,224,3)) 1346 | for i, file in enumerate(files): 1347 | img = cv2.imread(file).astype('float64') 1348 | img = cv2.resize(img, (224,224)) 1349 | imgs[i] = img 1350 | cm = CrowdModel('/Users/yvan/Downloads/crowdv1.model', dl=False, talk=False) 1351 | preds = cm.predict_imgs(imgs) 1352 | ``` 1353 | 1354 | `dl` - whether or not the model class should download the model file (set to False by default, if the model path you give doesn't exist it will try to download anyway) 1355 | 1356 | `talk` - the class prints out what it's doing, set to True by default. 1357 | 1358 | note: images on disk will be resized to 224x224, if you put in your own image data it should be sized 224x224x3, when in doubt check the function's docstring with ?predict_imgs 1359 | 1360 | *input* 1361 | 1362 | a model path to download or an already downloaded model path, 1363 | 1364 | image file names or image data in a numpy array 1365 | 1366 | *output* 1367 | 1368 | probability of each image being a crowd 1369 | 1370 | # developer note '.' field splitting: 1371 | 1372 | there was a habit at the lab of creating one helper function that would take a tweet and a '.' delimited list of fields, split on this character to traverse into a json and save lots of coding time and lines of code. i wanted to leave a few lines here to explain why this is a bad idea in the context of the smapp lab: 1373 | 1374 | 1 - it makes code difficult to understand for grad students, we want them to be able to see exactly what a function does without needing to be a python expert. 1375 | 1376 | 2 - it causes problems if you want to traverse into a json object but one of the fields you want 3 levels in has a '.' as part of its name.
now twitter doesn't do this, but sometimes people change their data to csv, data gets messed up, or people want to use slightly different data. the tools should work for whatever people throw at them, not exclusively for twitter data. 1377 | 1378 | 3 - the obvious solution is to offer a function where the user can define a splitting character, but that would be confusing to read. in the end it would only save a few lines of code while reducing readability drastically, so i concluded to go another route. 1379 | 1380 | if you want a way to declare nested traversals see: [https://github.com/SMAPPNYU/smappdragon#set_filter](https://github.com/SMAPPNYU/smappdragon#set_filter) 1381 | 1382 | # developer note publishing: 1383 | 1384 | 1 - make a ~/.pypirc file with: 1385 | 1386 | [distutils] 1387 | index-servers = pypi 1388 | 1389 | [pypi] 1390 | repository: https://pypi.python.org/pypi 1391 | username: YOUR_PYPI_USERNAME 1392 | password: YOUR_PASSWORD 1393 | 1394 | 2 - pip install twine 1395 | 1396 | 3 - python setup.py sdist 1397 | 1398 | 4 - twine upload dist/* 1399 | 1400 | # author 1401 | 1402 | [yvan](https://github.com/yvan) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pysmap 2 | dependencies: 3 | - python=3.6 4 | - bokeh 5 | - pytz 6 | - pandas 7 | - pip: 8 | - smappdragon>=0.0.41 9 | - langdetect>=1.0.6 10 | - stop-words>=2015.2.23.1 11 | - networkx>=1.11 12 | - pymongo>=3.2.2 13 | - matplotlib>=2.0.0 14 | - keras>=2.0.8 15 | - opencv-python>=3.3.0.9 16 | - tensorflow>=1.3.0 17 | - h5py>=2.7.0 -------------------------------------------------------------------------------- /pysmap/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from pysmap.twitterutil.smapp_collection import SmappCollection 6 | from pysmap.twitterutil.smapp_dataset import SmappDataset 7 | from pysmap.viz import plots 8 | from pysmap.viz import networks -------------------------------------------------------------------------------- /pysmap/mltools/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module indicator for mltools 3 | ''' 4 | 5 | from . 
import smapp_model, crowd_model 6 | __all__ = ['smapp_model', 'crowd_model'] -------------------------------------------------------------------------------- /pysmap/mltools/crowd_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gc, abc, cv2, requests, os, shutil, gzip 3 | 4 | from pysmap.mltools.smapp_model import SmappModel 5 | from keras.models import load_model 6 | from keras.applications.resnet50 import preprocess_input 7 | 8 | def download_file(url, local_url): 9 | r = requests.get(url, stream=True) 10 | with open(local_url, 'wb') as f: 11 | shutil.copyfileobj(r.raw, f) 12 | 13 | def unzip_file(local_url, model_path): 14 | with gzip.open(local_url, 'rb') as fin: 15 | with open(model_path, 'wb') as fout: 16 | shutil.copyfileobj(fin, fout) 17 | 18 | class CrowdModel(SmappModel): 19 | __metaclass__ = abc.ABCMeta 20 | 21 | def __init__(self, model_path, model_dl='http://165.227.83.131:82/', dl=False, talk=True): 22 | if dl or not os.path.exists(model_path): 23 | url = os.path.join(model_dl,'crowdv1.h5.gz') 24 | local_url = os.path.join('/'.join(model_path.split('/')[:-1]),'crowdv1.h5.gz') 25 | if talk: print('downloading model file to: {}'.format(local_url)) 26 | download_file(url, local_url) 27 | unzip_file(local_url, model_path) 28 | if talk: print('downloaded model file to: {}'.format(model_path)) 29 | if talk: print('loading model from from: {}'.format(model_path)) 30 | self.model = load_model(model_path) 31 | 32 | def predict_imgs(self, imgs): 33 | ''' 34 | takes an image input and predicts on it 35 | this expects an ndarray (heightxwidthxchannels) 36 | this model shouldbe a (Nx224x224x3) numpy array 37 | this method it noce if you want to do preprocessing 38 | then predict results on those preprocessed images 39 | this function expects the image array to be jpg 40 | ''' 41 | imgs = preprocess_input(imgs) 42 | return self.model.predict(imgs) 43 | 44 | def predict_files(self, files): 45 | ''' 46 | reads files off disk, resizes them 47 | and then predicts them, files should 48 | be a list or itrerable of file paths 49 | that lead to images, they are then 50 | loaded with opencv, resized, and predicted 51 | ''' 52 | imgs = [0]*len(files) 53 | for i, file in enumerate(files): 54 | img = cv2.imread(file).astype('float64') 55 | img = cv2.resize(img, (224,224)) 56 | img = preprocess_input(img) 57 | if img is None: 58 | print('failed to open: {}, continuing...'.format(file)) 59 | imgs[i] = img 60 | return self.model.predict(np.array(imgs)) 61 | 62 | def view_predictions(imgs, y_pred, y_true, start, end): 63 | ''' 64 | displays the images in a grid formation from the 65 | start index to the end index, y_true are the true 66 | labels for the images, y_pred should be your predictions 67 | imgs should be an (NxWxHx3) array of your input images 68 | 69 | ''' 70 | fig, ax = plt.subplots(16, 4, sharex='col', sharey='row', figsize=(25, 100)) 71 | for i, img in enumerate(imgs[start:end]): 72 | pred_label = y_pred[start:end][i] 73 | actual_label = y_true[i] 74 | ax[i//4][i%4].imshow(img) 75 | ax[i//4][i%4].annotate(pred_label[0], 76 | (0,0), (0, -32), xycoords='axes fraction', 77 | textcoords='offset points', va='top', size=20) 78 | ax[i//4][i%4].annotate(actual_label, 79 | (0,0), (200, -32), xycoords='axes fraction', 80 | textcoords='offset points', va='top', size=20) 81 | ax[i//4][i%4].axis('off') 82 | return ax 83 | -------------------------------------------------------------------------------- 
/pysmap/mltools/smapp_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this is the base loader class 3 | it will handle all the nitty gritty 4 | of dealing with models, while 5 | individual classes will just 6 | ''' 7 | 8 | import gc 9 | import abc 10 | 11 | class SmappModel(object): 12 | __metaclass__ = abc.ABCMeta 13 | 14 | @abc.abstractmethod 15 | def __init__(self, model_path): 16 | pass 17 | 18 | def delete(objs): 19 | ''' 20 | deallocates the memory occupide by this model 21 | added as convinienve function to make ti easier 22 | to avoid running out of ram when working with 23 | several models at once, usage: delete(model1, model2) 24 | ''' 25 | for obj in objs: del obj 26 | return gc.collect() 27 | 28 | -------------------------------------------------------------------------------- /pysmap/twitterutil/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from . import smapp_collection, smapp_dataset 6 | __all__ = ['smapp_collection', 'smapp_dataset'] -------------------------------------------------------------------------------- /pysmap/twitterutil/smapp_collection.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import abc 3 | import copy 4 | import random 5 | import sqlite3 6 | import operator 7 | import itertools 8 | import smappdragon 9 | 10 | from datetime import datetime 11 | from bson import BSON, json_util 12 | from stop_words import get_stop_words 13 | from langdetect import detect, lang_detect_exception, DetectorFactory 14 | 15 | class SmappCollection(object): 16 | def __init__(self, data_source_type, *args): 17 | if data_source_type == 'bson': 18 | self.collection = smappdragon.BsonCollection(args[0]) 19 | elif data_source_type == 'json': 20 | self.collection = smappdragon.JsonCollection(args[0]) 21 | elif data_source_type == 'csv': 22 | self.collection = smappdragon.CsvCollection(args[0]) 23 | elif data_source_type == 'mongo': 24 | self.collection = smappdragon.MongoCollection( 25 | args[0], 26 | args[1], 27 | args[2], 28 | args[3], 29 | args[4], 30 | args[5] 31 | ) 32 | else: 33 | raise IOError('Could not find your input, it\'s mispelled or doesn\'t exist.') 34 | 35 | def __iter__(self): 36 | for tweet in self.collection.get_iterator(): 37 | yield tweet 38 | 39 | def set_custom_filter(self, custom_filter): 40 | cp = copy.deepcopy(self) 41 | cp.collection.set_custom_filter(custom_filter) 42 | return cp 43 | 44 | def get_tweet_texts(self): 45 | for tweet in self.collection.get_iterator(): 46 | yield tweet['text'] 47 | 48 | def count_tweets(self): 49 | return sum(1 for tweet in self.collection.get_iterator()) 50 | 51 | def count_tweet_terms(self, *args): 52 | def tweet_contains_terms(tweet): 53 | return any([term in tweet['text'] for term in args]) 54 | cp = copy.deepcopy(self) 55 | return sum(1 for tweet in cp.collection.set_custom_filter(tweet_contains_terms).get_iterator()) 56 | 57 | def get_tweets_containing(self, *args): 58 | def tweet_contains_terms(tweet): 59 | return any([term in tweet['text'] for term in args]) 60 | cp = copy.deepcopy(self) 61 | cp.collection.set_custom_filter(tweet_contains_terms) 62 | return cp 63 | 64 | def get_date_range(self, start, end): 65 | if type(start) is not datetime or type(end) is not datetime: 66 | raise ValueError('inputs to date_range must be python datetime.date objects') 67 | def tweet_is_in_date_range(tweet): 68 | return 
(datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') >= start) and (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') < end) 69 | cp = copy.deepcopy(self) 70 | cp.collection.set_custom_filter(tweet_is_in_date_range) 71 | return cp 72 | 73 | def find_date_range(self): 74 | date_min = datetime.max 75 | date_max = datetime.min 76 | for tweet in self.collection.get_iterator(): 77 | date_to_process = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') 78 | if date_to_process <= date_min: 79 | date_min = date_to_process 80 | if date_to_process >= date_max: 81 | date_max = date_to_process 82 | return {"date_min":date_min,"date_max":date_max} 83 | 84 | def tweet_language_is(self, *args): 85 | def language_in_tweet(tweet): 86 | return any(['lang' in tweet and language_code in tweet['lang'] for language_code in args]) 87 | cp = copy.deepcopy(self) 88 | cp.collection.set_custom_filter(language_in_tweet) 89 | return cp 90 | 91 | def detect_tweet_language(self, *args): 92 | DetectorFactory.seed = 0 93 | def language_in_tweet(tweet): 94 | detected_lang = None 95 | try: 96 | detected_lang = detect(tweet['text']) 97 | except lang_detect_exception.LangDetectException: 98 | pass 99 | return any([detected_lang in args]) 100 | cp = copy.deepcopy(self) 101 | cp.collection.set_custom_filter(language_in_tweet) 102 | return cp 103 | 104 | def user_language_is(self, *args): 105 | def language_in_tweet(tweet): 106 | return any([language_code in tweet['user']['lang'] for language_code in args]) 107 | cp = copy.deepcopy(self) 108 | cp.collection.set_custom_filter(language_in_tweet) 109 | return cp 110 | 111 | def exclude_retweets(self): 112 | def tweet_is_not_retweet(tweet): 113 | return 'retweeted_status' not in tweet 114 | cp = copy.deepcopy(self) 115 | cp.collection.set_custom_filter(tweet_is_not_retweet) 116 | return cp 117 | 118 | def get_retweets(self): 119 | def tweet_is_retweet(tweet): 120 | return 'retweeted_status' in tweet 121 | cp = copy.deepcopy(self) 122 | cp.collection.set_custom_filter(tweet_is_retweet) 123 | return cp 124 | 125 | def user_location_contains(self, *args): 126 | def user_has_location(tweet): 127 | return tweet['user']['location'] and any([place_term in tweet['user']['location'] for place_term in args]) 128 | cp = copy.deepcopy(self) 129 | cp.collection.set_custom_filter(user_has_location) 130 | return cp 131 | 132 | def user_description_contains(self, *args): 133 | def user_description_contains_terms(tweet): 134 | return tweet['user']['description'] and any([d_term in tweet['user']['description'] for d_term in args]) 135 | cp = copy.deepcopy(self) 136 | cp.collection.set_custom_filter(user_description_contains_terms) 137 | return cp 138 | 139 | def user_id_is(self, *args): 140 | def user_id_created_tweet(tweet): 141 | return tweet['user']['id'] and any([u_id == tweet['user']['id'] for u_id in args]) 142 | cp = copy.deepcopy(self) 143 | cp.collection.set_custom_filter(user_id_created_tweet) 144 | return cp 145 | 146 | def get_geo_enabled(self): 147 | def geo_enabled_filter(tweet): 148 | return ("coordinates" in tweet 149 | and tweet["coordinates"] is not None 150 | and "coordinates" in tweet["coordinates"]) 151 | cp = copy.deepcopy(self) 152 | cp.collection.set_custom_filter(geo_enabled_filter) 153 | return cp 154 | 155 | def get_non_geo_enabled(self): 156 | def non_geo_enabled_filter(tweet): 157 | return ('coordinates' not in tweet or 158 | tweet['coordinates'] is None or 159 | 'coordinates' not in tweet['coordinates']) 160 | cp = 
copy.deepcopy(self) 161 | cp.collection.set_custom_filter(non_geo_enabled_filter) 162 | return cp 163 | 164 | def within_geobox(self, sw_lon, sw_lat, ne_lon, ne_lat): 165 | def tweet_is_in_geobox(tweet): 166 | if tweet['coordinates'] and tweet['coordinates']['coordinates']: 167 | coords = tweet['coordinates']['coordinates'] 168 | return coords[0] > float(sw_lon) and coords[0] < float(ne_lon) and coords[1] > float(sw_lat) and coords[1] < float(ne_lat) 169 | return False 170 | cp = copy.deepcopy(self) 171 | cp.collection.set_custom_filter(tweet_is_in_geobox) 172 | return cp 173 | 174 | def place_name_contains_country(self, *args): 175 | def place_name_contains_terms(tweet): 176 | return tweet['place'] and any([d_term in tweet['place']['country'] for d_term in args]) 177 | cp = copy.deepcopy(self) 178 | cp.collection.set_custom_filter(place_name_contains_terms) 179 | return cp 180 | 181 | def get_top_entities(self, requested_entities): 182 | returndict = {} 183 | returnstructure = {} 184 | tweet_parser = smappdragon.TweetParser() 185 | #init dempty dict for all entity types 186 | for entity_type in requested_entities: 187 | returndict[entity_type] = {} 188 | 189 | for tweet in self.collection.get_iterator(): 190 | for entity_type in requested_entities: 191 | for entity in tweet_parser.get_entity(entity_type, tweet): 192 | if entity_type == 'user_mentions': 193 | entity_value = tweet_parser.get_entity_field('id_str', entity) 194 | elif entity_type == 'hashtags' or entity_type == 'symbols': 195 | entity_value = tweet_parser.get_entity_field('text', entity) 196 | else: 197 | entity_value = tweet_parser.get_entity_field('url', entity) 198 | 199 | if entity_value in returndict[entity_type]: 200 | returndict[entity_type][entity_value] += 1 201 | else: 202 | returndict[entity_type][entity_value] = 1 203 | 204 | for entity_type in returndict: 205 | returnstructure[entity_type] = {} 206 | if len(returndict[entity_type]) > 0: 207 | sorted_list = sorted(returndict[entity_type].items(), key=operator.itemgetter(1), reverse=True) 208 | # if the user put in 0 return all entites 209 | # otherwise slice the array and return the 210 | # number of top things they asked for 211 | # if the list is too short throw in None 212 | if requested_entities[entity_type] == 0: 213 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 214 | elif len(sorted_list) < requested_entities[entity_type]: 215 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 216 | for i in range(0, requested_entities[entity_type]-len(sorted_list)): 217 | returnstructure[entity_type][i] = None 218 | else: 219 | returnstructure[entity_type] = { \ 220 | name: count for name, count in sorted_list[0:requested_entities[entity_type]] \ 221 | } 222 | return returnstructure 223 | 224 | def limit_number_of_tweets(self, limit): 225 | cp = copy.deepcopy(self) 226 | cp.collection.set_limit(limit) 227 | return cp 228 | 229 | def dump_to_bson(self, output_file): 230 | filehandle = open(output_file, 'ab+') 231 | for tweet in self.collection.get_iterator(): 232 | filehandle.write(BSON.encode(tweet)) 233 | filehandle.close() 234 | 235 | def dump_to_json(self, output_file): 236 | filehandle = open(output_file, 'a') 237 | for tweet in self.collection.get_iterator(): 238 | filehandle.write(json_util.dumps(tweet)+'\n') 239 | filehandle.close() 240 | 241 | def dump_to_csv(self, output_file, input_fields, write_header=True, top_level=False): 242 | filehandle = open(output_file, 'a', encoding='utf-8') 243 | writer = 
csv.writer(filehandle) 244 | if write_header: 245 | writer.writerow(input_fields) 246 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 247 | 248 | for tweet in self.collection.get_iterator(): 249 | if top_level: 250 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 251 | else: 252 | ret = tweet_parser.parse_columns_from_tweet(tweet,input_fields) 253 | ret_values = [col_val[1] for col_val in ret] 254 | writer.writerow(ret_values) 255 | filehandle.close() 256 | 257 | def dump_to_sqlite_db(self, output_file, input_fields, top_level=False): 258 | def replace_none(s): 259 | if s is None: 260 | return 'NULL' 261 | return s 262 | 263 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 264 | column_str = ','.join([column for column in input_fields]).replace('.','__') 265 | question_marks = ','.join(['?' for column in input_fields]) 266 | 267 | con = sqlite3.connect(output_file) 268 | cur = con.cursor() 269 | cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str)) 270 | 271 | insert_list = [] 272 | # batch insert if more than 10k tweets 273 | for tweet in self.collection.get_iterator(): 274 | if top_level: 275 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 276 | else: 277 | ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields) 278 | row = [replace_none(col_val[1]) for col_val in ret] 279 | insert_list.append(tuple(row)) 280 | if (len(insert_list) % 10000) == 0: 281 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 282 | con.commit() 283 | insert_list = [] 284 | if len(insert_list) < 10000: 285 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 286 | con.commit() 287 | con.close() 288 | 289 | def get_top_hashtags(self, num_top): 290 | return self.get_top_entities({'hashtags':num_top}) 291 | 292 | def get_top_urls(self, num_top): 293 | return self.get_top_entities({'urls':num_top}) 294 | 295 | def get_top_mentions(self, num_top): 296 | return self.get_top_entities({'user_mentions':num_top}) 297 | 298 | def get_top_media(self, num_top): 299 | return self.get_top_entities({'media':num_top}) 300 | 301 | def get_top_symbols(self, num_top): 302 | return self.get_top_entities({'symbols':num_top}) 303 | 304 | def get_top_terms(self, num_top, stop_words=None): 305 | term_counts = {} 306 | if not stop_words: 307 | stop_words = get_stop_words('en') 308 | for tweet in self.collection.get_iterator(): 309 | split_tweet = tweet['text'].split() 310 | for tweet_token in split_tweet: 311 | if tweet_token not in stop_words: 312 | term_counts[tweet_token] = 0 if tweet_token not in term_counts else term_counts[tweet_token]+1 313 | sorted_counts = sorted(term_counts.items(), key=operator.itemgetter(1), reverse=True)[:num_top] 314 | return_counts = {} 315 | for k, v in sorted_counts: 316 | return_counts[k] = v 317 | return return_counts 318 | 319 | def sample(self, k): 320 | ''' 321 | this method is especially troublesome 322 | i do not reccommend making any changes to it 323 | you may notice it uplicates code fro smappdragon 324 | there is no way around this as far as i can tell 325 | it really might screw up a lot of stuff, stip tweets 326 | has been purposely omitted as it isnt supported in pysmap 327 | ''' 328 | def new_get_iterator(): 329 | tweet_parser = smappdragon.TweetParser() 330 | it = iter(self.collection.get_iterator()) 331 | sample = list(itertools.islice(it, k)) 332 | random.shuffle(sample) 333 
| for i, item in enumerate(it, start=k+1): 334 | j = random.randrange(i) 335 | if j < k: 336 | sample[j] = item 337 | for tweet in sample: 338 | if self.collection.limit != 0 and self.collection.limit <= count: 339 | return 340 | elif tweet_parser.tweet_passes_filter(self.collection.filter, tweet) \ 341 | and tweet_parser.tweet_passes_custom_filter_list(self.collection.custom_filters, tweet): 342 | yield tweet 343 | cp = copy.deepcopy(self) 344 | cp.collection.get_iterator = new_get_iterator 345 | return cp 346 | 347 | ''' 348 | author @yvan 349 | for a lower level set of tools see: https://github.com/SMAPPNYU/smappdragon 350 | ''' -------------------------------------------------------------------------------- /pysmap/twitterutil/smapp_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import csv 4 | import abc 5 | import copy 6 | import glob 7 | import random 8 | import sqlite3 9 | import pymongo 10 | import operator 11 | import itertools 12 | import smappdragon 13 | 14 | from datetime import datetime 15 | from bson import BSON, json_util 16 | from pysmap.twitterutil.smapp_collection import SmappCollection 17 | from langdetect import detect, lang_detect_exception, DetectorFactory 18 | from stop_words import get_stop_words 19 | 20 | class SmappDataset(object): 21 | def __init__(self, *args, **kwargs): 22 | input_servers_ports = {} 23 | self.collections = [] 24 | for input_list_or_datasource in args: 25 | if type(input_list_or_datasource) is SmappCollection: 26 | self.collections.append(input_list_or_datasource.collection) 27 | elif type(input_list_or_datasource) is type(self): 28 | self.collections.extend(input_list_or_datasource.collections) 29 | else: 30 | if input_list_or_datasource[0] == 'bson': 31 | if 'file_pattern' == input_list_or_datasource[1]: 32 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 33 | self.collections.append(smappdragon.BsonCollection(path)) 34 | else: 35 | self.collections.append(smappdragon.BsonCollection(input_list_or_datasource[1])) 36 | elif input_list_or_datasource[0] == 'json': 37 | if 'file_pattern' == input_list_or_datasource[1]: 38 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 39 | self.collections.append(smappdragon.JsonCollection(path)) 40 | else: 41 | self.collections.append(smappdragon.JsonCollection(input_list_or_datasource[1])) 42 | elif input_list_or_datasource[0] == 'csv': 43 | if 'file_pattern' == input_list_or_datasource[1]: 44 | for path in glob.glob(os.path.expanduser(input_list_or_datasource[2])): 45 | self.collections.append(smappdragon.CsvCollection(path)) 46 | else: 47 | self.collections.append(smappdragon.CsvCollection(input_list_or_datasource[1])) 48 | elif input_list_or_datasource[0] == 'mongo': 49 | host_port_key = input_list_or_datasource[1]+str(input_list_or_datasource[2]) 50 | if host_port_key not in input_servers_ports: 51 | new_connection = pymongo.MongoClient(input_list_or_datasource[1], int(input_list_or_datasource[2])) 52 | input_servers_ports[host_port_key] = new_connection 53 | if 'database_regex' in kwargs or 'collection_regex' in kwargs: 54 | mongo = pymongo.MongoClient(input_list_or_datasource[1], int(input_list_or_datasource[2])) 55 | if 'database_regex' in kwargs: 56 | db_regex = re.compile(kwargs['database_regex']) 57 | matched_dbs = [match.group(1) for db_name in mongo.database_names() for match in [db_regex.search(db_name)] if match] 58 | else: 59 | matched_dbs = [input_list_or_datasource[5]] 
60 | 61 | for matched_db in matched_dbs: 62 | if 'collection_regex' in kwargs: 63 | collection_regex = re.compile(kwargs['collection_regex']) 64 | matched_collections = [match.group(1) for collection_name in mongo[matched_db].collection_names() for match in [collection_regex.search(collection_name)] if match] 65 | else: 66 | if len(input_list_or_datasource) > 6: 67 | matched_collections = [input_list_or_datasource[6]] 68 | else: 69 | matched_collections = [input_list_or_datasource[5]] 70 | for matched_collection in matched_collections: 71 | self.collections.append(smappdragon.MongoCollection( 72 | input_list_or_datasource[3], 73 | input_list_or_datasource[4], 74 | matched_db, 75 | matched_collection, 76 | passed_mongo=input_servers_ports[input_list_or_datasource[1]+str(input_list_or_datasource[2])] 77 | )) 78 | else: 79 | self.collections.append(smappdragon.MongoCollection( 80 | input_list_or_datasource[3], 81 | input_list_or_datasource[4], 82 | input_list_or_datasource[5], 83 | input_list_or_datasource[6], 84 | passed_mongo=input_servers_ports[input_list_or_datasource[1]+str(input_list_or_datasource[2])] 85 | )) 86 | else: 87 | raise IOError('Could not find your input: {}, it\'s mispelled or doesn\'t exist.'.format(input_list_or_datasource)) 88 | 89 | # simple helper method for getting the iterators out 90 | # of all collections in a SmappDataset, sample overrides 91 | # this method 92 | def get_collection_iterators(self): 93 | return itertools.chain(*[collection.get_iterator() for collection in self.collections]) 94 | 95 | # helper applies filters to all collections in dataset 96 | def apply_filter_to_collections(self, filter_to_set): 97 | self.collections = [collection.set_custom_filter(filter_to_set) for collection in self.collections] 98 | 99 | def __iter__(self): 100 | for tweet in self.get_collection_iterators(): 101 | yield tweet 102 | 103 | def set_custom_filter(self, custom_filter): 104 | cp = copy.deepcopy(self) 105 | cp.apply_filter_to_collections(custom_filter) 106 | return cp 107 | 108 | def get_tweet_texts(self): 109 | for tweet in self.get_collection_iterators(): 110 | yield tweet['text'] 111 | 112 | def count_tweets(self): 113 | return sum(1 for tweet in self.get_collection_iterators()) 114 | 115 | def count_tweet_terms(self, *args): 116 | def tweet_contains_terms(tweet): 117 | return any([term in tweet['text'] for term in args]) 118 | cp = copy.deepcopy(self) 119 | cp.apply_filter_to_collections(tweet_contains_terms) 120 | return sum(1 for tweet in cp.get_collection_iterators()) 121 | 122 | def get_tweets_containing(self, *args): 123 | def tweet_contains_terms(tweet): 124 | return any([term in tweet['text'] for term in args]) 125 | cp = copy.deepcopy(self) 126 | cp.apply_filter_to_collections(tweet_contains_terms) 127 | return cp 128 | 129 | def get_date_range(self, start, end): 130 | if type(start) is not datetime or type(end) is not datetime: 131 | raise ValueError('inputs to date_range must be python datetime.date objects') 132 | def tweet_is_in_date_range(tweet): 133 | return (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') >= start) and (datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') < end) 134 | cp = copy.deepcopy(self) 135 | cp.apply_filter_to_collections(tweet_is_in_date_range) 136 | return cp 137 | 138 | def find_date_range(self): 139 | date_min = datetime.max 140 | date_max = datetime.min 141 | for tweet in self.get_collection_iterators(): 142 | date_to_process = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S 
+0000 %Y') 143 | if date_to_process <= date_min: 144 | date_min = date_to_process 145 | if date_to_process >= date_max: 146 | date_max = date_to_process 147 | return {"date_min":date_min,"date_max":date_max} 148 | 149 | def tweet_language_is(self, *args): 150 | def language_in_tweet(tweet): 151 | return any([language_code in tweet['lang'] for language_code in args]) 152 | cp = copy.deepcopy(self) 153 | cp.apply_filter_to_collections(language_in_tweet) 154 | return cp 155 | 156 | def detect_tweet_language(self, *args): 157 | DetectorFactory.seed = 0 158 | def language_in_tweet(tweet): 159 | detected_lang = None 160 | try: 161 | detected_lang = detect(tweet['text']) 162 | except lang_detect_exception.LangDetectException: 163 | pass 164 | return any([detected_lang in args]) 165 | cp = copy.deepcopy(self) 166 | cp.apply_filter_to_collections(language_in_tweet) 167 | return cp 168 | 169 | def user_language_is(self, *args): 170 | def language_in_tweet(tweet): 171 | return any([language_code in tweet['user']['lang'] for language_code in args]) 172 | cp = copy.deepcopy(self) 173 | cp.apply_filter_to_collections(language_in_tweet) 174 | return cp 175 | 176 | def exclude_retweets(self): 177 | def tweet_is_not_retweet(tweet): 178 | return 'retweeted_status' not in tweet 179 | cp = copy.deepcopy(self) 180 | cp.apply_filter_to_collections(tweet_is_not_retweet) 181 | return cp 182 | 183 | def get_retweets(self): 184 | def tweet_is_retweet(tweet): 185 | return 'retweeted_status' in tweet 186 | cp = copy.deepcopy(self) 187 | cp.apply_filter_to_collections(tweet_is_retweet) 188 | return cp 189 | 190 | def user_location_contains(self, *args): 191 | def user_has_location(tweet): 192 | return tweet['user']['location'] and any([place_term in tweet['user']['location'] for place_term in args]) 193 | cp = copy.deepcopy(self) 194 | cp.apply_filter_to_collections(user_has_location) 195 | return cp 196 | 197 | def user_description_contains(self, *args): 198 | def user_description_contains_terms(tweet): 199 | return tweet['user']['description'] and any([d_term in tweet['user']['description'] for d_term in args]) 200 | cp = copy.deepcopy(self) 201 | cp.apply_filter_to_collections(user_description_contains_terms) 202 | return cp 203 | 204 | def user_id_is(self, *args): 205 | def user_id_created_tweet(tweet): 206 | return tweet['user']['id'] and any([u_id == tweet['user']['id'] for u_id in args]) 207 | cp = copy.deepcopy(self) 208 | cp.apply_filter_to_collections(user_id_created_tweet) 209 | return cp 210 | 211 | def get_geo_enabled(self): 212 | def geo_enabled_filter(tweet): 213 | return ("coordinates" in tweet 214 | and tweet["coordinates"] is not None 215 | and "coordinates" in tweet["coordinates"]) 216 | cp = copy.deepcopy(self) 217 | cp.apply_filter_to_collections(geo_enabled_filter) 218 | return cp 219 | 220 | def get_non_geo_enabled(self): 221 | def non_geo_enabled_filter(tweet): 222 | return ('coordinates' not in tweet or 223 | tweet['coordinates'] is None or 224 | 'coordinates' not in tweet['coordinates']) 225 | cp = copy.deepcopy(self) 226 | cp.apply_filter_to_collections(non_geo_enabled_filter) 227 | return cp 228 | 229 | def within_geobox(self, sw_lon, sw_lat, ne_lon, ne_lat): 230 | def tweet_is_in_geobox(tweet): 231 | if tweet['coordinates'] and tweet['coordinates']['coordinates']: 232 | coords = tweet['coordinates']['coordinates'] 233 | return coords[0] > float(sw_lon) and coords[0] < float(ne_lon) and coords[1] > float(sw_lat) and coords[1] < float(ne_lat) 234 | return False 235 | cp = copy.deepcopy(self) 
236 | cp.apply_filter_to_collections(tweet_is_in_geobox) 237 | return cp 238 | 239 | def place_name_contains_country(self, *args): 240 | def place_name_contains_terms(tweet): 241 | return tweet['place'] and any([d_term in tweet['place']['country'] for d_term in args]) 242 | cp = copy.deepcopy(self) 243 | cp.apply_filter_to_collections(place_name_contains_terms) 244 | return cp 245 | 246 | 247 | def get_top_entities(self, requested_entities): 248 | returndict = {} 249 | returnstructure = {} 250 | tweet_parser = smappdragon.TweetParser() 251 | #init dempty dict for all entity types 252 | for entity_type in requested_entities: 253 | returndict[entity_type] = {} 254 | 255 | for tweet in self.get_collection_iterators(): 256 | for entity_type in requested_entities: 257 | for entity in tweet_parser.get_entity(entity_type, tweet): 258 | if entity_type == 'user_mentions': 259 | entity_value = tweet_parser.get_entity_field('id_str', entity) 260 | elif entity_type == 'hashtags' or entity_type == 'symbols': 261 | entity_value = tweet_parser.get_entity_field('text', entity) 262 | else: 263 | entity_value = tweet_parser.get_entity_field('url', entity) 264 | 265 | if entity_value in returndict[entity_type]: 266 | returndict[entity_type][entity_value] += 1 267 | else: 268 | returndict[entity_type][entity_value] = 1 269 | 270 | for entity_type in returndict: 271 | returnstructure[entity_type] = {} 272 | if len(returndict[entity_type]) > 0: 273 | sorted_list = sorted(returndict[entity_type].items(), key=operator.itemgetter(1), reverse=True) 274 | # if the user put in 0 return all entites 275 | # otherwise slice the array and return the 276 | # number of top things they asked for 277 | # if the list is too short throw in None 278 | if requested_entities[entity_type] == 0: 279 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 280 | elif len(sorted_list) < requested_entities[entity_type]: 281 | returnstructure[entity_type] = {name: count for name, count in sorted_list} 282 | for i in range(0, requested_entities[entity_type]-len(sorted_list)): 283 | returnstructure[entity_type][i] = None 284 | else: 285 | returnstructure[entity_type] = { \ 286 | name: count for name, count in sorted_list[0:requested_entities[entity_type]] \ 287 | } 288 | return returnstructure 289 | 290 | def limit_number_of_tweets(self, limit): 291 | cp = copy.deepcopy(self) 292 | cp.collections = [collection.set_limit(limit) for collection in cp.collections] 293 | return cp 294 | 295 | def dump_to_bson(self, output_file, num_files=1): 296 | filehandles = [None]*num_files 297 | filename, file_extension = output_file.split(os.extsep, 1) 298 | 299 | # open all filehandles 300 | if num_files == 1: 301 | filehandles[0] = open(output_file, 'ab+') 302 | else: 303 | # open all filehandles 304 | filename, file_extension = output_file.split(os.extsep, 1) 305 | for i in range(num_files): 306 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'ab+') 307 | 308 | # write the tweets as evenly 309 | # as possible in each file 310 | tracker = 0 311 | for tweet in self.get_collection_iterators(): 312 | filehandles[tracker].write(BSON.encode(tweet)) 313 | if tracker == num_files-1: 314 | tracker = 0 315 | else: 316 | tracker += 1 317 | 318 | # close all filehandles 319 | for fh in filehandles: 320 | fh.close() 321 | 322 | def dump_to_json(self, output_file, num_files=1): 323 | filehandles = [None]*num_files 324 | 325 | if num_files == 1: 326 | filehandles[0] = open(output_file, 'a') 327 | else: 328 | # open all 
filehandles 329 | filename, file_extension = output_file.split(os.extsep, 1) 330 | for i in range(num_files): 331 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'a') 332 | 333 | # write the tweets as evenly 334 | # as possible in each file 335 | tracker = 0 336 | for tweet in self.get_collection_iterators(): 337 | filehandles[tracker].write(json_util.dumps(tweet)+'\n') 338 | if tracker == num_files-1: 339 | tracker = 0 340 | else: 341 | tracker += 1 342 | 343 | # close all filehandles 344 | for fh in filehandles: 345 | fh.close() 346 | 347 | def dump_to_csv(self, output_file, input_fields, write_header=True, top_level=False, num_files=1): 348 | filehandles = [None]*num_files 349 | writers = [None]*num_files 350 | 351 | if num_files == 1: 352 | filehandles[0] = open(output_file, 'a', encoding='utf-8') 353 | writers[0] = csv.writer(filehandles[0]) 354 | if write_header: 355 | writers[0].writerow(input_fields) 356 | else: 357 | # open all filehandles 358 | filename, file_extension = output_file.split(os.extsep, 1) 359 | for i in range(num_files): 360 | filehandles[i] = open('{}_{}.{}'.format(filename, i, file_extension), 'a') 361 | writers[i] = csv.writer(filehandles[i]) 362 | if write_header: 363 | writers[i].writerow(input_fields) 364 | 365 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 366 | 367 | # write the tweets as evenly 368 | # as possible in each file 369 | tracker = 0 370 | for tweet in self.get_collection_iterators(): 371 | if top_level: 372 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 373 | else: 374 | ret = tweet_parser.parse_columns_from_tweet(tweet,input_fields) 375 | ret_values = [col_val[1] for col_val in ret] 376 | writers[tracker].writerow(ret_values) 377 | 378 | if tracker == num_files-1: 379 | tracker = 0 380 | else: 381 | tracker += 1 382 | 383 | # close all filehandles 384 | for fh in filehandles: 385 | fh.close() 386 | 387 | def dump_to_sqlite_db(self, output_file, input_fields, top_level=False, num_files=1): 388 | def replace_none(s): 389 | if s is None: 390 | return 'NULL' 391 | return s 392 | cons = [None]*num_files 393 | cursors = [None]*num_files 394 | 395 | tweet_parser = smappdragon.tools.tweet_parser.TweetParser() 396 | column_str = ','.join([column for column in input_fields]).replace('.','__') 397 | question_marks = ','.join(['?' 
for column in input_fields]) 398 | 399 | con = sqlite3.connect(output_file) 400 | cur = con.cursor() 401 | cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str)) 402 | 403 | insert_list = [] 404 | # batch insert if more than 10k tweets 405 | for tweet in self.get_collection_iterators(): 406 | if top_level: 407 | ret = list(zip(input_fields, [tweet.get(field) for field in input_fields])) 408 | else: 409 | ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields) 410 | row = [replace_none(col_val[1]) for col_val in ret] 411 | insert_list.append(tuple(row)) 412 | if (len(insert_list) % 10000) == 0: 413 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 414 | con.commit() 415 | insert_list = [] 416 | if len(insert_list) < 10000: 417 | cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list) 418 | con.commit() 419 | con.close() 420 | 421 | def get_top_hashtags(self, num_top): 422 | return self.get_top_entities({'hashtags':num_top}) 423 | 424 | def get_top_urls(self, num_top): 425 | return self.get_top_entities({'urls':num_top}) 426 | 427 | def get_top_mentions(self, num_top): 428 | return self.get_top_entities({'user_mentions':num_top}) 429 | 430 | def get_top_media(self, num_top): 431 | return self.get_top_entities({'media':num_top}) 432 | 433 | def get_top_symbols(self, num_top): 434 | return self.get_top_entities({'symbols':num_top}) 435 | 436 | def get_top_terms(self, num_top, stop_words=None): 437 | term_counts = {} 438 | if not stop_words: 439 | stop_words = get_stop_words('en') 440 | for tweet in self.get_collection_iterators(): 441 | split_tweet = tweet['text'].split() 442 | for tweet_token in split_tweet: 443 | if tweet_token not in stop_words: 444 | term_counts[tweet_token] = 0 if tweet_token not in term_counts else term_counts[tweet_token]+1 445 | sorted_counts = sorted(term_counts.items(), key=operator.itemgetter(1), reverse=True)[:num_top] 446 | return_counts = {} 447 | for k, v in sorted_counts: 448 | return_counts[k] = v 449 | return return_counts 450 | 451 | def sample(self, k): 452 | ''' 453 | this method is especially troublesome 454 | i do not reccommend making any changes to it 455 | you may notice it uplicates code fro smappdragon 456 | there is no way around this as far as i can tell 457 | it really might screw up a lot of stuff, stip tweets 458 | has been purposely omitted as it isnt supported in pysmap 459 | ''' 460 | def new_get_iterators(): 461 | tweet_parser = smappdragon.TweetParser() 462 | it = iter(self.get_collection_iterators()) 463 | sample = list(itertools.islice(it, k)) 464 | random.shuffle(sample) 465 | for i, item in enumerate(it, start=k+1): 466 | j = random.randrange(i) 467 | if j < k: 468 | sample[j] = item 469 | for tweet in sample: 470 | if all([collection.limit != 0 and collection.limit <= count for collection in self.collections]): 471 | return 472 | elif all([tweet_parser.tweet_passes_filter(collection.filter, tweet) \ 473 | and tweet_parser.tweet_passes_custom_filter_list(collection.custom_filters, tweet) for collection in self.collections]): 474 | yield tweet 475 | 476 | cp = copy.deepcopy(self) 477 | cp.get_collection_iterators = new_get_iterators 478 | return cp 479 | 480 | ''' 481 | author @yvan 482 | for a lower level set of tools see: https://github.com/SMAPPNYU/smappdragon 483 | ''' -------------------------------------------------------------------------------- /pysmap/viz/__init__.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | 5 | from . import plots 6 | __all__ = ['plots'] -------------------------------------------------------------------------------- /pysmap/viz/networks.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from smappdragon import TweetParser 3 | 4 | ''' 5 | generate a retweet graph from the selection of tweets. 6 | ''' 7 | def retweet_network(collection, tweet_fields, user_fields): 8 | def replace_none(s): 9 | if s is None: 10 | return 'NULL' 11 | return s 12 | 13 | tp = TweetParser() 14 | dg = nx.DiGraph(name="retweet graph") 15 | 16 | for tweet in collection: 17 | 18 | um_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet['user'], user_fields)} 19 | t_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet, tweet_fields)} 20 | 21 | if tweet['user']['id_str'] not in dg: 22 | dg.add_node(tweet['user']['id_str'], attr_dict=um_dict) 23 | if 'retweeted_status' in tweet: 24 | rtu_dict = {field:replace_none(value) for field,value in tp.parse_columns_from_tweet(tweet['retweeted_status']['user'], user_fields)} 25 | dg.add_node(tweet['retweeted_status']['user']['id_str'], attr_dict=rtu_dict) 26 | dg.add_edge(tweet['user']['id_str'], tweet['retweeted_status']['user']['id_str'], attr_dict=t_dict) 27 | return dg -------------------------------------------------------------------------------- /pysmap/viz/plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | plt.style.use('seaborn') 3 | 4 | from datetime import datetime, timedelta 5 | from collections import OrderedDict 6 | from smappdragon import TweetParser 7 | 8 | ''' 9 | this gets tweets by timeslice 10 | collection is a SmappCollection 11 | field is the field in a tweet on which you want to compare 12 | values_to_match are the values you want that field to match 13 | filter can be any extra filter like a custom smappdragon filter 14 | that can be applied to a tweet to make your graph 15 | period_type is the grouping you want, by day, by week, by month, etc 16 | start and end is the total date range you want to be queried, 17 | later ell do multipe fields 18 | output_path 19 | ''' 20 | def bar_graph_tweet_field_grouped_by_period(collection, field, values_to_match, custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title): 21 | if period_type == 'hours': 22 | time_delta = timedelta(hours=1) 23 | elif period_type == 'days': 24 | time_delta = timedelta(days=1) 25 | elif period_type == 'weeks': 26 | time_delta = timedelta(weeks=1) 27 | elif period_type == 'months': 28 | time_delta = timedelta(weeks=4) 29 | elif period_type == 'years': 30 | time_delta = timedelta(weeks=52) 31 | 32 | # calculate how many periods we need 33 | duration = end - start 34 | periods = round(duration // time_delta) 35 | 36 | # setup a dictionary 37 | # avoid having an empty dict 38 | field_counts = {} 39 | if periods <= 0: 40 | field_counts[0] = 0 41 | else: 42 | for period in range(periods): 43 | field_counts[period] = 0 44 | 45 | # split the input field for compound fields 46 | split_field = field.split('.') 47 | tweet_parser = TweetParser() 48 | 49 | for tweet in collection.get_date_range(start, end): 50 | flattened_tweet = tweet_parser.flatten_dict(tweet) 51 | 52 | for tweet_tuple in flattened_tweet: 53 | if tweet_tuple[0] == split_field: 
54 | value = tweet_tuple[1] 55 | break 56 | 57 | # empty fild value matches all tweets, then only custom filter can be used to count 58 | if ((field == '') or (value in values_to_match)) and custom_filter(tweet): 59 | tweet_time = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y') 60 | period = round((tweet_time - start) // time_delta) 61 | field_counts[period if period > 0 else 0] += 1 62 | 63 | data = { 64 | 'period':[key for key in field_counts.keys()], 65 | 'tweets':[val for val in field_counts.values()] 66 | } 67 | 68 | plt.plot(data['period'], data['tweets']) 69 | plt.xlabel(x_label) 70 | plt.ylabel(y_label) 71 | plt.title(graph_title) 72 | plt.savefig(output_path) 73 | 74 | def bar_graph_languages(collection, langs_to_match, period_type, start, end, output_path, x_label, y_label, graph_title): 75 | bar_graph_tweet_field_grouped_by_period(collection, 'lang', langs_to_match, lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 76 | 77 | def bar_graph_user_languages(collection, langs_to_match, period_type, start, end, output_path, x_label, y_label, graph_title): 78 | bar_graph_tweet_field_grouped_by_period(collection, 'user.lang', langs_to_match, lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 79 | 80 | def bar_graph_tweets(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 81 | bar_graph_tweet_field_grouped_by_period(collection, '', [], lambda tweet:True, period_type, start, end, output_path, x_label, y_label, graph_title) 82 | 83 | def bar_graph_tweets_with_urls(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 84 | def custom_filter(tweet): 85 | if len(tweet['entities']['urls']) > 0: 86 | return True 87 | return False 88 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 89 | 90 | def bar_graph_tweets_with_media(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 91 | def custom_filter(tweet): 92 | if len(tweet['entities']['media']) > 0: 93 | return True 94 | return False 95 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 96 | 97 | def bar_graph_tweets_with_mentions(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 98 | def custom_filter(tweet): 99 | if len(tweet['entities']['user_mentions']) > 0: 100 | return True 101 | return False 102 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 103 | 104 | def bar_graph_tweets_with_hashtags(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 105 | def custom_filter(tweet): 106 | if len(tweet['entities']['hashtags']) > 0: 107 | return True 108 | return False 109 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 110 | 111 | def bar_graph_tweets_with_symbols(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 112 | def custom_filter(tweet): 113 | if len(tweet['entities']['symbols']) > 0: 114 | return True 115 | return False 116 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 117 | 118 | def 
bar_graph_tweets_with_retweets(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 119 | def custom_filter(tweet): 120 | if 'retweeted_status' in tweet: 121 | return True 122 | return False 123 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 124 | 125 | def bar_graph_tweets_with_locations(collection, period_type, start, end, output_path, x_label, y_label, graph_title): 126 | def custom_filter(tweet): # keep tweets that carry location info 127 | if tweet.get('coordinates') is not None or tweet.get('place') is not None: 128 | return True 129 | return False 130 | bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, period_type, start, end, output_path, x_label, y_label, graph_title) 131 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup 3 | 4 | setup(name='pysmap', 5 | packages=['pysmap', 'pysmap.twitterutil', 'pysmap.viz', 'pysmap.mltools'], 6 | version='0.0.42', 7 | description='pysmap is a set of tools for working with twitter data', 8 | author='yvan', 9 | author_email='yns207@nyu.edu', 10 | url='https://github.com/SMAPPNYU/pysmap', 11 | keywords='twitter data tools pysmap', 12 | license='MIT', 13 | install_requires=[ 14 | 'smappdragon==0.0.43', 15 | 'stop-words>=2015.2.23.1', 16 | 'langdetect>=1.0.6', 17 | 'matplotlib>=2.0.0', 18 | 'pandas>=0.18.1', 19 | 'pymongo>=3.2.2', 20 | 'pytz>=2016.4', 21 | 'networkx>=1.11', 22 | 'keras>=2.0.8', 23 | 'opencv-python>=3.3.0.9', 24 | 'tensorflow>=1.3.0', 25 | 'h5py>=2.7.0' 26 | ] 27 | ) -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | module 3 | ''' 4 | from . 
import test_smapp_collection 5 | __all__ = ['test_tweet_parser', 'test_mongo_collection'] -------------------------------------------------------------------------------- /test/data/invalid.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SMAPPNYU/pysmap/eb871992f40c53125129535e871525d5623c8c2d/test/data/invalid.bson -------------------------------------------------------------------------------- /test/data/valid-single.bson.json: -------------------------------------------------------------------------------- 1 | {"_id":{"$oid":"5637c49e0651ef2dda8b5dfd"},"contributors":null,"truncated":false,"text":"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat","is_quote_status":false,"in_reply_to_status_id":null,"random_number":0.0009388446238663972,"id":{"$numberLong":"661275583813431296"},"favorite_count":0,"source":"\u003ca href=\"https://twitter.com/Col_Connaughton\" rel=\"nofollow\"\u003eColin's Autotweeterpro5.3\u003c/a\u003e","retweeted":false,"coordinates":null,"timestamp_ms":"1446495359744","entities":{"user_mentions":[],"symbols":[],"hashtags":[{"indices":[74,83],"text":"jadehelm"},{"indices":[84,98],"text":"newworldorder"},{"indices":[99,103],"text":"usa"},{"indices":[104,112],"text":"tyranny"},{"indices":[113,120],"text":"threat"}],"urls":[{"url":"https://t.co/VA4bQRudLt","indices":[50,73],"expanded_url":"https://www.youtube.com/watch?v=0nJqymxVpwc","display_url":"youtube.com/watch?v=0nJqym…"}]},"in_reply_to_screen_name":null,"id_str":"661275583813431296","retweet_count":0,"in_reply_to_user_id":null,"favorited":false,"timestamp":{"$date":"2015-11-02T20:15:59.000Z"},"user":{"follow_request_sent":null,"profile_use_background_image":true,"default_profile_image":false,"id":379851447,"verified":false,"profile_image_url_https":"https://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","followers_count":3159,"profile_sidebar_border_color":"C0DEED","id_str":"379851447","profile_background_color":"C0DEED","listed_count":401,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","utc_offset":0,"statuses_count":477638,"description":"#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. 
To stop my tweets, BLOCK or MUTE me","friends_count":2019,"location":"London UK","profile_link_color":"0084B4","profile_image_url":"http://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","following":null,"geo_enabled":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/379851447/1416509762","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"ISRAEL BOMBS BABIES","lang":"en","profile_background_tile":false,"favourites_count":15917,"screen_name":"Col_Connaughton","notifications":null,"url":null,"created_at":"Sun Sep 25 17:29:09 +0000 2011","contributors_enabled":false,"time_zone":"London","protected":false,"default_profile":true,"is_translator":false},"geo":null,"in_reply_to_user_id_str":null,"possibly_sensitive":true,"lang":"de","created_at":"Mon Nov 02 20:15:59 +0000 2015","filter_level":"low","in_reply_to_status_id_str":null,"place":null} 2 | -------------------------------------------------------------------------------- /test/data/valid.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SMAPPNYU/pysmap/eb871992f40c53125129535e871525d5623c8c2d/test/data/valid.bson -------------------------------------------------------------------------------- /test/data/valid.csv: -------------------------------------------------------------------------------- 1 | id_str,entities.hashtags.0,entities.hashtags.1,source,user.id,timestamp.$date,text 2 | 661275583813431296,"{""indices"": [74, 83], ""text"": ""jadehelm""}","{""indices"": [84, 98], ""text"": ""newworldorder""}","Colin's Autotweeterpro5.3",379851447,1446495359000,"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat" 3 | -------------------------------------------------------------------------------- /test/test_crowd_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os, sys, unittest, cv2, warnings 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap.mltools.crowd_model import CrowdModel 7 | from keras.applications.resnet50 import preprocess_input 8 | # from matplotlib.testing.decorators import image_comparison 9 | 10 | class TestCrowdModel(unittest.TestCase): 11 | def test_control(self): 12 | ''' 13 | a control test to make sure everything 14 | in the unittest framework is working 15 | ''' 16 | self.assertTrue(True) 17 | 18 | def test_model_loads_file(self): 19 | ''' 20 | test that the model loads a model 21 | file properly and produces a model object 22 | ''' 23 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 24 | assert type(cm) is CrowdModel 25 | 26 | def test_model_dl_file(self): 27 | ''' 28 | test that the model class can download a file 29 | from the server where we store model files 30 | ''' 31 | cm = CrowdModel(config['crowd']['dl_path'], dl=True, talk=True) 32 | statinfo = os.stat(config['crowd']['dl_path']) 33 | assert os.path.exists(config['crowd']['dl_path']) 34 | assert statinfo.st_size == 295489952 35 | assert type(cm) is CrowdModel 36 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 37 | preds = cm.predict_files(files) 38 | assert len(preds) > 0 39 | assert preds[0][0] > 0.9 # should be 0.997 40 | assert preds[1][0] < 0.1 # sould be 0.0034 41 | 42 | def test_model_predicts_imgs(self): 43 | ''' 44 | test that a model can make predictions from 45 | an array of imgs 
that have already been loaded 46 | ''' 47 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 48 | imgs = np.zeros((len(files),224,224,3)) 49 | for i, file in enumerate(files): 50 | img = cv2.imread(file).astype('float64') 51 | img = cv2.resize(img, (224,224)) 52 | imgs[i] = img 53 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 54 | preds = cm.predict_imgs(imgs) 55 | assert len(preds) > 0 56 | assert preds[0][0] > 0.9 # should be 0.997 57 | assert preds[1][0] < 0.1 # sould be 0.0034 58 | 59 | def test_model_predicts_files(self): 60 | ''' 61 | test that the model class cna predict from image files 62 | this loads the images, prepocesses then predicts 63 | ''' 64 | cm = CrowdModel(config['crowd']['resnet50'], dl=False, talk=False) 65 | files = [config['crowd']['crowd_img'], config['crowd']['noncrowd_img']] 66 | preds = cm.predict_files(files) 67 | assert len(preds) > 0 68 | assert preds[0][0] > 0.9 # should be 0.997 69 | assert preds[1][0] < 0.1 # sould be 0.00 70 | 71 | if __name__ == '__main__': 72 | warnings.filterwarnings("ignore") 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /test/test_networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import networkx as nx 4 | 5 | from datetime import datetime 6 | from test.config import config 7 | from pysmap import SmappCollection 8 | from pysmap import networks 9 | 10 | class TestNetworks(unittest.TestCase): 11 | def test_control(self): 12 | self.assertTrue(True) 13 | 14 | def test_make_retweet_network_graph(self): 15 | output_path = '{}/chart_tests/network-{}-retweet.graphml'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 16 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 17 | collection = SmappCollection('json', file_path) 18 | digraph = networks.retweet_network(collection, ['id_str', 'retweeted_status.id_str', 'created_at', 'text', 'lang'], ['id_str', 'screen_name', 'location', 'description']) 19 | nx.write_graphml(digraph, output_path) 20 | 21 | def test_empty_make_retweet_network_graph(self): 22 | output_path = '{}/chart_tests/network-{}-retweet-empty.graphml'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 23 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 24 | collection = SmappCollection('json', file_path) 25 | digraph = networks.retweet_network(collection, [], []) 26 | nx.write_graphml(digraph, output_path) -------------------------------------------------------------------------------- /test/test_plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappCollection 7 | from pysmap import plots 8 | 9 | class TestPlots(unittest.TestCase): 10 | def test_control(self): 11 | self.assertTrue(True) 12 | 13 | def test_tweet_field_grouped_by_timeslice_hours(self): 14 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 15 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 16 | collection = SmappCollection('json', file_path) 17 | def custom_filter(tweet): 18 | if '#JadeHelm' in tweet['text']: 19 | return True 20 | return False 21 | 
plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'hours', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 22 | 23 | def test_tweet_field_grouped_by_timeslice_days(self): 24 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 25 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 26 | collection = SmappCollection('json', file_path) 27 | def custom_filter(tweet): 28 | return True 29 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 30 | 31 | def test_tweet_field_grouped_by_timeslice_weeks(self): 32 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 33 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 34 | collection = SmappCollection('json', file_path) 35 | def custom_filter(tweet): 36 | return True 37 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'weeks', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 38 | 39 | def test_tweet_field_grouped_by_timeslice_months(self): 40 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 41 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 42 | collection = SmappCollection('json', file_path) 43 | def custom_filter(tweet): 44 | return True 45 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 46 | 47 | def test_tweet_field_grouped_by_timeslice_years(self): 48 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 49 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 50 | collection = SmappCollection('json', file_path) 51 | def custom_filter(tweet): 52 | return True 53 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'years', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 54 | 55 | def test_tweet_field_grouped_by_timeslice_custom_filter(self): 56 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 57 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 58 | collection = SmappCollection('json', file_path) 59 | def custom_filter(tweet): 60 | if '#JadeHelm' in tweet['text']: 61 | return True 62 | return False 63 | plots.bar_graph_tweet_field_grouped_by_period(collection, '', [], custom_filter, 'days', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 64 | 65 | def test_tweet_field_grouped_by_timeslice_single_level_field(self): 66 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 67 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 68 | collection = SmappCollection('json', file_path) 69 
| def custom_filter(tweet): 70 | return True 71 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'id_str', ['661283295670493185'], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 72 | 73 | def test_tweet_field_grouped_by_timeslice_compound_field(self): 74 | output_path = '{}/chart_tests/Bar-{}-bar.png'.format(os.path.dirname(os.path.realpath(__file__)), datetime.now()) 75 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 76 | collection = SmappCollection('json', file_path) 77 | def custom_filter(tweet): 78 | return True 79 | plots.bar_graph_tweet_field_grouped_by_period(collection, 'user.time_zone', ['Pacific Time (US & Canada)'], custom_filter, 'months', datetime(2015,9,1), datetime(2015,11,30), output_path, 'date', 'tweet counts', 'filtered tweets by hour') 80 | -------------------------------------------------------------------------------- /test/test_smapp_collection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappCollection 7 | 8 | class TestSmappCollection(unittest.TestCase): 9 | def test_control(self): 10 | self.assertTrue(True) 11 | 12 | def test_smapp_bson_collection_iterates(self): 13 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 14 | collection = SmappCollection('bson', file_path) 15 | self.assertTrue(len(list(collection)) > 0) 16 | 17 | def test_smapp_json_collection_iterates(self): 18 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 19 | collection = SmappCollection('json', file_path) 20 | self.assertTrue(len(list(collection)) > 0) 21 | 22 | def test_smapp_csv_collection_iterates(self): 23 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 24 | collection = SmappCollection('csv', file_path) 25 | self.assertTrue(len(list(collection)) > 0) 26 | 27 | def test_limit_number_of_tweets(self): 28 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 29 | collection = SmappCollection('bson', file_path) 30 | self.assertTrue(len(list(collection.limit_number_of_tweets(100))) > 0) 31 | 32 | # def test_smapp_mongo_collection_iterates(self): 33 | # collection = SmappCollection('mongo', 34 | # config['mongo']['host'], 35 | # config['mongo']['port'], 36 | # config['mongo']['user'], 37 | # config['mongo']['password'], 38 | # config['mongo']['database'], 39 | # config['mongo']['collection']) 40 | # self.assertTrue(len(list(collection.limit_number_of_tweets(100))) > 0) 41 | 42 | def test_get_tweet_texts(self): 43 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 44 | collection = SmappCollection('bson', file_path) 45 | texts = [text for text in collection.limit_number_of_tweets(1).get_tweet_texts()] 46 | self.assertEqual(str, type(texts[0])) 47 | 48 | def test_count_tweet_terms(self): 49 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 50 | collection = SmappCollection('bson', file_path) 51 | count = collection.count_tweet_terms('jade') 52 | self.assertEqual(167, count) 53 | 54 | def test_count_tweet_terms_multiple(self): 55 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 
config['bson']['valid']) 56 | collection = SmappCollection('bson', file_path) 57 | count = collection.count_tweet_terms('jade', 'helm') 58 | self.assertEqual(176, count) 59 | 60 | def test_count_tweets(self): 61 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 62 | collection = SmappCollection('bson', file_path) 63 | count = collection.count_tweets() 64 | self.assertEqual(1187, count) 65 | 66 | def test_get_tweets_containing(self): 67 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 68 | collection = SmappCollection('bson', file_path) 69 | count = len([tweet for tweet in collection.get_tweets_containing('jade')]) 70 | self.assertEqual(167, count) 71 | 72 | def test_get_tweets_containing_multiple(self): 73 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 74 | collection = SmappCollection('bson', file_path) 75 | count = len([tweet for tweet in collection.get_tweets_containing('jade', 'helm')]) 76 | self.assertEqual(176, count) 77 | 78 | def test_get_date_range(self): 79 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 80 | collection = SmappCollection('bson', file_path) 81 | count = len([tweet for tweet in collection.get_date_range(datetime(2015,11,2), datetime(2015,11,3))]) 82 | self.assertEqual(26, count) 83 | 84 | def test_find_date_range(self): 85 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 86 | collection = SmappCollection('bson', file_path) 87 | range_obj = collection.find_date_range() 88 | self.assertEqual(datetime(2015, 11, 2, 19, 56, 33), range_obj['date_min']) 89 | self.assertEqual(datetime(2015, 11, 6, 21, 35, 54), range_obj['date_max']) 90 | 91 | def test_tweet_language_is(self): 92 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 93 | collection = SmappCollection('bson', file_path) 94 | count = len([tweet for tweet in collection.tweet_language_is('en')]) 95 | self.assertEqual(825, count) 96 | 97 | def test_detect_tweet_language(self): 98 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 99 | collection = SmappCollection('bson', file_path) 100 | count = len([tweet for tweet in collection.detect_tweet_language('en')]) 101 | self.assertEqual(907, count) 102 | 103 | def test_user_language_is(self): 104 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 105 | collection = SmappCollection('bson', file_path) 106 | count = len([tweet for tweet in collection.user_language_is('en')]) 107 | self.assertEqual(801, count) 108 | 109 | def test_exclude_retweets(self): 110 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 111 | collection = SmappCollection('bson', file_path) 112 | count = len([tweet for tweet in collection.exclude_retweets()]) 113 | self.assertEqual(682, count) 114 | 115 | def test_get_retweets(self): 116 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 117 | collection = SmappCollection('bson', file_path) 118 | count = len([tweet for tweet in collection.get_retweets()]) 119 | self.assertEqual(505, count) 120 | 121 | def test_user_location_contains(self): 122 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 123 | collection = 
SmappCollection('bson', file_path) 124 | count = len([tweet for tweet in collection.user_location_contains('TX')]) 125 | self.assertEqual(10, count) 126 | 127 | def test_user_description_contains(self): 128 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 129 | collection = SmappCollection('json', file_path) 130 | count = len([tweet for tweet in collection.user_description_contains('JESUS')]) 131 | self.assertEqual(15, count) 132 | 133 | def test_user_id_is(self): 134 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 135 | collection = SmappCollection('json', file_path) 136 | count = len([tweet for tweet in collection.user_id_is(379851447, 149751818)]) 137 | self.assertEqual(77, count) 138 | 139 | def test_place_name_contains_country(self): 140 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 141 | collection = SmappCollection('json', file_path) 142 | count = len([tweet for tweet in collection.place_name_contains_country('United States')]) 143 | self.assertEqual(6, count) 144 | 145 | def test_within_geobox(self): 146 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 147 | collection = SmappCollection('json', file_path) 148 | # geobox here is for us mountain time 149 | # i created a coordinate in our data file on the last object [-105.29, 40.33] 150 | # i also added one to the json that is outside of us mountain time [-123.007053, 44.824997] 151 | count = len([tweet for tweet in collection.within_geobox(-113.95, 28.81, -100.05, 48.87)]) 152 | self.assertEqual(1, count) 153 | 154 | def test_get_geo_enabled(self): 155 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 156 | collection = SmappCollection('bson', file_path) 157 | count = len([tweet for tweet in collection.get_geo_enabled()]) 158 | self.assertEqual(1, count) 159 | 160 | def test_get_non_geo_enabled(self): 161 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 162 | collection = SmappCollection('bson', file_path) 163 | count = len([tweet for tweet in collection.get_non_geo_enabled()]) 164 | self.assertEqual(1186, count) 165 | 166 | def test_dump_to_bson(self): 167 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 168 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 169 | 170 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 171 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 172 | collection.dump_to_bson(output_path) 173 | self.assertTrue(os.path.getsize(output_path) > 0) 174 | 175 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 176 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 177 | 178 | def test_dump_to_json(self): 179 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 180 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 181 | 182 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 183 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 184 | collection.dump_to_json(output_path) 185 | self.assertTrue(os.path.getsize(output_path) > 0) 
186 | 187 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 188 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 189 | 190 | def test_dump_to_csv(self): 191 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 192 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 193 | 194 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 195 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 196 | collection.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 197 | self.assertTrue(os.path.getsize(output_path) > 0) 198 | 199 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 200 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 201 | 202 | def test_dump_to_sqlite_db(self): 203 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 204 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 205 | 206 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db' 207 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 208 | collection.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 209 | self.assertTrue(os.path.getsize(output_path) > 0) 210 | 211 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 212 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 213 | 214 | def test_get_top_hashtags(self): 215 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 216 | collection = SmappCollection('bson', file_path) 217 | base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}} 218 | hashtags = collection.get_top_hashtags(5) 219 | self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys())) 220 | 221 | def test_get_top_urls(self): 222 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 223 | collection = SmappCollection('bson', file_path) 224 | urls = collection.get_top_urls(5) 225 | base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39, 'https://t.co/l9OEuvRlt8': 24, 'https://t.co/nkc4hnukLX': 21, 'https://t.co/rsNUItS48U': 60}} 226 | self.assertTrue(set(urls.keys()) == set(base_urls.keys())) 227 | 228 | def test_get_top_mentions(self): 229 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 230 | collection = SmappCollection('bson', file_path) 231 | top_mentions = collection.get_top_mentions(5) 232 | base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75, '1619936671': 41, '733417892': 121}} 233 | self.assertTrue(set(top_mentions.keys()) == set(base_top_mentions.keys())) 234 | 235 | def test_get_top_media(self): 236 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 237 | collection = SmappCollection('bson', file_path) 238 | top_media = collection.get_top_media(5) 239 | base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17, 'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14, 'https://t.co/ORaTXOM2oX': 55}} 240 | 
self.assertTrue(set(top_media.keys()) == set(base_top_media.keys())) 241 | 242 | def test_get_top_symbols(self): 243 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 244 | collection = SmappCollection('bson', file_path) 245 | top_symbols = collection.get_top_symbols(5) 246 | base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}} 247 | self.assertTrue(set(top_symbols.keys()) == set(base_top_symbols.keys())) 248 | 249 | def test_get_top_terms(self): 250 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 251 | collection = SmappCollection('bson', file_path) 252 | top_counts = collection.get_top_terms(10) 253 | base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173, 'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193} 254 | self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys())) 255 | 256 | def test_base_top_entities_returns_dict(self): 257 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 258 | collection = SmappCollection('bson', file_path) 259 | returndict = collection.get_top_entities({'hashtags':5}) 260 | self.assertTrue(isinstance(returndict, dict)) 261 | 262 | def test_base_top_entities_returns_hashtags(self): 263 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 264 | collection = SmappCollection('bson', file_path) 265 | returndict = collection.get_top_entities({'hashtags':5}) 266 | self.assertTrue('hashtags' in returndict) 267 | 268 | def test_base_top_entities_returns_hashtags_and_media(self): 269 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 270 | collection = SmappCollection('bson', file_path) 271 | returndict = collection.get_top_entities({'user_mentions':5, 'media':3}) 272 | self.assertTrue('user_mentions' in returndict and 'media' in returndict) 273 | 274 | def test_base_top_entities_returns_counts(self): 275 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 276 | collection = SmappCollection('bson', file_path) 277 | returndict = collection.get_top_entities({'urls':5, 'symbols':3}) 278 | if len(returndict['urls']) > 0: 279 | self.assertTrue(len(returndict['urls']) == 5) 280 | if len(returndict['symbols']) > 0: 281 | self.assertTrue(len(returndict['symbols']) == 3) 282 | 283 | def test_sample_returns_right_number_of_items(self): 284 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 285 | collection = SmappCollection('bson', file_path) 286 | sample_collection = collection.sample(10) 287 | self.assertEqual(10, len(list(sample_collection))) 288 | 289 | def test_sample_returns_dif_tweets_than_fist_10_tweets(self): 290 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 291 | collection_one = SmappCollection('bson', file_path) 292 | sample_tweets = list(collection_one.sample(10)) 293 | collection_two = SmappCollection('bson', file_path) 294 | first_ten_tweets = list(collection_two.limit_number_of_tweets(10)) 295 | self.assertNotEqual(sample_tweets, first_ten_tweets) 296 | 297 | def test_sample_chains_and_dumps(self): 298 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 299 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 300 | 301 | output_path 
= '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 302 | collection = SmappCollection('bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) 303 | sample_tweets = collection.sample(10) 304 | sample_tweets.dump_to_json(output_path) 305 | self.assertTrue(os.path.getsize(output_path) > 0) 306 | with open(output_path) as f: 307 | self.assertEqual(10, len([line for line in f])) 308 | 309 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 310 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 311 | 312 | def test_set_custom_filter_properly_filters(self): 313 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 314 | collection_one = SmappCollection('bson', file_path) 315 | full_collection_len = len(list(collection_one)) 316 | def is_tweet_a_retweet(tweet): 317 | if 'retweeted' in tweet and tweet['retweeted']: 318 | return True 319 | else: 320 | return False 321 | num_retweets = len(list(collection_one.set_custom_filter(is_tweet_a_retweet))) 322 | 323 | collection_two = SmappCollection('bson', file_path) 324 | def is_not_a_retweet(tweet): 325 | if 'retweeted' in tweet and tweet['retweeted']: 326 | return False 327 | else: 328 | return True 329 | num_non_retweets = len(list(collection_two.set_custom_filter(is_not_a_retweet))) 330 | self.assertEqual(num_retweets + num_non_retweets, full_collection_len) 331 | 332 | if __name__ == '__main__': 333 | unittest.main() 334 | -------------------------------------------------------------------------------- /test/test_smapp_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from datetime import datetime 5 | from test.config import config 6 | from pysmap import SmappDataset, SmappCollection 7 | from smappdragon import BsonCollection 8 | 9 | class TestSmappDataset(unittest.TestCase): 10 | def test_control(self): 11 | self.assertTrue(True) 12 | 13 | def test_smapp_dataset_takes_base_input_types(self): 14 | file_path_bson = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 15 | file_path_json = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 16 | file_path_csv = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 17 | collection = SmappDataset(['bson', file_path_bson], ['json', file_path_json], ['csv', file_path_csv]) 18 | self.assertTrue(len(list(collection)) > 0) 19 | 20 | def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self): 21 | file_path_bson = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 22 | file_path_bson_2 = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 23 | file_path_json = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 24 | file_path_csv = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 25 | collection = SmappCollection('bson', file_path_bson_2) 26 | dataset_1 = SmappDataset(['bson', file_path_bson], ['csv', file_path_csv]) 27 | dataset_2 = SmappDataset(dataset_1, ['json', file_path_json], collection) 28 | self.assertTrue(len(list(dataset_2)) > 0) 29 | 30 | # def test_smapp_dataset_takes_collection_regex(self): 31 | # dataset = SmappDataset(['mongo', 32 | # config['mongo']['host'], 33 | # config['mongo']['port'], 34 | 
# config['mongo']['user'], 35 | # config['mongo']['password'], 36 | # config['mongo']['database']], collection_regex='(^data$|^tweets$|^tweets_\d+$)') 37 | # self.assertTrue(len(list(dataset)) > 0) 38 | 39 | # def test_smapp_dataset_takes_database_regex(self): 40 | # dataset = SmappDataset(['mongo', 41 | # config['mongo']['host'], 42 | # config['mongo']['port'], 43 | # config['mongo']['user'], 44 | # config['mongo']['password'], 45 | # config['mongo']['collection']], database_regex='(^47Traitors$)') 46 | # self.assertTrue(len(list(dataset)) > 0) 47 | 48 | # def test_smapp_dataset_takes_database_regex_and_collection_regex(self): 49 | # dataset = SmappDataset(['mongo', 50 | # config['mongo']['host'], 51 | # config['mongo']['port'], 52 | # config['mongo']['user'], 53 | # config['mongo']['password']], database_regex='(^47Traitors$)', collection_regex='(^data$|^tweets$|^tweets_\d+$)') 54 | # self.assertTrue(len(list(dataset)) > 0) 55 | 56 | def test_smapp_dataset_file_pattern_takes_a_unix_pattern(self): 57 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 58 | dataset = SmappDataset(['bson', 'file_pattern', file_path]) 59 | self.assertTrue(len(list(dataset)) > 0) 60 | 61 | def test_smapp_dataset_file_pattern_takes_home_path(self): 62 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 63 | file_path = file_path.replace('/Users/yvanscher', '~') 64 | dataset = SmappDataset(['bson','file_pattern',file_path]) 65 | self.assertTrue(len(list(dataset)) > 0) 66 | 67 | def test_smapp_dataset_file_pattern_returns_two_collections(self): 68 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson') 69 | dataset = SmappDataset(['bson','file_pattern',file_path]) 70 | self.assertTrue(all([type(collection) == BsonCollection for collection in dataset.collections])) 71 | 72 | def test_smapp_bson_collection_iterates(self): 73 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 74 | dataset = SmappDataset(['bson', file_path]) 75 | self.assertTrue(len(list(dataset)) > 0) 76 | 77 | def test_smapp_json_collection_iterates(self): 78 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 79 | dataset = SmappDataset(['json', file_path]) 80 | self.assertTrue(len(list(dataset)) > 0) 81 | 82 | def test_smapp_csv_collection_iterates(self): 83 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid']) 84 | dataset = SmappDataset(['csv', file_path]) 85 | self.assertTrue(len(list(dataset)) > 0) 86 | 87 | # limit before mongo because mongo should be limited or it takes too long 88 | def test_limit_number_of_tweets(self): 89 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 90 | dataset = SmappDataset(['bson', file_path]) 91 | self.assertTrue(len(list(dataset.limit_number_of_tweets(100))) > 0) 92 | 93 | # def test_smapp_mongo_collection_iterates(self): 94 | # dataset = SmappDataset(['mongo', 95 | # config['mongo']['host'], 96 | # config['mongo']['port'], 97 | # config['mongo']['user'], 98 | # config['mongo']['password'], 99 | # config['mongo']['database'], 100 | # config['mongo']['collection']]) 101 | # self.assertTrue(len(list(dataset.limit_number_of_tweets(100))) > 0) 102 | 103 | def test_get_tweet_texts(self): 104 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 105 | dataset = 
SmappDataset(['bson', file_path]) 106 | texts = [text for text in dataset.limit_number_of_tweets(1).get_tweet_texts()] 107 | self.assertEqual(str, type(texts[0])) 108 | 109 | def test_count_tweet_terms(self): 110 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 111 | dataset = SmappDataset(['bson', file_path]) 112 | count = dataset.count_tweet_terms('jade') 113 | self.assertEqual(167, count) 114 | 115 | def test_count_tweet_terms_multiple(self): 116 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 117 | dataset = SmappDataset(['bson', file_path]) 118 | count = dataset.count_tweet_terms('jade', 'helm') 119 | self.assertEqual(176, count) 120 | 121 | def test_count_tweets(self): 122 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 123 | dataset = SmappDataset(['bson', file_path]) 124 | count = dataset.count_tweets() 125 | self.assertEqual(1187, count) 126 | 127 | def test_get_tweets_containing(self): 128 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 129 | dataset = SmappDataset(['bson', file_path]) 130 | count = len([tweet for tweet in dataset.get_tweets_containing('jade')]) 131 | self.assertEqual(167, count) 132 | 133 | def test_get_tweets_containing_multiple(self): 134 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 135 | dataset = SmappDataset(['bson', file_path]) 136 | count = len([tweet for tweet in dataset.get_tweets_containing('jade', 'helm')]) 137 | self.assertEqual(176, count) 138 | 139 | def test_get_date_range(self): 140 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 141 | dataset = SmappDataset(['bson', file_path]) 142 | count = len([tweet for tweet in dataset.get_date_range(datetime(2015,11,2), datetime(2015,11,3))]) 143 | self.assertEqual(26, count) 144 | 145 | def test_find_date_range(self): 146 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 147 | dataset = SmappDataset(['bson', file_path]) 148 | range_obj = dataset.find_date_range() 149 | self.assertEqual(datetime(2015, 11, 2, 19, 56, 33), range_obj['date_min']) 150 | self.assertEqual(datetime(2015, 11, 6, 21, 35, 54), range_obj['date_max']) 151 | 152 | def test_tweet_language_is(self): 153 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 154 | dataset = SmappDataset(['bson', file_path]) 155 | count = len([tweet for tweet in dataset.tweet_language_is('en')]) 156 | self.assertEqual(825, count) 157 | 158 | def test_detect_tweet_language(self): 159 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 160 | dataset = SmappDataset(['bson', file_path]) 161 | count = len([tweet for tweet in dataset.detect_tweet_language('en')]) 162 | self.assertEqual(907, count) 163 | 164 | def test_user_language_is(self): 165 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 166 | dataset = SmappDataset(['bson', file_path]) 167 | count = len([tweet for tweet in dataset.user_language_is('en')]) 168 | self.assertEqual(801, count) 169 | 170 | def test_exclude_retweets(self): 171 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 172 | dataset = SmappDataset(['bson', file_path]) 173 | count = len([tweet for tweet 
in dataset.exclude_retweets()]) 174 | self.assertEqual(682, count) 175 | 176 | def test_get_retweets(self): 177 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 178 | dataset = SmappDataset(['bson', file_path]) 179 | count = len([tweet for tweet in dataset.get_retweets()]) 180 | self.assertEqual(505, count) 181 | 182 | def test_user_location_contains(self): 183 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 184 | dataset = SmappDataset(['bson', file_path]) 185 | count = len([tweet for tweet in dataset.user_location_contains('TX')]) 186 | self.assertEqual(10, count) 187 | 188 | def test_user_description_contains(self): 189 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 190 | dataset = SmappDataset(['json', file_path]) 191 | count = len([tweet for tweet in dataset.user_description_contains('JESUS')]) 192 | self.assertEqual(15, count) 193 | 194 | def test_user_id_is(self): 195 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 196 | dataset = SmappDataset(['json', file_path]) 197 | count = len([tweet for tweet in dataset.user_id_is(379851447, 149751818)]) 198 | self.assertEqual(77, count) 199 | 200 | def test_place_name_contains_country(self): 201 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 202 | dataset = SmappDataset(['json', file_path]) 203 | count = len([tweet for tweet in dataset.place_name_contains_country('United States')]) 204 | self.assertEqual(6, count) 205 | 206 | def test_within_geobox(self): 207 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid']) 208 | dataset = SmappDataset(['json', file_path]) 209 | # geobox here is for us mountain time 210 | # i created a coordinate in our data file on the last object [-105.29, 40.33] 211 | # i also added one to the json that is outside of us mountain time [-123.007053, 44.824997] 212 | count = len([tweet for tweet in dataset.within_geobox(-113.95, 28.81, -100.05, 48.87)]) 213 | self.assertEqual(1, count) 214 | 215 | def test_get_geo_enabled(self): 216 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 217 | dataset = SmappDataset(['bson', file_path]) 218 | count = len([tweet for tweet in dataset.get_geo_enabled()]) 219 | self.assertEqual(1, count) 220 | 221 | def test_get_non_geo_enabled(self): 222 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 223 | dataset = SmappDataset(['bson', file_path]) 224 | count = len([tweet for tweet in dataset.get_non_geo_enabled()]) 225 | self.assertEqual(1186, count) 226 | 227 | 228 | def test_dump_to_bson(self): 229 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 230 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 231 | 232 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 233 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 234 | dataset.dump_to_bson(output_path) 235 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson') > 0) 236 | 237 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson'): 238 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson') 239 | 
240 | def test_dump_to_json(self): 241 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 242 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 243 | 244 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 245 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 246 | dataset.dump_to_json(output_path) 247 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json')) > 0) 248 | 249 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 250 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 251 | 252 | def test_dump_to_csv(self): 253 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 254 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 255 | 256 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 257 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 258 | dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 259 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv') > 0) 260 | 261 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): 262 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') 263 | 264 | def test_dump_to_sqlite_db(self): 265 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 266 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 267 | 268 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db' 269 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 270 | dataset.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) 271 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.db') > 0) 272 | 273 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.db'): 274 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.db') 275 | 276 | def test_dump_to_bson_parallel(self): 277 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson'): 278 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson') 279 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson'): 280 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson') 281 | 282 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson' 283 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 284 | dataset.dump_to_bson(output_path, num_files=2) 285 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_0.bson') > 0) 286 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_1.bson') > 0) 287 | 288 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson'): 289 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson') 290 | if 
os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson'): 291 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson') 292 | 293 | def test_dump_to_json_parallel(self): 294 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json'): 295 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json') 296 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json'): 297 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json') 298 | 299 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 300 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 301 | dataset.dump_to_json(output_path, num_files=2) 302 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output_0.bson.json')) > 0) 303 | self.assertTrue(os.path.getsize('{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output_1.bson.json')) > 0) 304 | 305 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json'): 306 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.bson.json') 307 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json'): 308 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.bson.json') 309 | 310 | def test_dump_to_csv_parallel(self): 311 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv'): 312 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv') 313 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv'): 314 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv') 315 | 316 | output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' 317 | dataset = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 318 | dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], num_files=2) 319 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_0.csv') > 0) 320 | self.assertTrue(os.path.getsize(os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output_1.csv') > 0) 321 | 322 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv'): 323 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_0.csv') 324 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv'): 325 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output_1.csv') 326 | 327 | def test_get_top_hashtags(self): 328 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 329 | dataset = SmappDataset(['bson', file_path]) 330 | base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}} 331 | hashtags = dataset.get_top_hashtags(5) 332 | self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys())) 333 | 334 | def test_get_top_urls(self): 335 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 336 | dataset = SmappDataset(['bson', file_path]) 337 | urls = dataset.get_top_urls(5) 338 | base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39, 'https://t.co/l9OEuvRlt8': 
24, 'https://t.co/nkc4hnukLX': 21, 'https://t.co/rsNUItS48U': 60}} 339 | self.assertTrue(set(urls.keys()) == set(base_urls.keys())) 340 | 341 | def test_get_top_mentions(self): 342 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 343 | dataset = SmappDataset(['bson', file_path]) 344 | top_mentions = dataset.get_top_mentions(5) 345 | base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75, '1619936671': 41, '733417892': 121}} 346 | self.assertTrue(set(top_mentions.keys()) == set(base_top_mentions.keys())) 347 | 348 | def test_get_top_media(self): 349 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 350 | dataset = SmappDataset(['bson', file_path]) 351 | top_media = dataset.get_top_media(5) 352 | base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17, 'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14, 'https://t.co/ORaTXOM2oX': 55}} 353 | self.assertTrue(set(top_media.keys()) == set(base_top_media.keys())) 354 | 355 | def test_get_top_symbols(self): 356 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 357 | dataset = SmappDataset(['bson', file_path]) 358 | top_symbols = dataset.get_top_symbols(5) 359 | base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}} 360 | self.assertTrue(set(top_symbols.keys()) == set(base_top_symbols.keys())) 361 | 362 | def test_get_top_terms(self): 363 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 364 | dataset = SmappDataset(['bson', file_path]) 365 | top_counts = dataset.get_top_terms(10) 366 | base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173, 'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193} 367 | self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys())) 368 | 369 | def test_base_top_entities_returns_dict(self): 370 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 371 | dataset = SmappDataset(['bson', file_path]) 372 | returndict = dataset.get_top_entities({'hashtags':5}) 373 | self.assertTrue(isinstance(returndict, dict)) 374 | 375 | def test_base_top_entities_returns_hashtags(self): 376 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 377 | dataset = SmappDataset(['bson', file_path]) 378 | returndict = dataset.get_top_entities({'hashtags':5}) 379 | self.assertTrue('hashtags' in returndict) 380 | 381 | def test_base_top_entities_returns_hashtags_and_media(self): 382 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 383 | dataset = SmappDataset(['bson', file_path]) 384 | returndict = dataset.get_top_entities({'user_mentions':5, 'media':3}) 385 | self.assertTrue('user_mentions' in returndict and 'media' in returndict) 386 | 387 | def test_base_top_entities_returns_counts(self): 388 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 389 | dataset = SmappDataset(['bson', file_path]) 390 | returndict = dataset.get_top_entities({'urls':5, 'symbols':3}) 391 | if len(returndict['urls']) > 0: 392 | self.assertTrue(len(returndict['urls']) == 5) 393 | if len(returndict['symbols']) > 0: 394 | self.assertTrue(len(returndict['symbols']) == 3) 395 | 396 | def test_sample_returns_right_number_of_items(self): 397 | 
file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 398 | dataset = SmappDataset(['bson', file_path]) 399 | sample_collection = dataset.sample(10) 400 | self.assertEqual(10, len(list(sample_collection))) 401 | 402 | def test_sample_returns_dif_tweets_than_fist_10_tweets(self): 403 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 404 | dataset = SmappDataset(['bson', file_path]) 405 | sample_tweets = list(dataset.sample(10)) 406 | dataset_two = SmappDataset(['bson', file_path]) 407 | first_ten_tweets = list(dataset_two.limit_number_of_tweets(10)) 408 | self.assertNotEqual(sample_tweets, first_ten_tweets) 409 | 410 | def test_sample_chains_and_dumps(self): 411 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 412 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 413 | 414 | output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),'data/output.bson.json') 415 | collection = SmappDataset(['bson', os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']]) 416 | sample_tweets = collection.sample(10) 417 | sample_tweets.dump_to_json(output_path) 418 | self.assertTrue(os.path.getsize(output_path) > 0) 419 | with open(output_path) as f: 420 | self.assertEqual(10, len([line for line in f])) 421 | 422 | if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): 423 | os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') 424 | 425 | def test_set_custom_filter_properly_filters(self): 426 | file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid']) 427 | dataset_one = SmappDataset(['bson', file_path]) 428 | full_collection_len = len(list(dataset_one)) 429 | def is_tweet_a_retweet(tweet): 430 | if 'retweeted' in tweet and tweet['retweeted']: 431 | return True 432 | else: 433 | return False 434 | num_retweets = len(list(dataset_one.set_custom_filter(is_tweet_a_retweet))) 435 | 436 | dataset_two = SmappDataset(['bson', file_path]) 437 | def is_not_a_retweet(tweet): 438 | if 'retweeted' in tweet and tweet['retweeted']: 439 | return False 440 | else: 441 | return True 442 | num_non_retweets = len(list(dataset_two.set_custom_filter(is_not_a_retweet))) 443 | self.assertEqual(num_retweets + num_non_retweets, full_collection_len) 444 | 445 | 446 | if __name__ == '__main__': 447 | unittest.main() 448 | --------------------------------------------------------------------------------
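
Note on running these tests: every test module imports `test.config`, a local configuration module that is deliberately not checked in (`config.py` appears in `.gitignore`). The sketch below is a hypothetical example of such a module, not the project's actual file: the key names are the ones the tests look up, the relative data paths mirror the `test/data/` files in this listing, and everything else (model paths, sample images, mongo credentials) is a placeholder assumption you would replace locally.

```python
# test/config.py -- hypothetical example config for running the pysmap test suite.
# Only the key names are taken from the tests in this repository; every value
# marked "placeholder" below is an assumption, not part of the project.
config = {
    'bson': {'valid': 'data/valid.bson'},
    'json': {'valid': 'data/valid.bson.json'},
    'csv': {'valid': 'data/valid.csv'},
    'crowd': {
        'resnet50': '/path/to/crowd_model.h5',      # placeholder: local crowd-model weights
        'dl_path': '/path/to/downloaded_model.h5',  # placeholder: where the download test writes
        'crowd_img': '/path/to/crowd.jpg',          # placeholder: sample image with a crowd
        'noncrowd_img': '/path/to/noncrowd.jpg',    # placeholder: sample image without a crowd
    },
    'mongo': {
        'host': 'localhost',       # placeholder connection details used only by the
        'port': 27017,             # commented-out mongo tests above
        'user': 'smapp_user',
        'password': 'changeme',
        'database': 'test_db',
        'collection': 'tweets',
    },
}
```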