├── .gitignore ├── LICENSE ├── README.md ├── Sentence Parse.PNG ├── StatusStreamerOutput.txt ├── editor.py ├── geo_converter.py ├── geosearchclass.py ├── ngrams.py ├── p_files ├── LA.txt ├── clive.txt ├── nyc.txt └── nyc.txt~ ├── params.txt ├── real_time_vis.py ├── sample.py ├── scan_and_respond.py ├── scraper.py ├── streamer.py ├── suggest_bot.py ├── test_real_time_vis.py ├── test_write.py ├── tweeter.py ├── utils.py └── write.py /.gitignore: -------------------------------------------------------------------------------- 1 | phrases.json 2 | tweets.json 3 | consumerkeyandsecret 4 | output.txt 5 | params.txt 6 | grammar.pickle 7 | grammr*pickle 8 | poemsforrobots.txt 9 | deprecated/ 10 | *.pickle 11 | *.json 12 | #* 13 | *.pyc 14 | *~ 15 | .DS_Store 16 | 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 
49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. 
(Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
Saito Group 1-17-2017

About:
----------------------------------------------------------------------
This library is composed of several tools for scraping geolocated
tweets and visualizing data gleaned from these tweets. It also has a
robotic assistant tool, called ```suggest_bot```, which can help you
write poems in the style of a document you pass in. Another tool,
called ```scan_and_respond```, allows you to scan an area for search
terms and then tweet at those people!

Geo-tag your tweets!
--------------------
We rely on geo-tagged tweets. Please allow your location to be seen
when tweeting, especially when using this application! You can enable
this by logging into your main Twitter account and, under "Security and
Privacy", checking the box next to "Tweet location". THANKS!


Install:
----------------------------------------------------------------------
git, python 2.7.X, pip
Python packages required: tweepy, nltk, matplotlib, geopy, argparse,
curses, bs4 (Beautiful Soup), locale

On Windows: upgrade PowerShell
(you may still have unicode problems when printing to the command line)
and install packages with:
```
python -m pip install <package>
```

For each required package listed above run:
```
pip install <package>
```
Now we need some data, so we'll use the nltk downloader.
Run a python shell from the command line:
```
python
import nltk
nltk.download()
```
On the main page, highlight "book", click download, and that should be it...
These are the exact nltk packages that are required, in case you want less data:
1) under corpora -> highlight stopwords
2) under corpora -> highlight treebank
3) under all packages -> highlight punkt
4) under models -> highlight averaged_perceptron_tagger

This creates a folder called "nltk_data" in your home folder, which is
used by the program.

Navigate to the folder where you want geotweets to live and run:
```
git clone https://github.com/saitogroup/geotweets.git
```
Get consumerkeyandsecret (see below) and put it in that folder,
cd into the folder, and
run sample.py from the command line (see below).


Consumer Key and Secret:
----------------------------------------------------------------------
The program looks for a file in the geotweets folder called
consumerkeyandsecret. This should have at least 2 lines, with the
consumer key on the first line and the secret (the longer one) on the
next, and then (for streaming and posting) 2 more lines: the access
token on the 3rd line and the access token secret on the 4th. You can
get these by going to https://apps.twitter.com in a web browser and
creating an app. Then hit the button to create access tokens. You may
have to set the app permissions to "read and write" if you want to use
this to send tweets on your behalf.
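When complete, consumerkeyandsecret is just four bare lines in this
order (the values below are placeholders, not real credentials):
```
CONSUMER_KEY
CONSUMER_SECRET
ACCESS_TOKEN
ACCESS_TOKEN_SECRET
```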
After creating the app, copy the 4 alphanumeric keys into a blank file
called "consumerkeyandsecret" as described above and put this file in
your "geotweets" folder.


TOOLS:
----------------------------------------------------------------------
sample:
-------
A simple tool, called 'sample', allows you to scrape and save up to
100 geolocated tweets in batch form. You can optionally search within
this set for specific words or hash tags and visualize the top word
frequencies. See sample.py for details or from the command line run:
```
python sample.py --help
python sample.py --doc
```
USAGE:
```
python sample.py [-h][-d][-v][-f FILENAME][-o OUTPUT][-vis]
```

scraper
--------
Given a URL, this will scrape the website and save the text to scraped_text.txt
```
scraper.py [-d][-h][-u URL][-o OUTPUT_FILE]
```


real time visualizer:
---------------------
Another tool, called 'real_time_vis', creates a word frequency
distribution chart which can grow and change in near real time as more
tweets are grabbed. If you use -s, you'll get streaming results, that
is, tweets as they are being posted. Otherwise you will get batched
results every 5 seconds using the REST API, which returns tweets from
the recent past. See real_time_vis.py for details or from the command
line run:

```
python real_time_vis.py --help
python real_time_vis.py --doc
```
USAGE:
```
python real_time_vis.py [-h][-d][-f FILENAME][-n NUMBER][-s][-a ADDRESS]
```

Both files use a parameter file with geolocation and search
terms. See params.txt for an example.

You may have to adjust your PYTHONPATH variable to run the program
from the command line. Otherwise, you can run it from the python
interpreter.



suggest_bot
-----------
This is a robotically assisted poetry engine. The user can create
poems using a large supplied word corpus or use their own. It can also
add words to the corpus from the twitter-sphere using the search
option, and it can parse those twitter messages into phrases using
natural language processing.

USAGE:
```
python suggest_bot.py [-d][-h][-p PARAMS][-i INPUT | -m INPUT][-o OUTPUT][-a ADDRESS]
```
1) Once you are running the program, if you call the 's' command, you
can search twitter. This will use the parameters in the params.txt
file as usual.

2) If you want to parse the tweets and then use phrases, simply
respond 'y' to the query after you hit 's'.

3) There is also a default corpus, a set of words that you can use by
calling the 'd' command.

4) You can also load your own corpus, which will then just use those
words randomly as suggestions.

5) While using the word suggester, if you ever find that you made an
error, simply hit 'e' and an inline editor will pop up. There is
currently a bug that was patched but hasn't been pushed to all python
versions, so you currently cannot insert words. Sorry!

6) Finally, I would suggest trying out the markov chain poetry
assistant. It can help create poems that mimic the natural statistics
of the input text.
Simply supply the program with a grammatical text of poems or literature:
```
python suggest_bot.py -m INPUT
```

scan_and_respond
----------------

This tool scans tweets and asks the user to verify them before sending
a tweet response. The relevant tweets are also saved to a JSON
file. This requires write access, which means the consumerkeyandsecret
file must contain all 4 lines.

```
scan_and_respond.py [-h] [-d] [-f FILENAME] [-a ADDRESS] [-o OUTPUT]
```

HELP:
----------------------------------------------------------------------
All programs can be run from the command line (a.k.a. the terminal in OS X).

By typing
```python <program_name>.py -h```
you will get help on the various command line tool options.
By typing
```python <program_name>.py -d```
you will get the program's documentation string.
If a parameter says something like
```-o OUTPUT```
then simply substitute a file for the capitalized word, like so:
```
python suggest_bot.py -m my_poetic_text.txt
```
If a USAGE line says something like ```[-x | -y]``` then you can only
use parameter x OR y, but not both.


EXAMPLES:
----------------------------------------------------------------------
Grabbing geo-located tweets using the parameter file params.txt
(default), printing to the command line, and writing to output.txt
(default):
```
python sample.py --verbose
```
Visualizing the data, using params.txt (default):
```
python real_time_vis.py
```
Streaming real time data to create a word frequency chart using a local address:
```
python real_time_vis.py -a "175 5th Avenue NYC" -s
```
Scraping a website and saving to an output file:
```
python scraper.py -u http://www.cnn.com -o scraped_text.txt
```
Using suggest_bot with a file of random words, which will NOT be a markov chain:
```
python suggest_bot.py -i random_not_necessarily_grammatical_text.txt
```

UTILITIES:
----------------------------------------------------------------------
These modules contain methods that assist the "tools" listed above:
```
tweeter.py: allows you to programmatically tweet at people
utils.py: shared helpers used by the other modules (e.g. credential loading, tokenizing)
geo_converter.py: returns geocoordinates for a given address
geosearchclass.py: searches the REST API
streamer.py: creates a multithreaded twitter API streamer
editor.py: creates a command line editor
ngrams.py: creates a markov chain ngram word generator
```

write
-----
This program classifies tweets into phrase types and produces a JSON
array containing these, called phrases.json. It uses parameters from
params.txt. This requires quite a bit of processing time, which can be
reduced by using a lower "count".
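A quick way to inspect the output is to load it back with the standard
json module. This is just a sketch (it assumes phrases.json holds a
single JSON array and the snippet is not itself part of the repository):
```
import json

# load the array that write.py produced
with open('phrases.json') as f:
    phrases = json.load(f)

print len(phrases)   # how many phrases were extracted
print phrases[:5]    # peek at the first few entries
```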
241 | 242 | The below two modules run unit tests: 243 | ``` 244 | test_real_time_vis 245 | test_write 246 | ``` 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Sentence Parse.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Delta-Ark/Geo_Bot-complex/46bcc112a8b7c2bfe063b73cd2a44fb21f4bb933/Sentence Parse.PNG -------------------------------------------------------------------------------- /StatusStreamerOutput.txt: -------------------------------------------------------------------------------- 1 | Status( 2 | contributors=None, 3 | truncated=False, 4 | text=u'#pokemongo @ Alameda, California https://t.co/ksXeNFloaS', 5 | is_quote_status=False, 6 | in_reply_to_status_id=None, 7 | id=751221344306491392, 8 | favorite_count=0, _ 9 | api=, 10 | author= 11 | 12 | User(follow_request_sent=None, 13 | profile_use_background_image=False, 14 | _json={u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 15 | id=31076073, _api=, verified=False, profile_image_url_https=u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', profile_sidebar_fill_color=u'252429', is_translator=False, geo_enabled=True, profile_text_color=u'666666', followers_count=751, protected=False, location=u'Alameda, California ', default_profile_image=False, id_str=u'31076073', utc_offset=-25200, statuses_count=8670, description=u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", friends_count=895, profile_link_color=u'FF0000', profile_image_url=u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', notifications=None, profile_background_image_url_https=u'https://abs.twimg.com/images/themes/theme9/bg.gif', profile_background_color=u'000000', 
profile_banner_url=u'https://pbs.twimg.com/profile_banners/31076073/1359676007', profile_background_image_url=u'http://abs.twimg.com/images/themes/theme9/bg.gif', screen_name=u'ddellamorte', lang=u'en', profile_background_tile=False, favourites_count=885, name=u'Brotha Nero', url=u'http://devourthepodcast.com', created_at=datetime.datetime(2009, 4, 14, 6, 53, 58), contributors_enabled=False, time_zone=u'Pacific Time (US & Canada)', profile_sidebar_border_color=u'FFFFFF', default_profile=False, following=False, listed_count=31), 16 | 17 | 18 | _json={u'contributors': None, u'truncated': False, u'text': u'#pokemongo @ Alameda, California https://t.co/ksXeNFloaS', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 751221344306491392, u'favorite_count': 0, u'source': u'Instagram', u'retweeted': False, u'coordinates': {u'type': u'Point', u'coordinates': [-122.257, 37.764]}, u'timestamp_ms': u'1467940100157', u'entities': {u'user_mentions': [], u'symbols': [], u'hashtags': [{u'indices': [0, 10], u'text': u'pokemongo'}], u'urls': [{u'url': u'https://t.co/ksXeNFloaS', u'indices': [33, 56], u'expanded_url': u'https://www.instagram.com/p/BHlNxzDBgdI/', u'display_url': u'instagram.com/p/BHlNxzDBgdI/'}]}, u'in_reply_to_screen_name': None, u'id_str': u'751221344306491392', u'retweet_count': 0, u'in_reply_to_user_id': None, u'favorited': False, u'user': {u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 19 | DEPRECATED: u’geo': {u'type': u'Point', u'coordinates': [37.764, -122.257]}, 20 | u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'lang': u'und', u'created_at': u'Fri Jul 08 01:08:20 +0000 2016', u'filter_level': u'low', u'in_reply_to_status_id_str': None, u'place': {u'full_name': u'Alameda, CA', u'url': u'https://api.twitter.com/1.1/geo/id/000e96b4e9f8503f.json', u'country': u'United States', u'place_type': u'city', u'bounding_box': {u'type': u'Polygon', u'coordinates': [[[-122.332411, 37.720367], [-122.332411, 37.797229], 
[-122.224562, 37.797229], [-122.224562, 37.720367]]]}, u'country_code': u'US', u'attributes': {}, u'id': u'000e96b4e9f8503f', u'name': u'Alameda'}}, 21 | 22 | coordinates={u'type': u'Point', u'coordinates': [-122.257, 37.764]}, 23 | timestamp_ms=u'1467940100157', 24 | entities={u'user_mentions': [], u'symbols': [], u'hashtags': [{u'indices': [0, 10], u'text': u'pokemongo'}], u'urls': [{u'url': u'https://t.co/ksXeNFloaS', u'indices': [33, 56], u'expanded_url': u'https://www.instagram.com/p/BHlNxzDBgdI/', u'display_url': u'instagram.com/p/BHlNxzDBgdI/'}]}, 25 | in_reply_to_screen_name=None, 26 | in_reply_to_user_id=None, 27 | retweet_count=0, 28 | id_str=u'751221344306491392', 29 | favorited=False, 30 | source_url=u'http://instagram.com', 31 | 32 | 33 | user= 34 | 35 | User( 36 | follow_request_sent=None, profile_use_background_image=False, 37 | 38 | _json={u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 39 | 40 | id=31076073, 41 | _api=, 42 | verified=False, 43 | profile_image_url_https=u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', 44 | profile_sidebar_fill_color=u'252429', 45 | is_translator=False, 46 | geo_enabled=True, 47 | profile_text_color=u'666666', 48 | followers_count=751, 49 | protected=False, 50 | location=u'Alameda, California ', 51 | default_profile_image=False, 52 | id_str=u'31076073', 53 | utcoffset=-25200, 54 | statuses_count=8670, 55 | description=u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", 56 | friends_count=895, 57 | profile_link_color=u'FF0000', 58 | profile_image_url=u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', 59 | notifications=None, 60 | profile_background_image_url_https=u'https://abs.twimg.com/images/themes/theme9/bg.gif', 61 | profile_background_color=u'000000', 62 | 
profile_banner_url=u'https://pbs.twimg.com/profile_banners/31076073/1359676007', 63 | profile_background_image_url=u'http://abs.twimg.com/images/themes/theme9/bg.gif', 64 | screen_name=u'ddellamorte', 65 | lang=u'en', profile_background_tile=False, favourites_count=885, name=u'Brotha Nero', url=u'http://devourthepodcast.com', created_at=datetime.datetime(2009, 4, 14, 6, 53, 58), contributors_enabled=False, time_zone=u'Pacific Time (US & Canada)', profile_sidebar_border_color=u'FFFFFF', default_profile=False, following=False, listed_count=31), 66 | 67 | 68 | geo={u'type': u'Point', u'coordinates': [37.764, -122.257]}, 69 | in_reply_to_user_id_str=None, 70 | possibly_sensitive=False, 71 | lang=u'und', 72 | created_at=datetime.datetime(2016, 7, 8, 1, 8, 20), 73 | filter_level=u'low', 74 | in_reply_to_status_id_str=None, 75 | 76 | place= 77 | Place( 78 | _api=, 79 | country_code=u'US', 80 | url=u'https://api.twitter.com/1.1/geo/id/000e96b4e9f8503f.json', 81 | country=u'United States', 82 | place_type=u'city', 83 | bounding_box=BoundingBox(_api=, type=u'Polygon', coordinates=[ 84 | [ 85 | [-122.332411, 37.720367], [-122.332411, 37.797229], [-122.224562, 37.797229], [-122.224562, 37.720367] 86 | ] 87 | ]), 88 | full_name=u'Alameda, CA', 89 | attributes={}, 90 | id=u'000e96b4e9f8503f', 91 | name=u'Alameda'), 92 | source=u'Instagram', 93 | retweeted=False) 94 | -------------------------------------------------------------------------------- /editor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # editor.py 3 | # Saito 2017 4 | 5 | 6 | """Creates a simple text editor 7 | """ 8 | import curses 9 | from curses.textpad import Textbox 10 | import locale 11 | 12 | 13 | def emacs_textbox(stdscr, initial_text): 14 | stdscr.clear() 15 | instructions = """ 16 | To Save and Exit hit Control-G 17 | 18 | This editing buffer uses Emacs commands (No Control-Y though) 19 | *** A command Control-G is == Control + g (don't capitalize) *** 20 | --------------------------------------------------------------- 21 | Movement: 22 | Use arrow keys 23 | 24 | OR: 25 | Start of line: Control-A 26 | End of line: Control-E 27 | Back Control-B 28 | Forward Control-F 29 | Down line Control-N Cursor down; move down one line. 30 | Previous line Control-P Cursor up; move up one line. 
31 | 32 | COPY + PASTE: Use mouse + keyboard shortcuts to copy and paste 33 | 34 | Deletion: 35 | Delete under cursor Control-D 36 | Delete backwards Control-H 37 | Kill line Control-K 38 | """ 39 | stdscr.addstr(instructions) 40 | stdscr.refresh() 41 | 42 | ending = """------------------------------------------------------\n 43 | EDIT BELOW ONLY 44 | ------------------------------------------------------\n""" 45 | stdscr.addstr(ending) 46 | stdscr.refresh() 47 | stdscr.addstr(initial_text) 48 | stdscr.refresh() 49 | box = Textbox(stdscr, insert_mode=False) # Inf recursion bug when True 50 | box.edit() 51 | message = box.gather() 52 | remove_index = len(ending) + len(instructions) 53 | return message[remove_index + 15:] 54 | 55 | 56 | def create_editor(initial_text): 57 | locale.setlocale(locale.LC_ALL, '') 58 | code = locale.getpreferredencoding() 59 | initial_text = initial_text.encode(code, 'replace') # or 'ignore' 60 | msg = curses.wrapper(emacs_textbox, initial_text) 61 | return msg 62 | 63 | 64 | def main(): 65 | initial_text = u""" 66 | This is my po\xe9m 67 | It is not very clever 68 | But I'm fond of it 69 | """ 70 | msg = create_editor(initial_text) 71 | print msg 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /geo_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # geo_converter.py 3 | # Saito 2016 4 | """This is a module for converting from the the geocoordinate, radius 5 | convention to the bounding box convention. It allows use of the same 6 | parameter file for the Twitter REST API and Streaming API. 7 | 8 | Usage: 9 | bounding_box = convert_geocoordinates( 10 | latitude_degrees, longitude_degrees, radius_miles) 11 | where 12 | bounding_box == [southwest corner, northeast corner] == 13 | [lon1, lat1, lon2, lat2] 14 | 15 | and can be used when calling the streaming API. Notice it's longitude 16 | then latitude! 17 | 18 | """ 19 | from __future__ import division 20 | 21 | import math 22 | import sys 23 | from geopy.geocoders import Nominatim 24 | 25 | 26 | def deg_to_rads(degs): 27 | rads = (degs / 360) * 2 * math.pi 28 | return rads 29 | 30 | 31 | def convert_geocoordinates(latitude_degrees, longitude_degrees, radius_miles): 32 | """latitude and longitude in degrees, radius in miles, are converted 33 | to a bounding box representation, where box = [lat1, lon1, lat2, 34 | lon2] 35 | This doesn't work near the poles! 36 | """ 37 | if latitude_degrees > 85 or latitude_degrees < -85: 38 | print "latitude is >85 or < -85. This won't work near poles!" 
39 | sys.exit(0) 40 | if longitude_degrees > 180 or longitude_degrees < -180: 41 | print "longitude is >180 or < -180" 42 | sys.exit(0) 43 | 44 | radius_km = radius_miles / 0.62137 45 | if radius_km > 100: 46 | print "bounding box may be inaccurate for large radii" 47 | # print radius_km 48 | circ_of_earth_km = 40075.1612 49 | lat_rads = deg_to_rads(latitude_degrees) 50 | circ_of_earth_km_at_lat = math.cos(abs(lat_rads)) * circ_of_earth_km 51 | # print circ_of_earth_km_at_lat 52 | lon_km_per_degree = circ_of_earth_km_at_lat / 360 53 | print "lon_km_per_degree = {} at {}".format(lon_km_per_degree, 54 | latitude_degrees) 55 | lon_delta = radius_km / lon_km_per_degree 56 | # print "longitudinal delta = {}".format(lon_delta) 57 | lon1 = longitude_degrees - lon_delta 58 | lon2 = longitude_degrees + lon_delta 59 | # print lon1 60 | # print lon2 61 | # check if within range: 62 | 63 | # equator to pole distance in km 64 | eq2pole = 10001.965729 65 | lat_km_per_degree = eq2pole / 90 66 | lat_delta = radius_km / lat_km_per_degree 67 | # print lat_delta 68 | lat1 = latitude_degrees - lat_delta 69 | lat2 = latitude_degrees + lat_delta 70 | 71 | # check all points and correct if possible 72 | lat1 = correct_latitude(lat1) 73 | lat2 = correct_latitude(lat2) 74 | lon1 = correct_longitude(lon1) 75 | lon2 = correct_longitude(lon2) 76 | bounding_box = [lon1, lat1, lon2, lat2] 77 | return bounding_box 78 | 79 | 80 | def correct_longitude(lon): 81 | if lon > 180: 82 | return -180 + (lon - 180) 83 | elif lon < -180: 84 | return 180 - (abs(lon) - 180) 85 | else: 86 | return lon 87 | 88 | 89 | def correct_latitude(lat): 90 | if lat > 90 or lat < -90: 91 | print "This doesn't work near the poles!!!!" 92 | sys.exit(0) 93 | return lat 94 | 95 | 96 | def get_bounding_box_from(GeoSearchClass): 97 | latitude = GeoSearchClass._latitude 98 | longitude = GeoSearchClass._longitude 99 | radius = GeoSearchClass._radius 100 | bounding_box = convert_geocoordinates(latitude, longitude, radius) 101 | return bounding_box 102 | 103 | 104 | def get_search_terms_from(GeoSearchClass): 105 | """parses search_term string of form "", "sf", "#sf+#tech" from the 106 | params file and returns as list for use with streaming class 107 | 108 | """ 109 | search_string = GeoSearchClass._search_term 110 | if search_string == "" or search_string is None: 111 | search_terms = None 112 | else: 113 | search_terms = search_string.split("+") 114 | return search_terms 115 | 116 | 117 | # want to get geocoordinates for a location and visa versa 118 | # to do: 119 | # test! 120 | 121 | 122 | def get_geocoords_from_address(address): 123 | """address is a string, like '555 5th Ave. NYC, NY, 12021'. 
This is 124 | searched and an approximate geocoordinate is returned, if possible 125 | in form (latitude, longitude) 126 | 127 | """ 128 | geolocator = Nominatim() # from geopy.geocoders.Nominatim 129 | location = geolocator.geocode(address) 130 | lat = location.latitude 131 | lon = location.longitude 132 | coords = (lat, lon) 133 | 134 | # do some check to see if coords were returned 135 | if not coords: 136 | return None 137 | 138 | # maybe do some coordinate conversion 139 | print "found these coords = {}".format(coords) 140 | back_projected_address = geolocator.reverse("{}, {}".format(lat, lon)) 141 | print "back_projected_address = {}".format(back_projected_address) 142 | return coords 143 | 144 | 145 | # def get_timezone_from_coordinates(latitude, longitude): 146 | # """given a latitude and a longitude, this returns the IANA Time Zone 147 | # Database (Olson database), which can be used to get a local time and 148 | # returns a pytz tzinfo timezone object""" 149 | 150 | # from geopy.geocoders import GoogleV3 151 | # g=GoogleV3(api_key=None, domain='maps.googleapis.com', scheme='https', client_id=None, secret_key=None, timeout=1, proxies=None) 152 | # timezone = g.timezone(latitude, longitude) 153 | # return timezone 154 | 155 | 156 | if __name__ == '__main__': 157 | print __doc__ 158 | 159 | # run some tests 160 | bounding_box = convert_geocoordinates(0, -122.4093, 0) 161 | print "longitudinal precision should be 111.32" 162 | print "should be same first and second" 163 | print "bounding_box = {}".format(bounding_box) 164 | 165 | bounding_box = convert_geocoordinates(37.7821, -122.4093, 1) 166 | print "should be ~ [-122.426, 37.771, -122.398, 37.790 ]" 167 | print "bounding_box = {}".format(bounding_box) 168 | 169 | bounding_box = convert_geocoordinates(45, -179.99, 10) 170 | print "longitudinal precision should be 78.84" 171 | print "should be sensible around Meridian" 172 | print "bounding_box = {}".format(bounding_box) 173 | 174 | bounding_box = convert_geocoordinates(84.7821, -122, 10) 175 | print "longitudinal precision should be ~9" 176 | print "should be sensible around pole" 177 | print "bounding_box = {}".format(bounding_box) 178 | 179 | get_geocoords_from_address('901 S Van Ness Ave., San Francisco, CA, 94110') 180 | -------------------------------------------------------------------------------- /geosearchclass.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import ast 3 | import codecs 4 | import os 5 | import sys 6 | 7 | import utils 8 | 9 | 10 | class GeoSearchClass(object): 11 | """Create a geo search with data validation 12 | 13 | For examples of usages, see geotweets.py 14 | 15 | Usage: 16 | g = GeoSearchClass() 17 | g.latitude =37.7821 18 | g.longitude =-122.4093 19 | g.radius =3 20 | g.search_term="#SF" 21 | g.result_type='mixed' 22 | g.count = 15 23 | 24 | Simple example: 25 | g = GeoSearchClass() 26 | g.search() 27 | g.print_search_results() 28 | 29 | OR to properly initialize: 30 | g = GeoSearchClass(params_filename, consumer_key_and_secret_filename) 31 | 32 | To initialize the geosearchclass with a parameter file and the 33 | consumer key and secret file: 34 | g = GeoSearchClass(params_filename,consumer_key_and_secret_filename) 35 | and use: 36 | g.search() 37 | g.print_search_results() 38 | """ 39 | 40 | def __init__(self, params_file='params.txt', 41 | keys_file="consumerkeyandsecret", 42 | api=None): 43 | if params_file: 44 | self.set_params_from_file(params_file) 45 | else: 46 | 
self.use_default_params() 47 | self.keys_file = keys_file 48 | if api: 49 | self.api = api 50 | self.credits_retrieved = True 51 | # elif self.get_creds(keys_file): 52 | # self.credits_retrieved = True 53 | else: 54 | self.credits_retrieved = False 55 | 56 | def use_default_params(self): 57 | self._search_term = None 58 | self._result_type = "mixed" 59 | self._count = 15 60 | self._latitude = 37.7821 61 | self._longitude = -122.4093 62 | self._radius = 3 63 | self._geo_string = None 64 | self.search_results = None 65 | 66 | def set_params_from_file(self, filename): 67 | with codecs.open(filename, encoding='utf-8', mode='rU') as f: 68 | params = dict() 69 | params.update(ast.literal_eval(f.read())) 70 | for key in params.keys(): 71 | print key + ' : ' + str(params[key]) 72 | self._latitude = params['latitude'] 73 | self._longitude = params['longitude'] 74 | self._radius = params['radius'] 75 | self._search_term = params['search_term'] 76 | self._result_type = params['result_type'] 77 | self._count = params['count'] 78 | self.tweet_text = params['tweet_text'] 79 | 80 | def search(self): 81 | '''Perform a geolocated search using the class attributes 82 | 'search_term', 'result_type', 'count', and 'geo_string'. 83 | 84 | Requires an api object as returned by the tweepy module. 85 | 86 | USAGE: 87 | search_results = search(api) 88 | 89 | See: http://docs.tweepy.org/en/v3.5.0/api.html#API.search 90 | ''' 91 | if not self.credits_retrieved: 92 | (self.api, __) = utils.get_credentials(self.keys_file, True) 93 | self.credits_retrieved = True 94 | geo_string = getattr(self, "geo_string") 95 | if self._geo_string is None: 96 | raise Exception("initialize geo string") 97 | search_results = self.api.search(q=self._search_term, 98 | geocode=geo_string, 99 | result_type=self._result_type, 100 | count=self._count) 101 | self.search_results = search_results 102 | return self.search_results 103 | 104 | def print_search_results(self): 105 | '''Pretty prints the list of SearchResult objects returned using the 106 | api.search method. 107 | 108 | The results are formated and give some info about the tweet. 109 | 110 | ''' 111 | 112 | # printSROInfo() #This is for SRO object investigation 113 | search_results = self.search_results 114 | print "Actual number of tweets returned from Twitter: " + str(len( 115 | search_results)) 116 | 117 | for sr in search_results: 118 | print 119 | print '@' + sr.user.screen_name 120 | if sr.geo: 121 | print 'coordinates = ' + str((sr.geo)['coordinates']) 122 | print "created_at = " + str(sr.created_at) 123 | print "tweet id: " + str(sr.id) 124 | print "retweet_count = " + str( 125 | sr.retweet_count) + "favorite_count = " + str( 126 | sr.favorite_count) 127 | print sr.text 128 | 129 | def write_search_results(self, output_file=u'output.txt'): 130 | '''Writes search results to output file, defaults to "output.txt". 131 | 132 | 133 | USAGE: 134 | write_results( output_file = 'output.txt') 135 | 136 | 137 | Details: It uses unicode encoding to capture all of the 138 | possible tweet characters. It gets the filesystemencoding for 139 | each OS. 
140 | 141 | ''' 142 | search_results = self.search_results 143 | tweet_text = u'' 144 | for sr in search_results: 145 | coords = u'' 146 | if sr.geo: 147 | coords = u' coordinates = ' + str((sr.geo)['coordinates']) 148 | s = u'\n\n\n@' + sr.user.screen_name + coords + u' : \n' + sr.text 149 | 150 | tweet_text = tweet_text + s 151 | 152 | # print tweet_text 153 | # print "tweet text type = " + str(type(tweet_text)) 154 | fileSystemEncoding = sys.getfilesystemencoding() 155 | # OUTPUT_FILE = os.path.expanduser(u'./output.txt') 156 | OUTPUT_FILE = os.path.expanduser(u'./' + output_file) 157 | # with codecs.open(OUTPUT_FILE, encoding='utf-8', mode="w") as f: 158 | with codecs.open(OUTPUT_FILE, 159 | encoding=fileSystemEncoding, 160 | mode="w") as f: 161 | f.write(tweet_text) 162 | return 163 | 164 | def json_search_results(self, output_file='search_results.json'): 165 | '''Writes search results as json to output file 'search_results.json 166 | 167 | 168 | USAGE: 169 | json_search_results( output_file = 'search_results.json') 170 | 171 | 172 | Details: It uses unicode encoding to capture all of the 173 | possible tweet characters. It gets the filesystemencoding for 174 | each OS. 175 | 176 | ''' 177 | import json 178 | print 'writing results to file {}'.format(output_file) 179 | fileSystemEncoding = sys.getfilesystemencoding() 180 | # OUTPUT_FILE = os.path.expanduser(u'./output.txt') 181 | OUTPUT_FILE = os.path.expanduser(u'./' + output_file) 182 | # with codecs.open(OUTPUT_FILE, encoding='utf-8', mode="w") as f: 183 | with codecs.open(OUTPUT_FILE, 184 | encoding=fileSystemEncoding, 185 | mode="w") as f: 186 | for sr in self.search_results: 187 | j = json.dumps(sr._json, indent=1) 188 | f.write(j) 189 | return 190 | 191 | def _print_SRO_info(self): 192 | ''' 193 | This gives a verbose amount of info about the SearchResult object 194 | 195 | USAGE: 196 | print_SRO_info() 197 | ''' 198 | search_results = self.search_results 199 | print '\n\n\n\n' 200 | print 'The methods of each SearchResult object :' 201 | print dir(search_results[0]) 202 | print '\n\n\n\n' 203 | print 'The methods of each User object in a SRO:' 204 | print dir(search_results[0].user) 205 | print '\n\n\n\n' 206 | print 'Example of the first SRO object:' 207 | sr1 = search_results[0] 208 | print sr1.created_at 209 | # print sr1.retweets 210 | print sr1.retweet_count 211 | # print sr1.favorite 212 | # print sr1.favorited 213 | print sr1.favorite_count 214 | 215 | @property 216 | def count(self): 217 | "Number of results to return" 218 | return self._count 219 | 220 | @count.setter 221 | def count(self, value): 222 | if isinstance(value, basestring): 223 | value = float(value) 224 | if isinstance(value, (float, int)): 225 | if not (value > 0 and value < 101 and value == int(value)): 226 | raise ValueError( 227 | "count is '" + str(value) + 228 | "' but count must be an integer and 0 < count < 101") 229 | self._count = value 230 | 231 | @property 232 | def result_type(self): 233 | "Type of results to return: mixed, popular or recent" 234 | return self._result_type 235 | 236 | @result_type.setter 237 | def result_type(self, rt): 238 | if not (rt == "mixed" or rt == "popular" or rt == "recent"): 239 | raise ValueError( 240 | "result_type must be 'mixed', 'recent', or 'popular' NOT '" + 241 | str(rt) + "'") 242 | self._result_type = rt 243 | 244 | @property 245 | def latitude(self): 246 | "90 > Latitude > -90" 247 | return self._latitude 248 | 249 | @latitude.setter 250 | def latitude(self, value): 251 | if (value == ''): 252 | raise 
ValueError("You must put in a value") 253 | value = float(value) 254 | if not (value > -90.0 and value < 90.0): 255 | raise ValueError("latitude must be in bounds: 90.0>latitude>-90.0") 256 | self._latitude = value 257 | 258 | @property 259 | def longitude(self): 260 | "180 > Longitude > -180" 261 | return self._longitude 262 | 263 | @longitude.setter 264 | def longitude(self, value): 265 | if (value == ''): 266 | raise ValueError("You must put in a value") 267 | value = float(value) 268 | if not (value > -180.0 and value < 180.0): 269 | raise ValueError( 270 | "longitude must be in bounds: 180.0>longitude>-180.0") 271 | self._longitude = value 272 | 273 | @property 274 | def radius(self): 275 | "Radius of search, must be >0" 276 | return self._radius 277 | 278 | @radius.setter 279 | def radius(self, value): 280 | if (value == ''): 281 | raise ValueError("You must put in a value") 282 | value = float(value) 283 | if not (value > 0): 284 | raise ValueError("radius must be > 0.0 miles") 285 | self._radius = value 286 | 287 | @property 288 | def geo_string(self): 289 | "Formats the geo string using latitude, longitude and radius" 290 | self._geo_string = str(self._latitude) + "," + \ 291 | str(self._longitude) + "," + str(self._radius) + "mi" 292 | return self._geo_string 293 | 294 | 295 | def att_test(obj, atr, val_list): 296 | ''' 297 | Perform a unit test on attributes of a class 298 | 299 | USAGE: 300 | att_test(this_object, attribute_name_as_string, values_to_test_as_list) 301 | ''' 302 | print "\n\nTesting " + atr + " validation" 303 | for val in val_list: 304 | try: 305 | print "trying to set attribute to " + str(val) 306 | setattr(obj, atr, val) 307 | except ValueError as e: 308 | print e 309 | 310 | 311 | def main(): 312 | c = GeoSearchClass() 313 | 314 | print c.__doc__ 315 | print c.__dict__ 316 | # att_test(c, "count", [1,35, 101, -1, 3.5, "hello", "15"]) 317 | # att_test(c, "result_type", ["mixed","popular","recent","other",15, " mIxEd"]) 318 | # att_test(c,"latitude",[0, -90, 90, 300, "-50", "hello", 1.3]) 319 | # att_test(c,"longitude",[0, -180, 180, 300, "-100", "hello", 1.3]) 320 | # att_test(c,"radius",[0, -1, 10, 100, 1000]) 321 | print "\n\ncurrent geo_string " + c.geo_string 322 | print c.result_type 323 | 324 | 325 | if __name__ == '__main__': 326 | main() 327 | -------------------------------------------------------------------------------- /ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # ngrams.py 3 | # Saito 2017 4 | 5 | import random 6 | 7 | import utils 8 | 9 | 10 | def make_ngram(text, n): 11 | ngram = dict() 12 | tokens = utils.tokenize_normal_words(text) 13 | i = 0 14 | while i < (len(tokens)-(n-1)): 15 | l = list() 16 | for j in range(n-1): 17 | token = tokens[i+j] 18 | token = token.lower() 19 | # print token 20 | l.append(token) 21 | key = tuple(l) 22 | # print key 23 | value = tokens[i+n-1] 24 | value = value.lower() 25 | if key in ngram: 26 | ngram[key].append(value) 27 | else: 28 | ngram[key] = list() 29 | ngram[key].append(value) 30 | i += 1 31 | return ngram 32 | 33 | 34 | def generate(ngram, seed): 35 | """given an ngram dictionary and a string or tuple of words, this \ 36 | returns a word. 
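    For example, with the bigram dictionary {('this',): ['is']},
    generate(ngram, 'this') returns 'is' (this is the case exercised in
    main() below); a seed not present in the dictionary returns the
    empty string.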
For efficiency, pass in all words as a list""" 37 | if type(seed) is not tuple: 38 | l = list() 39 | tokens = utils.tokenize_normal_words(seed) 40 | tokens = [t.lower() for t in tokens] 41 | l.extend(tokens) 42 | seed = tuple(l) 43 | 44 | word = "" 45 | if seed in ngram: 46 | word = random.choice(ngram[seed]) 47 | # print "found in dictionary" 48 | # print ngram[seed] 49 | 50 | # elif words is None: 51 | # print "Combining all dictionary values." 52 | # words = sum(ngram.values(), []) 53 | # word = random.choice(words) 54 | # else: 55 | # word = random.choice(words) 56 | return word 57 | 58 | 59 | def make_bigram_trigram_dictionary(text): 60 | bigram = make_ngram(text, 2) 61 | # print bigram 62 | trigram = make_ngram(text, 3) 63 | # print trigram 64 | bigram.update(trigram) 65 | # print "printing bigram" 66 | # print bigram 67 | return bigram 68 | 69 | 70 | def main(): 71 | initial_text = u""" 72 | This is my poem. 73 | It is not very clever, 74 | But I'm fond of it. 75 | """ 76 | 77 | print initial_text 78 | ngram = make_bigram_trigram_dictionary(initial_text) 79 | word = generate(ngram, 'this') 80 | print "response should be 'is'" 81 | print word 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /p_files/LA.txt: -------------------------------------------------------------------------------- 1 | #LA 2 | 3 | {"latitude" : 34.043931 4 | , 5 | "longitude": -118.243936 6 | , 7 | "radius" : 100 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/clive.txt: -------------------------------------------------------------------------------- 1 | #Clive, Iowa 2 | 3 | {"latitude" : 41.608593 4 | , 5 | "longitude": -93.788713 6 | , 7 | "radius" : 100 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/nyc.txt: -------------------------------------------------------------------------------- 1 | #NYC 2 | 3 | {"latitude" : 40.734073 4 | , 5 | "longitude": -73.990663 6 | , 7 | "radius" : 10 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/nyc.txt~: -------------------------------------------------------------------------------- 1 | #NYC 2 | 3 | {"latitude" : 40.734073 4 | , 5 | "longitude": -73.990663 6 | , 7 | "radius" : 10 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /params.txt: -------------------------------------------------------------------------------- 1 | # This file contains the parameters for the geo located search. 2 | 3 | # You must list all parameters in python dictionary format as shown to 4 | # use this input method. Just follow the example below. Note the 5 | # commas after each entry. Note: latitude, longitude, radius and 6 | # count all take numbers. Radius is set in miles and can be a decimal 7 | # or whole number like 0.1 or 3. search_term, result_type are both 8 | # strings and must be in quotes. search_term can be set to, for 9 | # example "#SF+tech" to use one hash tag search terms and one normal 10 | # word. 
It can also just be a word in quotes, "sf" or the word None 11 | # without quotes for no term. Result_type can be either "mixed", 12 | # "popular", or "recent". Count must be an integer between 0 and 100 13 | # OR None. Finally, a term called "tweet_text" is optional and is for 14 | # posting tweets on your behalf using the scan_and_respond tool. It 15 | # should be a unicode string (hence the preceding 'u'. 16 | # 17 | # Example of params.txt: 18 | # {"latitude" : 37.7821, 19 | # "longitude": -122.4093, 20 | # "radius" : 10, 21 | # "search_term" : "#SF+tech", 22 | # "result_type" : "mixed", 23 | # "count" : 100} 24 | 25 | 26 | 27 | {"latitude" : 37.772296 28 | , 29 | "longitude": -122.412911 30 | , 31 | "radius" : 10 32 | , 33 | "search_term" : "" 34 | , 35 | "result_type" : "mixed" 36 | , 37 | "count" : 100 38 | , 39 | "tweet_text" : u'''WASSUP! This tweet was written using the saito 'geotweets' project on github! Check it out!''' 40 | } 41 | 42 | 43 | -------------------------------------------------------------------------------- /real_time_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # real_time_vis.py 3 | # Saito 2015 4 | 5 | """This grabs tweets and visualizes them in real time using params.txt. 6 | 7 | You can get the tweets using the streaming API or the REST API. The 8 | rest API requires 5 second pauses between successive calls to the 9 | twitter server. This is the default. Use the --stream or -s flag to 10 | enable the streaming API. The Streaming API gets all tweets that are 11 | geotagged within the bounding box. The geolocation is approximately 12 | converted, by inscribing a bounding box square in the circle around 13 | the geocoordinates. The tweets are also saved in JSON form to 14 | a file called 'tweets.json'. 15 | 16 | USAGE: 17 | $ python real_time_vis.py [-h][-d][-f FILENAME][-n NUMBER][-s][-a ADDRESS] 18 | OR for help, try: 19 | $ ./real_time_vis.py -h 20 | OR: 21 | $ python real_time_vis.py 22 | 23 | 24 | Example using default parameter file 'params.txt', with 20 top words 25 | to display, on a growing chart: 26 | 27 | $ ./real_time_vis --number 20 28 | Or using the streaming API with an address: 29 | $ ./real_time_vis -n 20 -s -a "175 5th Avenue NYC" 30 | 31 | 32 | TO EXIT: 33 | To exit one of these multithreaded programs, use a keyboard interrupt 34 | like CTRL+C. 
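The bounding-box conversion mentioned above is roughly of the following
form (an illustrative sketch only, not the exact code; the real logic
lives in geo_converter.get_bounding_box_from and may differ in detail):

    half_side = radius / sqrt(2)      # square inscribed in the circle
    dlat = half_side / 69.0           # ~69 miles per degree of latitude
    dlon = half_side / (69.0 * cos(radians(latitude)))
    bounding_box = [longitude - dlon, latitude - dlat,   # SW corner
                    longitude + dlon, latitude + dlat]   # NE corner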
35 | 36 | """ 37 | from __future__ import division 38 | 39 | import Queue 40 | import argparse 41 | import sys 42 | 43 | import matplotlib.pyplot as plt 44 | 45 | import geo_converter 46 | import geosearchclass 47 | import streamer 48 | import utils 49 | 50 | 51 | global stream # so that CTRL + C kills stream 52 | 53 | 54 | def update_fdist(fdist, new_words): 55 | for word in new_words: 56 | if word in fdist: 57 | fdist[word] += 1 58 | else: 59 | fdist[word] = 1 60 | return fdist 61 | 62 | 63 | def remove_infrequent_words(samples, fdist): 64 | trimmed_samples = [] 65 | for item in samples: 66 | if fdist[item] > 2: 67 | trimmed_samples.append(item) 68 | return trimmed_samples 69 | 70 | 71 | def updating_plot(geosearchclass, number_of_words, grow=True): 72 | search_results = geosearchclass.search() 73 | filtered_words = utils.tokenize_and_filter(search_results) 74 | fdist = utils.get_freq_dist(filtered_words) 75 | # set up plot 76 | samples = [item for item, _ in fdist.most_common(number_of_words)] 77 | freqs = [fdist[sample] for sample in samples] 78 | plt.grid(True, color="silver") 79 | plt.plot(freqs, range(len(freqs))) 80 | plt.yticks(range(len(samples)), [s for s in samples]) 81 | plt.ylabel("Samples") 82 | plt.xlabel("Counts") 83 | plt.title("Top Words Frequency Distribution") 84 | plt.ion() 85 | plt.show() 86 | 87 | # set up loop 88 | old_ids = set([s.id for s in search_results]) 89 | for i in xrange(100): 90 | plt.pause(5) 91 | # use mixed above, change to recent here 92 | geosearchclass.result_type = "recent" 93 | # perturbation study 94 | # if i%2: # for testing purposes 95 | # # #change location every odd time to nyc 96 | # # geosearchclass.latitude =40.734073 97 | # # geosearchclass.longitude =-73.990663 98 | # # perturb latitude 99 | # geosearchclass.latitude =geosearchclass.latitude + .001 100 | 101 | # else: 102 | # #now back to sf 103 | # # geosearchclass.latitude = 37.7821 104 | # # geosearchclass.longitude = -122.4093 105 | # geosearchclass.longitude =geosearchclass.longitude + .001 106 | 107 | search_results = geosearchclass.search() 108 | new_search_results = utils.new_tweets(search_results, old_ids) 109 | if new_search_results: 110 | filtered_words = utils.tokenize_and_filter(new_search_results) 111 | fdist = update_fdist(fdist, filtered_words) 112 | if grow: 113 | newsamples = [item 114 | for item, _ in fdist.most_common(number_of_words) 115 | ] 116 | s1 = set(newsamples) 117 | s2 = set(samples) 118 | s1.difference_update(s2) 119 | if s1: 120 | print "New words: " + str(list(s1)) 121 | newsamples = list(s1) 122 | samples.extend(newsamples) 123 | plt.yticks(range(len(samples)), [s for s in samples]) 124 | freqs = [fdist[sample] for sample in samples] 125 | plt.plot(freqs, range(len(freqs))) 126 | if grow: 127 | plt.draw() 128 | print '%d new tweet(s)' % len(new_search_results) 129 | old_ids.update(set([s.id for s in new_search_results])) 130 | else: 131 | print "no updates" 132 | 133 | # g = geosearchclass.GeoSearchClass() 134 | # g.set_params_from_file('params.txt') 135 | # search_results = g.search() 136 | 137 | 138 | def updating_stream_plot(q, number_of_words=30): 139 | """This plot uses the streaming API to get real time twitter 140 | information from a given region, determined by a geo-coordinate 141 | bounding box. The upper left and lower right determine the 142 | bounding box. 143 | 144 | q is a queue instance, which holds tweets 145 | 146 | number_of_words determines the average number of words in the 147 | plot. 
Once the plot reaches 2 x number_of_words, it is shrunk down 148 | to the new set of words and starts growing again 149 | 150 | To exit the program early, hit CTRL + Z to stop the python script 151 | and then CTRL + D twice to kill the terminal process and close the 152 | window. 153 | 154 | """ 155 | setup = False 156 | fdist = None 157 | samples = None 158 | draw_time = 0.1 159 | samples = [] 160 | plt.ion() 161 | plt.grid(True, color="silver") 162 | 163 | for i in range(100000): 164 | status = q.get() 165 | search_results = [status] 166 | while not q.empty(): 167 | print "getting another tweet" 168 | status = q.get() 169 | search_results.append(status) 170 | 171 | if not setup: 172 | print "Gathering enough data to begin plotting" 173 | while len(samples) < 1: 174 | status = q.get() 175 | search_results.append(status) 176 | filtered_words = utils.tokenize_and_filter(search_results) 177 | if fdist is None: 178 | fdist = utils.get_freq_dist(filtered_words) 179 | else: 180 | fdist = update_fdist(fdist, filtered_words) 181 | n_words = min(10, len(fdist)) 182 | samples = [item for item, _ in fdist.most_common(n_words)] 183 | # print "len(samples) = {}".format(len(samples)) 184 | samples = remove_infrequent_words(samples, fdist) 185 | freqs = [fdist[sample] for sample in samples] 186 | plt.plot(freqs, range(len(freqs))) 187 | plt.yticks(range(len(samples)), [s for s in samples]) 188 | plt.ylabel("Samples") 189 | plt.xlabel("Counts") 190 | plt.title("Top Words Frequency Distribution") 191 | plt.show() 192 | plt.pause(draw_time) 193 | setup = True 194 | 195 | else: 196 | filtered_words = utils.tokenize_and_filter(search_results) 197 | fdist = update_fdist(fdist, filtered_words) 198 | newsamples = [item 199 | for item, _ in fdist.most_common(number_of_words)] 200 | newsamples = remove_infrequent_words(newsamples, fdist) 201 | s1 = set(newsamples) 202 | s2 = set(samples) 203 | s1.difference_update(s2) 204 | if s1: 205 | print "New words: " + str(list(s1)) 206 | newsamples = list(s1) 207 | samples.extend(newsamples) 208 | if len(samples) > 2*number_of_words: 209 | samples = newsamples 210 | plt.close() 211 | plt.yticks(range(len(samples)), [s for s in samples]) 212 | freqs = [fdist[sample] for sample in samples] 213 | plt.plot(freqs, range(len(freqs))) 214 | plt.draw() 215 | plt.pause(draw_time) 216 | kill_plot() 217 | return 218 | 219 | 220 | def kill_plot(): 221 | print "turning interactive off" 222 | plt.ioff() 223 | print "closing plot" 224 | plt.close() 225 | return 226 | 227 | 228 | def get_parser(): 229 | """ Creates a command line parser 230 | 231 | --doc -d 232 | --help -h 233 | --filename -f 234 | --grow -g 235 | --number -n 236 | """ 237 | # Create command line argument parser 238 | parser = argparse.ArgumentParser( 239 | description='Create an updating word frequency distribution chart.') 240 | 241 | parser.add_argument('-d', 242 | '--doc', 243 | action='store_true', 244 | help='print module documentation and exit') 245 | parser.add_argument( 246 | '-f', 247 | '--filename', 248 | help='''specify a FILENAME to use as the parameter file. 249 | If not specified, will use 'params.txt'.''') 250 | parser.add_argument( 251 | '-a', 252 | '--address', 253 | help='''give an ADDRESS to get geocoordinates for. 
254 | Put the address in quotes''') 255 | # parser.add_argument('-r', 256 | # '--rest', 257 | # action='store_true', 258 | # help='Use the REST API to create a growing chart\ 259 | # as new words arrive.') 260 | parser.add_argument('-n', 261 | '--number', 262 | help='specify NUMBER of words to display. The\ 263 | streaming plot will grow to twice this number\ 264 | before shrinking again') 265 | parser.add_argument('-s', 266 | '--stream', 267 | action='store_true', 268 | help='Use streaming API to update a growing plot. \ 269 | Otherwise, results will be batched.\ 270 | Use Interrupt signal, like CTRL + C to exit. \ 271 | This uses the LOCATION and SEARCH_TERM from\ 272 | parameter file. The tweets are saved to tweets.json.') 273 | return parser 274 | 275 | 276 | def main(): 277 | parser = get_parser() 278 | args = parser.parse_args() 279 | # print args 280 | # print args.help 281 | 282 | if args.doc: 283 | print __doc__ 284 | import sys 285 | sys.exit(0) 286 | 287 | if args.number: 288 | number = int(args.number) 289 | else: 290 | number = 30 291 | 292 | g = geosearchclass.GeoSearchClass() 293 | 294 | if args.filename: 295 | print 'Using parameters from ' + str(args.filename) 296 | g.set_params_from_file(args.filename) 297 | else: 298 | print "Using search values from params.txt" 299 | g.set_params_from_file('params.txt') 300 | 301 | if args.address: 302 | print "Finding geocoordates for address:\n{}".format(args.address) 303 | coords = geo_converter.get_geocoords_from_address(args.address) 304 | if coords: 305 | g.latitude = coords[0] 306 | print "Found this latitude:" 307 | print g.latitude 308 | g.longitude = coords[1] 309 | print "Found this longitude:" 310 | print g.longitude 311 | else: 312 | print "Failed to find coordinates. Exiting." 313 | sys.exit() 314 | 315 | if args.stream: 316 | print "using streaming queue" 317 | q = Queue.Queue() 318 | bounding_box = geo_converter.get_bounding_box_from(g) 319 | search_terms = geo_converter.get_search_terms_from(g) 320 | print "bounding_box = {}".format(bounding_box) 321 | print "search_terms = {}".format(search_terms) 322 | global stream 323 | fn = 'tweets.json' 324 | stream = streamer.start_stream(q, bounding_box, fn, search_terms) 325 | updating_stream_plot(q, number) 326 | else: 327 | print "using REST API updating plot" 328 | updating_plot(g, number, True) # set grow flag to True 329 | 330 | 331 | if __name__ == '__main__': 332 | try: 333 | main() 334 | except KeyboardInterrupt: 335 | print "Main function interrupted" 336 | if "stream" in globals(): 337 | streamer.kill_stream(stream) 338 | kill_plot() 339 | sys.exit() 340 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # sample.py 3 | # Saito 2015 4 | 5 | """This program is for grabbing and saving a sample of geo-located tweets 6 | 7 | 8 | USAGE: 9 | $ ./sample.py [-h][-d][-v][-f FILENAME][-o OUTPUT][-vis] 10 | 11 | Print command line help: 12 | $ ./sample.py --help (or just -h) 13 | 14 | Example: This uses parameter file 'params.txt', prints results to 15 | command line and writes them to 'out.txt': 16 | $ ./sample.py --verbose --filename params.txt --output out.txt 17 | 18 | The program requires a file in this folder called consumerkeyandsecret 19 | which contains only a consumer key on the first line and consumer 20 | secret (the longer one) on the second line. See README. 
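Note that searching only needs those first two lines (application-only
authorization). Tools that post tweets or use the streaming API also
need an access token on the third line and an access token secret on
the fourth line; see utils.get_credentials for the exact format.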
21 | 22 | The program can optionally take a parameter file as input. Please see 23 | the file "params.txt" for an example. 24 | 25 | Example of params.txt: 26 | {"latitude" : 37.7821, 27 | "longitude": -122.4093, 28 | "radius" : 10, 29 | "search_term" : "#SF+tech", 30 | "result_type" : "mixed", 31 | "count" : 15} 32 | 33 | """ 34 | 35 | import sys 36 | import argparse 37 | import geosearchclass 38 | 39 | 40 | def get_parser(): 41 | """ Creates a command line parser 42 | 43 | --doc -d 44 | --help -h 45 | --filename -f 46 | --verbose -v 47 | --output -o 48 | --visualize -vis 49 | --default 50 | """ 51 | 52 | parser = argparse.ArgumentParser( 53 | description='Perform a geo-located search.') 54 | 55 | parser.add_argument( 56 | '-d', '--doc', action='store_true', 57 | help='print module documentation and exit') 58 | parser.add_argument( 59 | '-f', '--filename', 60 | help='''specify a FILENAME to use as the parameter file. 61 | If not specified, will use 'params.txt'.''') 62 | parser.add_argument( 63 | '-v', '--verbose', action='store_true', 64 | help='additionally print output to command line') 65 | parser.add_argument( 66 | '--default', action='store_true', 67 | help="""ignore parameter file and use default search 68 | terms from geosearchclass""") 69 | parser.add_argument( 70 | '-o', '--output', 71 | help='''specify an OUTPUT file to write to. 72 | Default is output.txt''') 73 | parser.add_argument( 74 | '-j', '--json', 75 | help='''specify an OUTPUT JSON file to write to.''') 76 | parser.add_argument('-vis', '--visualize', 77 | action='store_true', help='visualize using nlp tools') 78 | 79 | # automatically grabs arguments from sys.argv[] 80 | 81 | return parser 82 | 83 | 84 | def main(): 85 | 86 | parser = get_parser() 87 | args = parser.parse_args() 88 | 89 | if args.doc: 90 | print __doc__ 91 | sys.exit() 92 | 93 | g = geosearchclass.GeoSearchClass() 94 | 95 | if args.filename: 96 | print 'Using parameters from ' + str(args.filename) 97 | # turn parameter file into dictionary 98 | g.set_params_from_file(args.filename) 99 | else: 100 | if args.default: 101 | print 'Using default search terms' 102 | else: 103 | print 'Using parameters from params.txt' 104 | g.set_params_from_file('params.txt') 105 | 106 | g.search() 107 | # print formatted results with extra info to terminal 108 | if args.verbose: 109 | g.print_search_results() 110 | 111 | if args.output: 112 | g.write_search_results(args.output) 113 | else: 114 | g.write_search_results() 115 | 116 | if args.json: 117 | g.json_search_results(args.json) 118 | 119 | if args.visualize: 120 | import utils 121 | filtered_words = utils.tokenize_and_filter(g.search_results) 122 | utils.visualize(filtered_words) 123 | 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /scan_and_respond.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # real_time_vis.py 3 | # Saito 2016 4 | 5 | """ 6 | Scans tweets and asks the user to verify them before sending a tweet response. 7 | 8 | A queue is created of tweets as they arrive via the REST API. The user 9 | is then asked to look over these tweets and decide if they are 10 | relevant. If they are, the relevant parts are saved to a JSON file. If 11 | they respond flag -r was passed, a public tweet is sent out with the 12 | user tagged in it. 
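Each saved entry is the simplified tweet returned by
utils.get_simplified_tweet, i.e. a list of the form
[user, isotime, id_string, text, loc_name, loc]; responder() later
reads the user, tweet id and text back out of these entries when
composing replies.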
13 | 14 | """ 15 | 16 | from __future__ import division 17 | 18 | import Queue 19 | import argparse 20 | import codecs 21 | import json 22 | import threading 23 | import sys 24 | import time 25 | 26 | import geo_converter 27 | import geosearchclass 28 | import tweeter 29 | import utils 30 | from utils import new_tweets 31 | 32 | 33 | def scan(geosearchclass, q): 34 | global keep_scanning 35 | search_results = geosearchclass.search() 36 | old_ids = [sr.id for sr in search_results] 37 | for s in search_results: 38 | q.put(s) 39 | while keep_scanning: 40 | for i in range(5): 41 | if keep_scanning: 42 | time.sleep(1) 43 | else: 44 | return 45 | geosearchclass.result_type = "recent" 46 | search_results = geosearchclass.search() 47 | new_search_results = new_tweets(search_results, old_ids) 48 | if new_search_results: 49 | for nsr in new_search_results: 50 | q.put(nsr) 51 | return 52 | 53 | 54 | def verify(geosearchclass, filename): 55 | q = Queue.Queue() 56 | global keep_scanning 57 | keep_scanning = True 58 | thread = threading.Thread(target=scan, args=(geosearchclass, q)) 59 | thread.daemon = True 60 | thread.start() 61 | respond = False 62 | with codecs.open(filename, encoding='utf-8', mode='a') as json_file: 63 | json_file.seek(0) 64 | json_file.truncate() 65 | 66 | print """\n\n\tThis program will present a series of tweets and ask for you to 67 | verify if they should be responded to. If so, they will be saved 68 | to the JSON file. When you quit scanning, the public tweets will 69 | be sent out.\n""" 70 | 71 | print """Would you like to send tweet responses at the end of this verification 72 | session?""" 73 | response = "" 74 | while response != 'y' and response != 'n': 75 | response = raw_input("[y for Yes, n for No] : ") 76 | print response 77 | if response == 'y': 78 | respond = True 79 | elif response == 'n': 80 | respond = False 81 | 82 | first = True 83 | while True: 84 | if q.empty(): 85 | time.sleep(5) 86 | continue 87 | status = q.get() 88 | print "\n\nVerify if this tweet is what you want:" 89 | simplified_tweet = utils.get_simplified_tweet(status) 90 | response = "" 91 | while response != 'y' and response != 'n' and response != 'q': 92 | response = raw_input("[y for Yes, n for No, q for Quit] : ") 93 | if response == 'y': 94 | j = json.dumps(simplified_tweet, indent=1) 95 | if first: 96 | json_file.write('[\n') 97 | json_file.write(j) 98 | first = False 99 | continue 100 | json_file.write(',\n') 101 | json_file.write(j) 102 | elif response == 'n': 103 | continue 104 | elif response == 'q': 105 | keep_scanning = False 106 | thread.join() 107 | json_file.write('\n]') 108 | break 109 | responder(geosearchclass, respond, filename) 110 | return 111 | 112 | 113 | def responder(geosearchclass, respond, filename): 114 | if not respond: 115 | print "No responses sent!" 116 | return 117 | with codecs.open(filename, encoding='utf-8', mode='rU') as json_file: 118 | json_string = json_file.read() 119 | tweets = json.loads(json_string) 120 | for tweet in tweets: 121 | user = tweet[0] 122 | response_text = geosearchclass.tweet_text + u" @" + user 123 | if len(response_text) > 140: 124 | raise ValueError("Tweet text is > 140 characters. Can't post. 
\ 125 | Shorten the tweet text in the params file") 126 | id = int(tweet[2]) 127 | users_text = tweet[3] 128 | 129 | print "\n\n\nPlease confirm you want to respond to this tweet" 130 | print user 131 | print users_text 132 | print "with this text: " 133 | print response_text 134 | response = raw_input("[y for Yes, anything for No] : ") 135 | if response == 'y': 136 | status = tweeter.tweet(geosearchclass.api, response_text, id) 137 | print "This tweet was posted: " 138 | utils.get_simplified_tweet(status) 139 | 140 | return 141 | 142 | 143 | def get_parser(): 144 | """ Creates a command line parser 145 | 146 | --doc -d 147 | --help -h 148 | --filename -f 149 | --respond -r 150 | """ 151 | # Create command line argument parser 152 | parser = argparse.ArgumentParser( 153 | description='Create an updating word frequency distribution chart.') 154 | 155 | parser.add_argument('-d', 156 | '--doc', 157 | action='store_true', 158 | help='print module documentation and exit') 159 | parser.add_argument( 160 | '-f', 161 | '--filename', 162 | help='''specify a FILENAME to use as the parameter file. 163 | If not specified, will use 'params.txt'.''') 164 | parser.add_argument('-a', 165 | '--address', 166 | help='''give an ADDRESS to get geocoordinates for.''') 167 | parser.add_argument( 168 | '-o', '--output', 169 | help='''specify an OUTPUT file to write to. 170 | Default is tweets.json''') 171 | return parser 172 | 173 | 174 | def main(): 175 | parser = get_parser() 176 | args = parser.parse_args() 177 | 178 | if args.doc: 179 | print __doc__ 180 | import sys 181 | sys.exit(0) 182 | 183 | # pass in an API to GeoSearchClass to get full access for posting 184 | (api, __) = utils.get_credentials('consumerkeyandsecret', False) 185 | g = geosearchclass.GeoSearchClass('params.txt', None, api) 186 | 187 | if args.filename: 188 | print 'Using parameters from ' + str(args.filename) 189 | g.set_params_from_file(args.filename) 190 | else: 191 | print "Using search values from params.txt" 192 | g.set_params_from_file('params.txt') 193 | 194 | if args.output: 195 | fn = str(args.output) 196 | else: 197 | fn = 'tweets.json' 198 | print 'Output file: ' + fn 199 | 200 | if args.address: 201 | print "Finding geocoordates for address:\n{}".format(args.address) 202 | coords = geo_converter.get_geocoords_from_address(args.address) 203 | if coords: 204 | g.latitude = coords[0] 205 | g.longitude = coords[1] 206 | else: 207 | print "Failed to find coordinates" 208 | sys.exit() 209 | 210 | verify(g, fn) 211 | 212 | 213 | if __name__ == '__main__': 214 | try: 215 | main() 216 | except KeyboardInterrupt: 217 | print "Main function interrupted" 218 | print "JSON file may be in incomplete format" 219 | sys.exit() 220 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # scraper.py 3 | # Saito 2017 4 | 5 | import argparse 6 | import sys 7 | import urllib 8 | 9 | from bs4 import BeautifulSoup 10 | 11 | import utils 12 | 13 | 14 | def scrape(url): 15 | html = urllib.urlopen(url).read() 16 | soup = BeautifulSoup(html, "html.parser") 17 | [x.extract() for x in soup.find_all('script')] 18 | text = soup.get_text(" ", strip=True) 19 | # ftext = text.split(" ") 20 | return text 21 | 22 | 23 | def scrape_and_save_to_file(url, filename="scraped_text.txt"): 24 | text = scrape(url) 25 | utils.save_file(filename, text) 26 | return text 27 | 28 | 29 | def get_parser(): 30 | """ Creates a command 
line parser 31 | 32 | --doc -d 33 | --help -h 34 | --url -u 35 | --output -o 36 | 37 | This automatically grabs arguments from sys.argv[] 38 | """ 39 | 40 | parser = argparse.ArgumentParser( 41 | description='Scrape a website.') 42 | 43 | parser.add_argument( 44 | '-d', '--doc', action='store_true', 45 | help='print module documentation and exit') 46 | parser.add_argument( 47 | '-u', '--url', 48 | help='''specify a url to scrape. Use the full name like 49 | http://www.cnn.com''') 50 | parser.add_argument( 51 | '-o', '--output', 52 | help='''specify an OUTPUT file to write to. 53 | Default is scraped_text.txt''') 54 | 55 | return parser 56 | 57 | 58 | def main(): 59 | parser = get_parser() 60 | args = parser.parse_args() 61 | 62 | if args.doc: 63 | print __doc__ 64 | sys.exit() 65 | 66 | if args.url: 67 | url = args.url 68 | else: 69 | url = "http://chrisnovello.com/teaching/risd/computer-utopias/" 70 | 71 | if args.output: 72 | print '\nwriting file to ' + str(args.output) 73 | output_file = args.output 74 | else: 75 | print "\nwriting to scraped_text.txt" 76 | output_file = "scraped_text.txt" 77 | text = scrape_and_save_to_file(url, output_file) 78 | 79 | # # Example 80 | # url = "http://chrisnovello.com/teaching/risd/computer-utopias/" 81 | # # text = scrape(url) 82 | # text = scrape_and_save_to_file(url) 83 | print text 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | 89 | -------------------------------------------------------------------------------- /streamer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ This is a utility that allows tweets to be read off in real time 3 | 4 | To stop, use a KeyboardInterrupt like CTRL + C""" 5 | 6 | 7 | import Queue 8 | import json 9 | import sys 10 | import threading 11 | import time 12 | 13 | import tweepy 14 | 15 | import utils 16 | 17 | 18 | global stream # so that CTRL + C kills stream 19 | 20 | 21 | class ListenerQueue(tweepy.streaming.StreamListener): 22 | """A StreamListener implementation for accessing Twitter Streaming API 23 | that writes to a queue object sent on initialization. 
24 | 25 | Usage: myListener = ListenerQueue(queue) 26 | Stream(authorization, myListener) 27 | 28 | """ 29 | 30 | def __init__(self, queue, filename, search_terms): 31 | super(ListenerQueue, self).__init__() 32 | self.queue = queue 33 | self.search_terms = search_terms 34 | self.json_file = open(filename, 'a') 35 | self.json_file.seek(0) 36 | self.json_file.truncate() 37 | 38 | def has_all_search_terms(self, text): 39 | for term in self.search_terms: 40 | if text.find(term) > -1: 41 | continue 42 | else: 43 | return False 44 | return True 45 | 46 | def on_status(self, status): 47 | text = status.text 48 | if self.search_terms: 49 | if not self.has_all_search_terms(text): 50 | return True 51 | 52 | self.queue.put(status) 53 | # sj = status._json 54 | sj = utils.get_simplified_tweet(status) 55 | # filter_lev = status.filter_level 56 | # print filter_lev 57 | j = json.dumps(sj, indent=1) 58 | self.json_file.write(j) 59 | return True 60 | 61 | def on_error(self, status): 62 | # error codes: https://dev.twitter.com/overview/api/response-codes 63 | print status 64 | if status == 420: 65 | print "Too many attempts made to contact the Twitter server" 66 | print "Wait awhile to use the tool again" 67 | return False # returning False in on_data disconnects the stream 68 | 69 | def on_disconnect(self): 70 | super(ListenerQueue, self).on_disconnect() 71 | print "stream disconnected" 72 | self.json_file.close() 73 | if self.json_file.closed: 74 | print "json file closed successfully" 75 | 76 | 77 | # def stream_to_json_file(fn='tweets.json'): 78 | # auth = get_creds() 79 | # L = ListenerJSON(fn) 80 | # stream = Stream(auth, L) 81 | # stream.filter(locations=[-122.75, 36.8, -121.75, 37.8], async=True) 82 | # # can find terms: by adding track=['python'] 83 | # print "waiting 15s" 84 | # time.sleep(15) 85 | # print "terminating" 86 | # stream.disconnect() 87 | # L.json_file.close() 88 | 89 | 90 | def get_tweets_from_q(queue): 91 | while True: 92 | status = queue.get(True, 5) 93 | print u"Tweet Message : {}\n\n".format(status.text) 94 | queue.task_done() 95 | 96 | 97 | def start_stream(q, bounding_box, fn='tweets.json', search_terms=None): 98 | '''Takes in a Queue object, a bounding_box of [lon, lat, lon, lat] for 99 | SW and NE corners, a filename and a search term list. Examples in: 100 | bounding_box = geo_converter.get_bounding_box_from(g) 101 | search_terms = geo_converter.get_search_terms_from(g) 102 | ''' 103 | global stream 104 | (__, auth) = utils.get_credentials("consumerkeyandsecret", False) 105 | L = ListenerQueue(q, fn, search_terms) 106 | stream = tweepy.Stream(auth, L) 107 | stream.filter(locations=bounding_box, filter_level='none', async=True) 108 | # if search_terms: 109 | # # OR semantics: 110 | # stream.filter(locations=bounding_box, track=search_terms, async=True) 111 | # else: 112 | # stream.filter(locations=bounding_box, async=True) 113 | return stream 114 | 115 | 116 | def kill_stream(stream): 117 | if stream: 118 | print "attempting to disconnect stream from kill_stream" 119 | stream.disconnect() 120 | print "closing file in 1 second..." 
121 | time.sleep(1) 122 | stream.listener.json_file.close() 123 | else: 124 | print "stream not set" 125 | 126 | 127 | def main(): 128 | print __doc__ 129 | 130 | q = Queue.Queue() 131 | bounding_box = [-122.75, 36.8, -121.75, 37.8] 132 | global stream 133 | stream = start_stream(q, bounding_box) 134 | 135 | # t = threading.Thread(target=start_stream, args=(q, bounding_box)) 136 | # t.daemon = True 137 | # t.start() 138 | # t.join() 139 | # print "waiting 15s" 140 | # time.sleep(15) 141 | # kill_stream(stream) 142 | 143 | # stream_to_json_file() 144 | 145 | # get_tweets_from_q(q) 146 | # now read in the files 147 | # https://dev.twitter.com/streaming/overview/request-parameters 148 | 149 | 150 | if __name__ == '__main__': 151 | try: 152 | main() 153 | except KeyboardInterrupt: 154 | print "Main function interrupted" 155 | if "stream" in globals(): 156 | print "trying to kill stream" 157 | kill_stream(stream) 158 | sys.exit() 159 | 160 | 161 | 162 | 163 | # class ListenerJSON(StreamListener): 164 | # """A StreamListener implementation for accessing Twitter Streaming API 165 | # that writes to a JSON file 166 | 167 | # """ 168 | 169 | # def __init__(self, filename): 170 | # super(ListenerJSON, self).__init__() 171 | # self.json_file = open(filename, 'a') 172 | 173 | # def on_status(self, status): 174 | # # print data 175 | # # print u"Tweet Message : {}\n\n".format(status.text) 176 | # print type(status) 177 | # sj = status._json 178 | # j = json.dumps(sj, indent=1) 179 | # self.json_file.write(j) 180 | # return True 181 | 182 | # def on_error(self, status): 183 | # # error codes: https://dev.twitter.com/overview/api/response-codes 184 | # print status 185 | # if status == 420: 186 | # return False # returning False in on_data disconnects the stream 187 | 188 | # def on_disconnect(self): 189 | # super(ListenerJSON, self).on_disconnect() 190 | # print "made it to disconnector" 191 | # self.json_file.close() 192 | -------------------------------------------------------------------------------- /suggest_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # suggest_bot.py 3 | # Saito 2017 4 | 5 | """This creates robot assisted poems! 6 | 7 | 8 | """ 9 | import argparse 10 | import random 11 | import sys 12 | 13 | import geo_converter 14 | import editor 15 | import geosearchclass 16 | import ngrams 17 | import utils 18 | import write 19 | 20 | 21 | def create_poem(g=None, default_words=None, ngram=None): 22 | """ This creates a poem with user input by suggesting from the words supplied. 23 | 24 | A user can use the word, decline the word, or add their own input. 25 | g is for geosearchclass. It is none by default. 26 | default_words is a list of words that can be enabled by default. 27 | """ 28 | words = [] 29 | formatted_poem = '''''' 30 | # for no, yes and finish (print poem) 31 | options = ['y', 'n', 's', 'd', 'r', 'e', 'f', '\n'] 32 | keep_adding = True 33 | added_default = False 34 | use_phrases = False 35 | random_word = False 36 | print "\n\n\n" 37 | print """ 38 | 39 | This robot poet will present a series of suggestions. You can 40 | either use these suggestions, edit them, or type your own 41 | input. You may also add more words from geolocated tweets to 42 | your word corpus. The words you choose or add will be 43 | succeessively added to a poem, which will be printed and saved 44 | to an output file. To add a new line, type '\\n'. To finish 45 | writing type f (for finish). 
46 | 47 | y: yes use this word 48 | n: no, skip this and give me a new phrase 49 | s: search: add more geolocated terms from twitter 50 | d: default words added to corpus 51 | r: get random word, when running markov model 52 | e: edit the text 53 | \\n: enter line 54 | f: finish 55 | 56 | """ 57 | 58 | if ngram: 59 | print "Populating seed words from markov chain ngram" 60 | values = sum(ngram.values(), []) 61 | words.extend(values) 62 | chosen = "" 63 | while keep_adding: 64 | if len(words) == 0: 65 | print "Nothing in corpus. Type d for default words or s to search\ 66 | twitter" 67 | if ngram and formatted_poem and not random_word: 68 | tokens = utils.tokenize_normal_words(formatted_poem) 69 | num = random.random() 70 | potential_word = "" 71 | if len(tokens) > 0: 72 | # This is for trigrams 73 | if num > 0.66 and len(tokens) > 1: # 50% of time get trigram 74 | potential_word = tokens_to_word(tokens, ngram, 2) 75 | if potential_word: 76 | chosen = potential_word 77 | else: 78 | potential_word = tokens_to_word(tokens, ngram, 1) 79 | if potential_word: 80 | chosen = potential_word 81 | else: 82 | chosen = random.choice(words) 83 | elif num > 0.33: # 30% of time get bigram 84 | potential_word = tokens_to_word(tokens, ngram, 1) 85 | if potential_word: 86 | chosen = potential_word 87 | else: 88 | chosen = random.choice(words) 89 | else: # 20% of time get random word 90 | chosen = random.choice(words) 91 | else: 92 | chosen = random.choice(words) 93 | elif words: 94 | chosen = random.choice(words) 95 | random_word = False 96 | else: 97 | pass 98 | if chosen: 99 | print chosen, 100 | response_string = " " + str(options) + " or your own :" 101 | response = raw_input(response_string) 102 | # include the chosen word: 103 | if response == "y": 104 | if len(words) == 0: 105 | continue 106 | formatted_poem = formatted_poem + ''' ''' + chosen 107 | print 108 | print formatted_poem 109 | continue 110 | elif response == "n": 111 | continue 112 | elif response == "r": 113 | random_word = True 114 | elif response == "s": 115 | print "Searching geo-located tweets to add to vocab" 116 | print "This can only be used once every 5 seconds" 117 | if g is None: 118 | g = geosearchclass.GeoSearchClass() 119 | search_results = g.search() 120 | 121 | phrase_response = "" 122 | while phrase_response not in ["y", "n"]: 123 | phrase_response = raw_input("\nWould you like to use phrases (\ 124 | (otherwise, just words)? 
[y/n]: ") 125 | if phrase_response == "y": 126 | list_of_info_dicts = write.parse_tweets(search_results) 127 | filtered_words = [] 128 | if len(list_of_info_dicts) < 1: 129 | filtered_words = utils.tokenize_and_filter( 130 | search_results) 131 | else: 132 | for d in list_of_info_dicts: 133 | filtered_words.append(d['phrase']) 134 | elif phrase_response == "n": 135 | filtered_words = utils.tokenize_and_filter(search_results) 136 | else: 137 | continue 138 | print "\n\n\nAdding these Twitter words: " 139 | print filtered_words 140 | print "\n" 141 | words.extend(filtered_words) 142 | continue 143 | elif response == "d": 144 | if not added_default: 145 | print "\nadding in these words to corpus:" 146 | print default_words 147 | print "\n\n\n" 148 | words.extend(default_words) 149 | options.remove('d') 150 | added_default = True 151 | elif response == "e": 152 | formatted_poem = editor.create_editor(formatted_poem) 153 | print formatted_poem 154 | elif response not in options: 155 | response = response.replace('\\n', '\n') 156 | formatted_poem = formatted_poem + ''' ''' + response 157 | print 158 | print formatted_poem 159 | continue 160 | elif response == "f": 161 | print 162 | print formatted_poem 163 | keep_adding = False 164 | return formatted_poem 165 | 166 | 167 | def tokens_to_word(tokens, ngram, n): 168 | seed = tokens[-n:] 169 | t = tuple(seed) 170 | word = ngrams.generate(ngram, t) 171 | return word 172 | 173 | 174 | def get_parser(): 175 | """ Creates a command line parser 176 | 177 | --doc -d 178 | --help -h 179 | --params -p 180 | --input -i 181 | --markov -m 182 | --output -o 183 | --address -a 184 | 185 | This automatically grabs arguments from sys.argv[] 186 | """ 187 | 188 | parser = argparse.ArgumentParser( 189 | description='Create a robot assisted poem.') 190 | 191 | parser.add_argument( 192 | '-d', '--doc', action='store_true', 193 | help='print module documentation and exit') 194 | parser.add_argument( 195 | '-m', '--markov', 196 | help='''specify a TEXT file to mimic. This will\ 197 | Train a markov chain word predictor\ 198 | using this text. This will basically create a robotic poet!''') 199 | parser.add_argument( 200 | '-p', '--params', 201 | help='''specify a PARAMS file to use as the parameter file. 202 | If not specified, will use 'params.txt' for searches.''') 203 | parser.add_argument( 204 | '-i', '--input', 205 | help='''specify an input file to use as word seed file. 206 | ''') 207 | parser.add_argument( 208 | '-o', '--output', 209 | help='''specify an OUTPUT file to write to. 210 | Default is output.txt''') 211 | parser.add_argument( 212 | '-a', 213 | '--address', 214 | help='''give an ADDRESS to get geocoordinates for.''') 215 | 216 | return parser 217 | 218 | 219 | def main(): 220 | parser = get_parser() 221 | args = parser.parse_args() 222 | 223 | if args.doc: 224 | print __doc__ 225 | sys.exit() 226 | 227 | g = geosearchclass.GeoSearchClass() 228 | 229 | if args.params: 230 | print 'Using parameters from ' + str(args.params) 231 | # turn parameter file into dictionary 232 | g.set_params_from_file(args.params) 233 | 234 | if args.address: 235 | print "Finding geocoordates for address:\n{}".format(args.address) 236 | coords = geo_converter.get_geocoords_from_address(args.address) 237 | if coords: 238 | g.latitude = coords[0] 239 | print "Found this latitude:" 240 | print g.latitude 241 | g.longitude = coords[1] 242 | print "Found this longitude:" 243 | print g.longitude 244 | else: 245 | print "Failed to find coordinates. Exiting." 
246 | sys.exit() 247 | 248 | if args.input: 249 | text = utils.load_file(args.input) 250 | tokens = utils.tokenize_normal_words(text) 251 | for_poem = utils.filter_words(tokens) 252 | else: 253 | for_poem = get_default_words() 254 | 255 | if args.markov: 256 | if args.input: 257 | raise StandardError("Can only input a single text file. \ 258 | use --markov ") 259 | else: 260 | text = utils.load_file(args.markov) 261 | # ngram = ngrams.make_ngram(text, 2) 262 | ngram = ngrams.make_bigram_trigram_dictionary(text) 263 | formatted_poem = create_poem(g, for_poem, ngram) 264 | else: 265 | formatted_poem = create_poem(g, for_poem) 266 | 267 | if args.output: 268 | print '\nwriting formatted poem to ' + str(args.output) 269 | output_file = args.output 270 | else: 271 | print "\nwriting formatted poem to poem.txt" 272 | output_file = "poem.txt" 273 | 274 | utils.save_file(output_file, formatted_poem) 275 | 276 | 277 | def get_default_words(): 278 | # These are some good default words used in the poem creator above 279 | for_poem = [ # emerging tech shit 280 | 'Agricultural', 'ecological', 'systems', 'meat', 'genetically', 281 | 'modified', 'precision', 'vertical', 'farming', 'printing', 'contour', 282 | 'crafting', 'artificial', 'uterus', 'transplant', 'cryonics', 283 | 'vitrification', 'suspended animation', 'de-extinction', 284 | 'genetic engineering', 'gene therapy', 'life extension', 285 | 'engineered negligible senescence', 286 | 'nanomedicine', 'nanosensors', 'regenerative', 'medicine', 287 | 'stem-cell', 'tissue engineering', 'robot assisted surgery', 288 | 'synthetic biology', 'synthetic genomics', 'virus', 289 | 'whole genome sequencing', 'bionic contact lens', 290 | 'head-mounted display', 'virtual', 291 | 'retinal', 'e-textiles', 'molecular', 'electronics', 'thermal', 292 | 'copper', 'pillar', 'airborne wind turbine', 'artificial', 293 | 'photosynthesis', 'biofuels', 'solar', 'power', 'fusion', 'fuel cell', 294 | 'molten salt', 'photovoltaic', 'translation', 'machine vision', 295 | 'speech recognition', 'fourth-generation', 'optical discs', 'storage', 296 | 'holographic data', 'millipede', 'optical computing', 297 | 'quantum computing', 'quantum cryptography', 'RFID', 'software-defined', 298 | 'three-dimensional', 'integrated', 'circuit', 'artificial muscle', 299 | 'superconductivity', 'superfluidity', 'metamaterials', 'cloaking', 300 | 'metal', 'multi-function', 'superalloy', 'synthetic diamond', 301 | 'weapon', 'laser', 'particle-beam', 'coilgun', 'plasma', 'stealth', 302 | 'brain computer interface', 'retinal implant', 303 | 'self reconfiguring modular robot', 'swarm robotics', 'pulse', 304 | 'solar sail', 'backpack', 305 | 'helicopter', 'delivery drone', 'detonation', 'engine', 'driverless\ 306 | car', 'automated', 'vacuum', 'collection', 'cloak', 'immersive', 307 | 'dilemma', 308 | # japanese shit 309 | 'august', 'black', 'chinese', 'Gaugin', 'heaven', 'illusion', 310 | 'island', 'Kibune', 'Michinoku', 'milky', 'Mogami', 'mother', 311 | 'mount', 'mountain', 'Musashi', 'night', 'observe', 'October', 312 | 'portrait', 'river', 'Roman', 'SUNSHINE', 'should', 'submit', 313 | 'tangled', 'Tokiwa', 'washing', 'watching', 'world', 'Yoshino', 314 | 'actual', 'admires', 'after', 'afterlife', 'again', 'against', 315 | 'alive', 'almost', 'always', 'amidah', 'ancient', 'another', 316 | 'armor', 'armored', 'arrayed', 'arrows', 'autumn', 'autumns', 317 | 'awakening', 'bamboo', 'bathe', 'beads', 'become', 'becoming', 318 | 'begins', 'behind', 'between', 'beyond', 'birth', 'blade', 319 | 'blind', 
'bloom', 'blooming', 'blossoms', 'break', 'breaks', 320 | 'breeze', 'bridge', 'brings', 'brother', 'brush', 'buried', 321 | 'burning', 'butterfly', 'calligraphy', 'calling', 'camellia', 322 | 'cancer', 'candle', 'canyon', 'caress', 'carry', 'ceaseless', 323 | 'cedars', 'center', 'certain', 'change', 'chanted', 'chases', 324 | 'cherries', 'cherry', 'child', 'chill', 'chorus', 'chrysanthemum', 325 | 'chrysanthemums', 'cicada', 'clock', 'closer', 'color', 'combing', 326 | 'compare', 'completely', 'content', 'continent', 'corona', 327 | 'could', 'crest', 'crossing', 'curve', 'dancers', 'darkens', 328 | 'darkness', 'death', 'deepens', 'delusions', 'deserted', 329 | 'destitute', 'distance', 'dream', 'dreaming', 'dreams', 'drips', 330 | 'drops', 'drums', 'dying', 'early', 'eclipse', 'egret', 'ended', 331 | 'entangling', 'escaped', 'evening', 'every', 'exhausted', 332 | 'faintly', 'falling', 'falls', 'feeling', 'field', 'finished', 333 | 'fireflies', 'firefly', 'fireworks', 'first', 'flash', 'flesh', 334 | 'flies', 'float', 'flowers', 'flowing', 'flows', 'follow', 335 | 'forever', 'forlorn', 'forth', 'fragile', 'frozen', 'garden', 336 | 'gates', 'gauntlet', 'gauzy', 'gazing', 'geese', 'giant', 337 | 'glances', 'going', 'grapes', 'grass', 'grasses', 'guards', 338 | 'guided', 'gunshots', 'harbor', 'heart', 'heaven', 'hillside', 339 | 'holding', 'horse', 'house', 'houses', 'hundred', 'hydrangea', 340 | 'idling', 'image', 'insane', 'interrogation', 'invisible', 341 | 'irrevocable', 'itself', 'journey', 'juice', 'karma', 'killed', 342 | 'knotty', 'knowing', 'knowledge', 'later', 'leave', 'leaving', 343 | 'letting', 'light', 'lightning', 'lilacs', 'limit', 'little', 344 | 'lodging', 'longing', 'looks', 'loving', 'making', 'mantle', 345 | 'marshes', 'memories', 'messengers', 'meteor', 'midnight', 346 | 'might', 'mirror', 'mirrored', 'missed', 'month', 'moonlight', 347 | 'mother', 'motorcycle', 'mouth', 'moving', 'myself', 'night', 348 | 'nightingale', 'nights', 'north', 'nothing', 'nowhere', 'ocean', 349 | 'octopus', 'opening', 'orchid', 'other', 'paradise', 'parting', 350 | 'passes', 'passions', 'pattern', 'pealing', 'pears', 'people', 351 | 'period', 'petal', 'place', 'plain', 'planters', 'playing', 352 | 'poems', 'poppy', 'press', 'primal', 'primeval', 'purple', 353 | 'quivered', 'rabbits', 'radiation', 'radio', 'rapids', 'reaches', 354 | 'reality', 'really', 'recklessly', 'reconciled', 'relax', 355 | 'remember', 'replies', 'returning', 'right', 'ripple', 'ripples', 356 | 'rising', 'river', 'riverbank', 'rocky', 'rowing', 'running', 357 | 'saying', 'seals', 'seeing', 'serpent', 'shadow', 'shall', 358 | 'shaped', 'shattered', 'shell', 'shelves', 'shift', 'shining', 359 | 'shore', 'short', 'shower', 'sided', 'silkworm', 'silkworms', 360 | 'single', 'sleep', 'slept', 'slightest', 'slowly', 'smell', 361 | 'snail', 'soiled', 'soldiers', 'solitary', 'somehow', 'something', 362 | 'sometimes', 'sound', 'speak', 'spill', 'spilling', 'spray', 363 | 'spreads', 'spring', 'squid', 'stable', 'stars', 'station', 364 | 'steel', 'stirrups', 'stolen', 'stomach', 'stone', 'storm', 365 | 'straighten', 'strands', 'strange', 'straw', 'streaming', 366 | 'stripes', 'study', 'submit', 'summer', 'sunlight', 'sunrise', 367 | 'sunset', 'sutra', 'sweet', 'swimsuit', 'tangled', 'taste', 368 | 'temple', 'tethered', 'their', 'there', 'these', 'thighs', 369 | 'thing', 'things', 'think', 'thought', 'thousand', 'throat', 370 | 'through', 'throughout', 'tiger', 'tight', 'tossing', 'total', 371 | 'toward', 'trace', 'transferred', 
'traps', 'truth', 'turning', 372 | 'turns', 'twilight', 'unborn', 'under', 'utterly', 'vanished', 373 | 'village', 'visible', 'waiting', 'wandering', 'warrior', 374 | 'warriors', 'washed', 'water', 'waves', 'weight', 'where', 375 | 'which', 'whistling', 'white', 'whitecaps', 'willow', 'wings', 376 | 'winter', 'wisteria', 'without', 'woman', 'world', 'yanking', 377 | 'years', 'yesterday', 'yielded', 'young'] 378 | return for_poem 379 | 380 | if __name__ == '__main__': 381 | main() 382 | 383 | -------------------------------------------------------------------------------- /test_real_time_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # test_real_time_vis.py 3 | # Saito 2015 4 | """ Test unit for real_time_vis """ 5 | 6 | import unittest 7 | import time 8 | from utils import new_tweets 9 | from real_time_vis import update_fdist 10 | import geosearchclass 11 | import utils 12 | 13 | 14 | class TestRTV(unittest.TestCase): 15 | 16 | def setUp(self): 17 | self.g = geosearchclass.GeoSearchClass() 18 | self.g.latitude = 37.7821 19 | self.g.longitude = -122.4093 20 | self.g.radius = 100 21 | self.g.search_term = "" 22 | self.g.result_type = 'mixed' 23 | self.g.count = 100 24 | self.sr = self.g.search() 25 | 26 | def test_new_tweets(self): 27 | sr2 = self.sr[0:10] # 10 old same one 28 | old = [s.id for s in sr2] 29 | old = set(old) 30 | print 'len(sr) = %d' % len(self.sr) 31 | print 'len(sr2) = %d' % len(sr2) 32 | self.assertEqual( 33 | len(new_tweets(self.sr, old)), 90) 34 | 35 | sr2 = self.sr 36 | old = [s.id for s in sr2] 37 | old = set(old) 38 | self.assertEqual( 39 | len(new_tweets(self.sr, old)), 0) 40 | 41 | self.g.latitude = 40.734073 42 | self.g.longitude = -73.990663 43 | self.g.radius = 10 44 | self.g.search_term = "" 45 | self.g.result_type = 'mixed' 46 | self.g.count = 10 47 | sr2 = self.g.search() # all different (15 old different ones) 48 | old = [s.id for s in sr2] 49 | old = set(old) 50 | self.assertEqual( 51 | len(new_tweets(self.sr, old)), 100) 52 | 53 | def test_update_fdist(self): 54 | filtered_words = utils.tokenize_and_filter(self.sr) 55 | fdist = utils.get_freq_dist(filtered_words) 56 | # take distribution and send it empty list 57 | fdist2 = update_fdist(fdist, []) 58 | self.assertEqual(fdist, fdist2) 59 | 60 | time.sleep(5) 61 | self.g.latitude = 40.734073 62 | self.g.longitude = -73.990663 63 | self.g.count = 100 64 | self.sr = self.g.search() 65 | filtered_words = utils.tokenize_and_filter(self.sr) 66 | # updating with entirely new word set -> should be longer 67 | old_len_fdist = len(fdist) 68 | fdist = update_fdist(fdist, filtered_words) 69 | self.assertTrue(len(fdist) > old_len_fdist) 70 | 71 | def tearDown(self): 72 | pass 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /test_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # test_write.py 3 | # Saito 2015 4 | 5 | """ Test unit for write """ 6 | 7 | import unittest 8 | # import time 9 | # import geosearchclass 10 | import nltk 11 | import logging 12 | from write import traverse_tree_grab_phrase 13 | from write import traverse_tree_grab_phrases 14 | from write import parse_sentence 15 | from write import get_grammar 16 | 17 | 18 | class TestWrite(unittest.TestCase): 19 | # def __init__(self): 20 | # super(TestWrite, self).__init__() 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | pass 
25 | # self.g = geosearchclass.GeoSearchClass() 26 | # self.g.latitude = 37.7821 27 | # self.g.longitude = -122.4093 28 | # self.g.radius = 10 29 | # self.g.search_term = "" 30 | # self.g.result_type = 'mixed' 31 | # self.g.count = 2 32 | # self.sr = self.g.search() 33 | 34 | def setUp(self): 35 | # set to DEBUG, INFO, WARNING, ERROR, CRITICAL : 36 | logging.basicConfig( 37 | format='%(levelname)s: %(message)s', level=logging.INFO) 38 | self.tokens = nltk.word_tokenize( 39 | 'Numerous passing references to the phrase have occurred in movies') 40 | self.grammar = get_grammar('grammar_20ids_HM0VM0.pickle') 41 | self.tree = parse_sentence(self.tokens, self.grammar) 42 | 43 | def test_traverse_tree_grab_phrase(self): 44 | print 'printing tree!!!' 45 | print self.tree 46 | 47 | label = 'VP' 48 | phrase = traverse_tree_grab_phrase(self.tree, label) 49 | print "For label {} returned this phrase: {}".format(label, phrase) 50 | self.assertEqual(phrase, 'have occurred in movies') 51 | 52 | label = 'NP' 53 | phrase = traverse_tree_grab_phrase(self.tree, label) 54 | print "For label {} returned this phrase: {}".format(label, phrase) 55 | self.assertEqual(phrase, 'Numerous passing references') 56 | 57 | label = 'PP' 58 | phrase = traverse_tree_grab_phrase(self.tree, label) 59 | print "For label {} returned this phrase: {}".format(label, phrase) 60 | self.assertEqual(phrase, 'to the phrase') 61 | 62 | def test_traverse_tree_grab_phrases(self): 63 | # # Now testing other function 64 | labels = [u'VP', u'NP', u'PP'] 65 | phrases = dict.fromkeys(labels) 66 | for k in phrases.keys(): 67 | phrases[k] = [] 68 | phrases = traverse_tree_grab_phrases(self.tree, phrases) 69 | for k, v in phrases.items(): 70 | print '{} : {}'.format(k, v) 71 | self.assertEqual( 72 | phrases['NP'], ['Numerous passing references', 73 | 'the phrase', 'movies']) 74 | self.assertEqual( 75 | phrases['VP'], ['have occurred in movies', 'occurred in movies']) 76 | self.assertEqual(phrases['PP'], ['to the phrase']) # maybe 'in movies' 77 | 78 | def tearDown(self): 79 | pass 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | # suite = unittest.TestLoader().loadTestsFromTestCase(TestWrite) 84 | # unittest.TextTestRunner(verbosity=2).run(suite) 85 | 86 | # tw = TestWrite() 87 | # tw.setUp() 88 | # tw.test_traverse_tree_grab_phrases() 89 | -------------------------------------------------------------------------------- /tweeter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """This is a utility module that allows a user to send tweets and 4 | read timelines""" 5 | 6 | import geosearchclass 7 | import utils 8 | 9 | 10 | def tweet(api, text, in_reply_to_status_id=None): 11 | """Send a tweet, possibly in response to another tweet 12 | 13 | REF: http://docs.tweepy.org/en/v3.5.0/api.html#API.update_status 14 | """ 15 | if len(text) > 140: 16 | raise ValueError("Text is over 140 Characters. 
Can\'t tweet") 17 | return 18 | if in_reply_to_status_id: 19 | status = api.update_status( 20 | status=text, in_reply_to_status_id=in_reply_to_status_id) 21 | else: 22 | status = api.update_status(status=text) 23 | return status 24 | 25 | 26 | def get_user_timeline(api, screen_name, count=20): 27 | """ 28 | This returns a users timeline 29 | 30 | REF: http://docs.tweepy.org/en/v3.5.0/api.html#API.user_timeline 31 | """ 32 | statuses = api.user_timeline( 33 | screen_name=screen_name, count=count) 34 | return statuses 35 | # API.user_timeline( 36 | # [id/user_id/screen_name][, since_id][, max_id][, count][, page]) 37 | 38 | 39 | def main(): 40 | print __doc__ 41 | print tweet.__name__ 42 | print tweet.__doc__ 43 | print get_user_timeline.__name__ 44 | print get_user_timeline.__doc__ 45 | 46 | # TESTING 47 | # (api, __) = utils.get_credentials('consumerkeyandsecret', False) 48 | # g = geosearchclass.GeoSearchClass('params.txt', None, api) 49 | 50 | # Robotic Tweet: 51 | # print g.tweet_text 52 | # tweet_text = g.tweet_text + " @SaitoGroup" 53 | # print tweet_text 54 | # api = g.api 55 | # status = tweet(api, tweet_text, 745399390219739137) 56 | # utils.get_simplified_tweet(status) 57 | 58 | 59 | # Get user timeline: 60 | # screen_name = "SaitoGroup" 61 | # print "returning user timeline for {}".format(screen_name) 62 | # statuses = get_user_timeline(g, screen_name, 50) 63 | # for status in statuses: 64 | # utils.get_simplified_tweet(status) 65 | # print "\n NEXT TWEET \n" 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # NLTK stuff 2 | 3 | """This is a utils file for the other programs. 4 | 5 | It contains Natural language processing tools from NLTK, some basic 6 | visualizer, a tweet status object info extractor and a new tweet 7 | identifier. 8 | 9 | """ 10 | 11 | 12 | import codecs 13 | import os 14 | import re 15 | import sys 16 | 17 | import nltk 18 | import tweepy 19 | from nltk.corpus import stopwords 20 | 21 | 22 | def get_credentials(keys_file="consumerkeyandsecret", app_only=True): 23 | '''This function gives credentials to the API. 24 | 25 | When app_only is true, application only authorization level 26 | credentials are supplied. This is sufficient for searching tweet 27 | history. It must be False for streaming access and to post tweets. 28 | 29 | It requires that your consumerkeyandsecret have 4 lines, with the 30 | consumer key on the first line, the secret on the next and then an 31 | access token on the 3rd and the access token secret on the 32 | 4th. You can get these by logging on to your twitter account and 33 | creating an app. 
34 | 35 | USAGE: (api, auth) = get_creds(keys_file, [app_only=[True/False]]) 36 | The second argument is optional 37 | 38 | ''' 39 | with open(keys_file, 'rU') as myfile: 40 | auth_data = [line.strip() for line in myfile] 41 | CONSUMER_KEY = auth_data[0] 42 | CONSUMER_SECRET = auth_data[1] 43 | auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) 44 | if not app_only: 45 | ACCESS_TOKEN = auth_data[2] 46 | ACCESS_TOKEN_SECRET = auth_data[3] 47 | auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET) 48 | api = tweepy.API(auth) 49 | return (api, auth) 50 | 51 | 52 | def get_simplified_tweet(status): 53 | """ Takes in a tweet status object and parses it""" 54 | user = status.user.screen_name 55 | print user 56 | d = status.created_at 57 | isotime = d.isoformat() 58 | print isotime 59 | id_string = status.id_str 60 | print id_string 61 | loc_name = None 62 | loc = None 63 | if status.place: 64 | if status.place.full_name: 65 | loc_name = status.place.full_name 66 | print loc_name 67 | if status.place.bounding_box: 68 | loc = status.place.bounding_box.origin() 69 | print loc 70 | text = status.text 71 | print text 72 | simplified_tweet = [user, isotime, id_string, text, loc_name, loc] 73 | return simplified_tweet 74 | 75 | 76 | def new_tweets(new_sr, old_ids): 77 | '''returns only search_results that do not have ids listed in old_ids 78 | new_sr is the new search results, old_ids is a set of ids 79 | 80 | ''' 81 | new_tweets = [] 82 | if old_ids: 83 | new_tweets = [sr for sr in new_sr if sr.id not in old_ids] 84 | else: 85 | new_tweets = new_sr 86 | return new_tweets 87 | 88 | 89 | def get_freq_dist(word_list): 90 | """Returns a frequency distribution for a list of words""" 91 | fdist = nltk.probability.FreqDist(word_list) 92 | return fdist 93 | 94 | 95 | def tokenize_and_filter(search_results): 96 | """Tokenizes and then filters search results""" 97 | tokens = tokenize_results(search_results) 98 | filtered_words = filter_words(tokens) 99 | return filtered_words 100 | 101 | 102 | def tokenize_results(search_results): 103 | """This takes in search_results i.e. status return from a twitter 104 | search and tokenizes the results""" 105 | tweet_text = u'' 106 | for sr in search_results: 107 | tweet_text = tweet_text + sr.text 108 | tokenizer = nltk.tokenize.casual.TweetTokenizer() 109 | tokens = tokenizer.tokenize(tweet_text) 110 | # tokens = nltk.tokenize.word_tokenize(tweet_text) 111 | return tokens 112 | 113 | 114 | def tokenize_normal_words(text_string): 115 | """ This takes in a normal string and tokenizes it into a word list """ 116 | tokens = nltk.word_tokenize(text_string) 117 | return tokens 118 | 119 | 120 | def filter_words(word_list): 121 | """remove stop words and do some basic filtering""" 122 | tokens = [word.lower() for word in word_list] 123 | filtered_words = [ 124 | word for word in tokens if word not in stopwords.words('english')] 125 | # remove urls with another filter using reg expressions 126 | p = re.compile(r'//t.co/') 127 | filtered_words = [word for word in filtered_words if not p.match(word)] 128 | p2 = re.compile(r'https') 129 | filtered_words = [word for word in filtered_words if not p2.match(word)] 130 | filtered_words = [word for word in filtered_words if len(word) > 2] 131 | return filtered_words 132 | 133 | 134 | def visualize(word_list): 135 | """Takes in a word list and visualizes the distribution of the top 30 words. 
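
    A minimal usage sketch (assuming search_results came from a
    GeoSearchClass.search() call elsewhere in this project):

        words = tokenize_and_filter(search_results)
        fdist = visualize(words)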
136 | 137 | This works well when combined with tokenize_and_filter(search_results).""" 138 | # import matplotlib 139 | # matplotlib.use('qt4agg') # workaround for virtual environments 140 | import matplotlib.pyplot as plt 141 | 142 | fdist = get_freq_dist(word_list) 143 | textOb = nltk.text.Text(word_list) 144 | print "\nCollocations: " 145 | print textOb.collocations() 146 | # fdist.plot(30) 147 | samples = [item for item, _ in fdist.most_common(30)] 148 | freqs = [fdist[sample] for sample in samples] 149 | 150 | plt.grid(True, color="silver") 151 | plt.plot(freqs, range(1, 1+len(freqs))) 152 | plt.yticks(range( 153 | 1, 1 + len(samples)), [s for s in samples], rotation=0) 154 | plt.ylabel("Samples") 155 | plt.xlabel("Counts") 156 | plt.show() 157 | return fdist 158 | 159 | 160 | def save_file(filename, text): 161 | fileSystemEncoding = sys.getfilesystemencoding() 162 | OUTPUT_FILE = os.path.expanduser(u'./' + filename) 163 | with codecs.open(OUTPUT_FILE, 164 | encoding=fileSystemEncoding, 165 | mode="w") as f: 166 | f.write(text) 167 | 168 | 169 | def load_file(filename): 170 | fileSystemEncoding = sys.getfilesystemencoding() 171 | # with codecs.open(filename, encoding='utf-8', mode='rU') as f: 172 | with codecs.open(filename, encoding=fileSystemEncoding, mode='rU') as f: 173 | text = f.read() 174 | return text 175 | -------------------------------------------------------------------------------- /write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # write.py 3 | # Saito 2015 4 | 5 | """ 6 | This program classifies tweets into phrase type. 7 | 8 | It produces a JSON array, "phrases.json" with properties: 9 | phrase 10 | tweeter 11 | type 12 | geolocation 13 | """ 14 | # TODO: 15 | # Try a faster parser, like chart parser or something 16 | 17 | 18 | import nltk 19 | import json 20 | import cPickle 21 | import re 22 | import types 23 | import logging 24 | from nltk.corpus import treebank 25 | from nltk import treetransforms 26 | #from nltk.grammar import WeightedProduction, Nonterminal 27 | from nltk.grammar import ProbabilisticProduction, Nonterminal 28 | 29 | 30 | class PCFGViterbiParser(nltk.ViterbiParser): 31 | 32 | def __init__(self, grammar, trace=0): 33 | super(PCFGViterbiParser, self).__init__(grammar, trace) 34 | 35 | def parse(self, tokens): 36 | tagged = nltk.pos_tag(tokens) 37 | missing = False 38 | for tok, pos in tagged: 39 | if not self._grammar._lexical_index.get(tok): 40 | missing = True 41 | self._grammar._productions.append( 42 | ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001)) 43 | # WeightedProduction(Nonterminal(pos), [tok], prob=0.000001)) 44 | if missing: 45 | self._grammar._calculate_indexes() 46 | 47 | # returns a generator, so call 'next' to get the ProbabilisticTree 48 | tree = super(PCFGViterbiParser, self).parse(tokens) 49 | if issubclass(tree.__class__, nltk.tree.Tree): 50 | print 'returning a tree' 51 | return tree 52 | elif isinstance(tree, types.GeneratorType): 53 | try: 54 | return next(tree) 55 | except(StopIteration): 56 | tweet = ' '.join(tokens) 57 | print u'Couldn\'t parse {}'.format(tweet) 58 | return None 59 | else: 60 | error("Type of tree is: {}".format(type(tree))) 61 | 62 | 63 | def train_pcfg(): 64 | print 'training grammar' 65 | productions = [] 66 | # print len(treebank.fileids()) 67 | trees = [] 68 | # up to 199 less for shorter grammar for quicker training 69 | for fileid in treebank.fileids()[0:20]: 70 | for tree in treebank.parsed_sents(fileid): 71 | # 
perform optional tree transformations, e.g.:
72 |             # Remove branches A->B->C into A->B+C so we can avoid infinite
73 |             # productions
74 |             tree.collapse_unary(collapsePOS=False)
75 |             # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
76 |             # horizontal and vertical Markovization: remember parents and siblings in tree
77 |             # This gives a performance boost, but makes the grammar HUGE
78 |             # If we use these we would need to implement a tag forgetting method
79 |             #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
80 |             tree.chomsky_normal_form()
81 |             productions += tree.productions()
82 |     S = nltk.Nonterminal('S')
83 |     grammar = nltk.induce_pcfg(S, productions)
84 |     print "grammar trained!"
85 |     return grammar
86 | 
87 | 
88 | def traverse_tree_grab_phrases(tree, phrases):
89 |     """Finds all examples of each label and returns the phrases dictionary.
90 | 
91 |     Usage: phrases = traverse_tree_grab_phrases(tree, phrases)
92 | 
93 |     Phrases is a dictionary with a key for each label you wish to
94 |     find, and each value is a list.
95 |     """
96 | 
97 |     for subtree in tree:
98 |         logging.debug('type of subtree= {}'.format(type(subtree)))
99 |         if issubclass(subtree.__class__, nltk.tree.Tree):
100 |             logging.debug('this subtree has label {}'.format(subtree.label()))
101 |             if subtree.label() in phrases.keys():
102 |                 logging.debug('found {} label'.format(subtree.label()))
103 |                 tokens = subtree.leaves()
104 |                 phrase = ' '.join(tokens)
105 |                 logging.debug(u'which has this phrase \n {}'.format(phrase))
106 |                 phrases[subtree.label()].append(phrase)
107 |             logging.debug('going one deeper')
108 |             phrases = traverse_tree_grab_phrases(subtree, phrases)
109 |         elif type(subtree) == unicode:
110 |             logging.debug(subtree)
111 |     return phrases
112 | 
113 | 
114 | def traverse_tree_grab_phrase(tree, label):
115 |     """Finds the first example of the label and returns the phrase.
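    For example, with the demo sentence 'Numerous passing references to
    the phrase have occurred in movies' parsed by parse_sentence, the
    label 'NP' yields 'Numerous passing references' (test_write.py
    checks 'VP' and 'PP' the same way).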
116 | 117 | Usage: phrase = traverse_tree_grab_phrase(tree, 'VP') 118 | 119 | For exhaustive search try the sister function 120 | traverse_tree_grab_phrases(tree,phrases) 121 | """ 122 | phrase = None 123 | logging.debug("tree type: {}".format(type(tree))) 124 | 125 | for subtree in tree: 126 | logging.debug('type of subtree= {}'.format(type(subtree))) 127 | if issubclass(subtree.__class__, nltk.tree.Tree): 128 | logging.debug('this subtree has label {}'.format(subtree.label())) 129 | logging.debug('subtree {} == label {} : {}'.format( 130 | subtree.label(), label, subtree.label() == label)) 131 | if subtree.label() == label: 132 | logging.debug('found {} label'.format(label)) 133 | tokens = subtree.leaves() 134 | phrase = ' '.join(tokens) 135 | logging.debug(u'which has this phrase \n {}\n'.format(phrase)) 136 | return phrase 137 | else: 138 | phrase = traverse_tree_grab_phrase(subtree, label) 139 | if phrase != None: 140 | return phrase 141 | return phrase 142 | 143 | 144 | def get_phrases_from_tree(tree, exhaustive=False): 145 | labels = [u'VP', u'NP', u'PP'] 146 | phrases = dict.fromkeys(labels) 147 | for k in phrases.keys(): 148 | phrases[k] = [] 149 | if exhaustive: 150 | phrases = traverse_tree_grab_phrases(tree, phrases) 151 | else: 152 | for label in phrases.keys(): 153 | # print '\n\n\n\nlooking for {}'.format(label) 154 | 155 | phrase = traverse_tree_grab_phrase(tree, label) 156 | 157 | if phrase is not None: 158 | phrases[label].append(phrase) 159 | return phrases 160 | 161 | 162 | def parse_sentence(tokenized_sentence, grammar): 163 | """ Parses a tokenized sentence and returns a tree 164 | """ 165 | # parser = nltk.parse.ViterbiParser(grammar) 166 | parser = PCFGViterbiParser(grammar, trace=0) 167 | tree = parser.parse(tokenized_sentence) 168 | return tree 169 | 170 | 171 | def json_phrases(phrases, filename): 172 | with open(filename, 'w') as f: 173 | j = json.dumps(phrases, indent=1) 174 | f.write(j) 175 | return 176 | 177 | 178 | def pickle_grammar(grammar, fn): 179 | """ Write grammar to file (serialized, marshalled) 180 | """ 181 | with open(fn, 'w') as f: 182 | #cPickle.dump(grammar, f, protocol=cPickle.HIGHEST_PROTOCOL) 183 | cPickle.dump(grammar, f, protocol=0) 184 | 185 | 186 | def unpickle_grammar(fn): 187 | """ Read grammar from a file and return it""" 188 | with open(fn, 'rU') as f: 189 | grammar = cPickle.load(f) 190 | return grammar 191 | 192 | 193 | def get_grammar(fn='grammar.pickle'): 194 | 195 | try: 196 | grammar = unpickle_grammar(fn) 197 | print 'Loaded grammar' 198 | return grammar 199 | except IOError: 200 | print 'No grammar file, gotta train' 201 | grammar = train_pcfg() 202 | pickle_grammar(grammar, fn) 203 | return grammar 204 | 205 | 206 | def create_info_phrase_add_to_list(phrases, status, dict_list): 207 | keys = ['phrase', 'phrase_type', 'tweet', 208 | 'coordinates', 'time', 'screen_name'] 209 | 210 | for pos in phrases: 211 | for phrase in phrases[pos]: 212 | print u'phrase: {}'.format(phrase) 213 | d = dict.fromkeys(keys) 214 | d['phrase_type'] = pos 215 | d['phrase'] = phrase 216 | d['tweet'] = status.text 217 | d['screen_name'] = status.user.screen_name 218 | d['time'] = str(status.created_at) 219 | if status.geo: 220 | d['coordinates'] = status.geo['coordinates'] 221 | dict_list.append(d) 222 | del d 223 | 224 | return 225 | 226 | 227 | def parse_tweets(search_results): 228 | grammar = get_grammar('grammar_20ids_HM0VM0.pickle') 229 | list_of_info_dicts = [] 230 | for sr in search_results: 231 | print u'tweet text: {}'.format(sr.text) 232 | 
# nltk.tree.Tree.draw(tree) 233 | sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle') 234 | sentences = sentence_detector.tokenize(sr.text) 235 | tokenizer = nltk.tokenize.casual.TweetTokenizer() 236 | for sent in sentences: 237 | if not sent: 238 | logging.debug('sent is None') 239 | continue 240 | tokens = tokenizer.tokenize(sent) 241 | logging.debug(tokens) 242 | p = re.compile(r'https.*') 243 | # tokens = [word for word in tokens if not word == u'\xe2'] 244 | tokens = [word for word in tokens if not p.match(word)] 245 | logging.debug(tokens) 246 | if not tokens: 247 | continue 248 | tree = parse_sentence(tokens, grammar) 249 | if not tree: 250 | logging.debug('tree was None') 251 | continue 252 | print tree 253 | phrases = get_phrases_from_tree(tree, exhaustive=True) 254 | print 'printing phrases dictionary for this tweet' 255 | for k, v in phrases.items(): 256 | print u'{} : {}'.format(k, v) 257 | 258 | create_info_phrase_add_to_list(phrases, sr, list_of_info_dicts) 259 | 260 | i = 1 261 | for d in list_of_info_dicts: 262 | print '\n\n\n printing dictionary {}'.format(i) 263 | for k, v in d.items(): 264 | print u'{} : {}'.format(k, v) 265 | i += 1 266 | 267 | json_phrases(list_of_info_dicts, 'phrases.json') 268 | return list_of_info_dicts 269 | 270 | 271 | def main(): 272 | # set to DEBUG, INFO, WARNING, ERROR, CRITICAL : 273 | logging.basicConfig( 274 | format='%(levelname)s: %(message)s', level=logging.INFO) 275 | import geosearchclass 276 | g = geosearchclass.GeoSearchClass() 277 | print "Using search values from params.txt" 278 | g.set_params_from_file('params.txt') 279 | search_results = g.search() 280 | parse_tweets(search_results) 281 | # grammar = get_grammar() 282 | # #sentences = treebank.sentences()[34:35] 283 | # sentences = [nltk.word_tokenize('Numerous passing references to the phrase have occurred in movies')] 284 | # #print sentences 285 | 286 | # sentence_trees = parse_sentences(sentences, grammar) 287 | # phrases = get_phrases(sentence_trees) 288 | # print 'Now printing the phrases: ' 289 | # for k,v in phrases.items(): 290 | # print '{} : {}'.format(k,v) 291 | # json_phrases(phrases, 'phrases.json') 292 | 293 | 294 | if __name__ == '__main__': 295 | main() 296 | --------------------------------------------------------------------------------
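A minimal end-to-end sketch of the phrase-extraction pipeline in write.py,
using only names that appear in this repository (it assumes params.txt and a
consumerkeyandsecret file are present; the first run will train and pickle a
grammar before any parsing happens):

    import geosearchclass
    import write

    g = geosearchclass.GeoSearchClass()
    g.set_params_from_file('params.txt')
    search_results = g.search()
    # parse_tweets writes phrases.json and returns the list of phrase dicts
    phrase_dicts = write.parse_tweets(search_results)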