├── .gitignore ├── LICENSE ├── README.md ├── Sentence Parse.PNG ├── StatusStreamerOutput.txt ├── editor.py ├── geo_converter.py ├── geosearchclass.py ├── ngrams.py ├── p_files ├── LA.txt ├── clive.txt ├── nyc.txt └── nyc.txt~ ├── params.txt ├── real_time_vis.py ├── sample.py ├── scan_and_respond.py ├── scraper.py ├── streamer.py ├── suggest_bot.py ├── test_real_time_vis.py ├── test_write.py ├── tweeter.py ├── utils.py └── write.py /.gitignore: -------------------------------------------------------------------------------- 1 | phrases.json 2 | tweets.json 3 | consumerkeyandsecret 4 | output.txt 5 | params.txt 6 | grammar.pickle 7 | grammr*pickle 8 | poemsforrobots.txt 9 | deprecated/ 10 | *.pickle 11 | *.json 12 | #* 13 | *.pyc 14 | *~ 15 | .DS_Store 16 | 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 
49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. 
(Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
Saito Group 1-17-2017

About:
----------------------------------------------------------------------
This library is composed of several tools for scraping geolocated
tweets and visualizing data gleaned from these tweets. It also has a
robotic assistant tool, called ```suggest_bot```, which can help you
write poems in the style of a document you pass in. Another tool,
called ```scan_and_respond```, allows you to scan an area for search
terms and then tweet at those people!

Geo-tag your tweets!
--------------------
We rely on geo-tagged tweets. Please allow your location to be seen
when tweeting, especially when using this application! You can enable
this by logging into your main Twitter account and, under "Security and
Privacy", checking the box next to "Tweet location". THANKS!


Install:
----------------------------------------------------------------------
git, python 2.7.X, pip
Python packages required: tweepy, nltk, matplotlib, geopy, argparse,
curses, bs4 (Beautiful Soup), locale

On Windows: upgrade PowerShell
(you may still have unicode problems when printing to the command line)
and install packages with:
```
python -m pip install <package>
```

For each required package listed above run:
```
pip install <package>
```
Now we need some data, so we'll use the nltk downloader.
Run a python shell from the command line:
```
python
import nltk
nltk.download()
```
On the main page, highlight "book", click download, and that should be it...
These are the exact nltk packages that are required, in case you want less data:
1) under corpora -> highlight stopwords
2) under corpora -> highlight treebank
3) under all packages -> highlight punkt
4) under models -> highlight averaged_perceptron_tagger

This creates a folder called "nltk_data" in your home folder, which is
used by the program.

Navigate to the folder where you want geotweets to live and run:
```
git clone https://github.com/saitogroup/geotweets.git
```
Get consumerkeyandsecret (see below) and put it in that folder,
cd into the folder, and
run sample.py from the command line (see below).


Consumer Key and Secret:
----------------------------------------------------------------------
The program looks for a file in the geotweets folder called
consumerkeyandsecret. This should have at least 2 lines, with the
consumer key on the first line and the secret (the longer one) on the
next, and then (for streaming and posting) 2 more lines: the access
token on the 3rd line and the access token secret on the 4th. You can
get these by going to https://apps.twitter.com in a web browser and
creating an app. Then hit the button to create access tokens. You may
have to set the app permissions to "read and write" if you want to use
this to send tweets on your behalf.
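When complete, consumerkeyandsecret is just four bare lines in this
order (the values below are placeholders, not real credentials):
```
CONSUMER_KEY
CONSUMER_SECRET
ACCESS_TOKEN
ACCESS_TOKEN_SECRET
```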
After creating the app, copy the 4 alphanumeric keys into a blank file
called "consumerkeyandsecret" as described above and put this file in
your "geotweets" folder.


TOOLS:
----------------------------------------------------------------------
sample:
-------
A simple tool, called 'sample', allows you to scrape and save up to
100 geolocated tweets in batch form. You can optionally search within
this set for specific words or hash tags and visualize the top word
frequencies. See sample.py for details or from the command line run:
```
python sample.py --help
python sample.py --doc
```
USAGE:
```
python sample.py [-h][-d][-v][-f FILENAME][-o OUTPUT][-vis]
```

scraper
--------
Given a URL, this will scrape the website and save the text to scraped_text.txt
```
scraper.py [-d][-h][-u URL][-o OUTPUT_FILE]
```


real time visualizer:
---------------------
Another tool, called 'real_time_vis', creates a word frequency
distribution chart which can grow and change in near real time as more
tweets are grabbed. If you use -s, you'll get streaming results, that
is, tweets as they are being posted. Otherwise you will get batched
results every 5 seconds using the REST API, which returns tweets from
the recent past. See real_time_vis.py for details or from the command
line run:

```
python real_time_vis.py --help
python real_time_vis.py --doc
```
USAGE:
```
python real_time_vis.py [-h][-d][-f FILENAME][-n NUMBER][-s][-a ADDRESS]
```

Both files use a parameter file with geolocation and search
terms. See params.txt for an example.

You may have to adjust your PYTHONPATH variable to run the program
from the command line. Otherwise, you can run it from the python
interpreter.



suggest_bot
-----------
This is a robotically assisted poetry engine. The user can create
poems using a large supplied word corpus or use their own. It can also
add words to the corpus from the twitter-sphere using the search
option, and it can parse those twitter messages into phrases using
natural language processing.

USAGE:
```
python suggest_bot.py [-d][-h][-p PARAMS][-i INPUT | -m INPUT][-o OUTPUT][-a ADDRESS]
```
1) Once you are running the program, if you call the 's' command, you
can search twitter. This will use the parameters in the params.txt
file as usual.

2) If you want to parse the tweets and then use phrases, simply
respond 'y' to the query after you hit 's'.

3) There is also a default corpus, a set of words that you can use by
calling the 'd' command.

4) You can also load your own corpus, which will then just use those
words randomly as suggestions.

5) While using the word suggester, if you ever find that you made an
error, simply hit 'e' and an inline editor will pop up. There is
currently a bug that was patched but hasn't been pushed to all python
versions, so you currently cannot insert words. Sorry!

6) Finally, I would suggest trying out the markov chain poetry
assistant. It can help create poems that mimic the natural statistics
of the input text.
Simply supply the program with a grammatical text of poems or literature:
```
python suggest_bot.py -m INPUT
```

scan_and_respond
----------------

This tool scans tweets and asks the user to verify them before sending
a tweet response. The relevant tweets are also saved to a JSON
file. This requires write access, which means the consumerkeyandsecret
file must contain all 4 lines.

```
scan_and_respond.py [-h] [-d] [-f FILENAME] [-a ADDRESS] [-o OUTPUT]
```

HELP:
----------------------------------------------------------------------
All programs can be run from the command line (a.k.a. the terminal in OS X).

By typing
```python <program_name>.py -h```
you will get help on the various command line tool options.
By typing
```python <program_name>.py -d```
you will get the program's documentation string.
If a parameter says something like
```-o OUTPUT```
then simply substitute a file for the capitalized word, like so:
```
python suggest_bot.py -m my_poetic_text.txt
```
If a USAGE line says something like ```[-x | -y]``` then you can only
use parameter x OR y, but not both.


EXAMPLES:
----------------------------------------------------------------------
Grabbing geo-located tweets using the parameter file params.txt
(default), printing to the command line, and writing to output.txt
(default):
```
python sample.py --verbose
```
Visualizing the data, using params.txt (default):
```
python real_time_vis.py
```
Streaming real time data to create a word frequency chart using a local address:
```
python real_time_vis.py -a "175 5th Avenue NYC" -s
```
Scraping a website and saving to an output file:
```
python scraper.py -u http://www.cnn.com -o scraped_text.txt
```
Using suggest_bot with a file of random words, which will NOT be a markov chain:
```
python suggest_bot.py -i random_not_necessarily_grammatical_text.txt
```

UTILITIES:
----------------------------------------------------------------------
These modules contain methods that assist the "tools" listed above:
```
tweeter.py: allows you to programmatically tweet at people
utils.py: shared helpers used by the other modules (e.g. credential loading, tokenizing)
geo_converter.py: returns geocoordinates for a given address
geosearchclass.py: searches the REST API
streamer.py: creates a multithreaded twitter API streamer
editor.py: creates a command line editor
ngrams.py: creates a markov chain ngram word generator
```

write
-----
This program classifies tweets into phrase types and produces a JSON
array containing these, called phrases.json. It uses parameters from
params.txt. This requires quite a bit of processing time, which can be
reduced by using a lower "count".
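A quick way to inspect the output is to load it back with the standard
json module. This is just a sketch (it assumes phrases.json holds a
single JSON array and the snippet is not itself part of the repository):
```
import json

# load the array that write.py produced
with open('phrases.json') as f:
    phrases = json.load(f)

print len(phrases)   # how many phrases were extracted
print phrases[:5]    # peek at the first few entries
```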
241 | 242 | The below two modules run unit tests: 243 | ``` 244 | test_real_time_vis 245 | test_write 246 | ``` 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Sentence Parse.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Delta-Ark/Geo_Bot-complex/46bcc112a8b7c2bfe063b73cd2a44fb21f4bb933/Sentence Parse.PNG -------------------------------------------------------------------------------- /StatusStreamerOutput.txt: -------------------------------------------------------------------------------- 1 | Status( 2 | contributors=None, 3 | truncated=False, 4 | text=u'#pokemongo @ Alameda, California https://t.co/ksXeNFloaS', 5 | is_quote_status=False, 6 | in_reply_to_status_id=None, 7 | id=751221344306491392, 8 | favorite_count=0, _ 9 | api=, 10 | author= 11 | 12 | User(follow_request_sent=None, 13 | profile_use_background_image=False, 14 | _json={u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 15 | id=31076073, _api=, verified=False, profile_image_url_https=u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', profile_sidebar_fill_color=u'252429', is_translator=False, geo_enabled=True, profile_text_color=u'666666', followers_count=751, protected=False, location=u'Alameda, California ', default_profile_image=False, id_str=u'31076073', utc_offset=-25200, statuses_count=8670, description=u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", friends_count=895, profile_link_color=u'FF0000', profile_image_url=u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', notifications=None, profile_background_image_url_https=u'https://abs.twimg.com/images/themes/theme9/bg.gif', profile_background_color=u'000000', 
profile_banner_url=u'https://pbs.twimg.com/profile_banners/31076073/1359676007', profile_background_image_url=u'http://abs.twimg.com/images/themes/theme9/bg.gif', screen_name=u'ddellamorte', lang=u'en', profile_background_tile=False, favourites_count=885, name=u'Brotha Nero', url=u'http://devourthepodcast.com', created_at=datetime.datetime(2009, 4, 14, 6, 53, 58), contributors_enabled=False, time_zone=u'Pacific Time (US & Canada)', profile_sidebar_border_color=u'FFFFFF', default_profile=False, following=False, listed_count=31), 16 | 17 | 18 | _json={u'contributors': None, u'truncated': False, u'text': u'#pokemongo @ Alameda, California https://t.co/ksXeNFloaS', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 751221344306491392, u'favorite_count': 0, u'source': u'Instagram', u'retweeted': False, u'coordinates': {u'type': u'Point', u'coordinates': [-122.257, 37.764]}, u'timestamp_ms': u'1467940100157', u'entities': {u'user_mentions': [], u'symbols': [], u'hashtags': [{u'indices': [0, 10], u'text': u'pokemongo'}], u'urls': [{u'url': u'https://t.co/ksXeNFloaS', u'indices': [33, 56], u'expanded_url': u'https://www.instagram.com/p/BHlNxzDBgdI/', u'display_url': u'instagram.com/p/BHlNxzDBgdI/'}]}, u'in_reply_to_screen_name': None, u'id_str': u'751221344306491392', u'retweet_count': 0, u'in_reply_to_user_id': None, u'favorited': False, u'user': {u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 19 | DEPRECATED: u’geo': {u'type': u'Point', u'coordinates': [37.764, -122.257]}, 20 | u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'lang': u'und', u'created_at': u'Fri Jul 08 01:08:20 +0000 2016', u'filter_level': u'low', u'in_reply_to_status_id_str': None, u'place': {u'full_name': u'Alameda, CA', u'url': u'https://api.twitter.com/1.1/geo/id/000e96b4e9f8503f.json', u'country': u'United States', u'place_type': u'city', u'bounding_box': {u'type': u'Polygon', u'coordinates': [[[-122.332411, 37.720367], [-122.332411, 37.797229], 
[-122.224562, 37.797229], [-122.224562, 37.720367]]]}, u'country_code': u'US', u'attributes': {}, u'id': u'000e96b4e9f8503f', u'name': u'Alameda'}}, 21 | 22 | coordinates={u'type': u'Point', u'coordinates': [-122.257, 37.764]}, 23 | timestamp_ms=u'1467940100157', 24 | entities={u'user_mentions': [], u'symbols': [], u'hashtags': [{u'indices': [0, 10], u'text': u'pokemongo'}], u'urls': [{u'url': u'https://t.co/ksXeNFloaS', u'indices': [33, 56], u'expanded_url': u'https://www.instagram.com/p/BHlNxzDBgdI/', u'display_url': u'instagram.com/p/BHlNxzDBgdI/'}]}, 25 | in_reply_to_screen_name=None, 26 | in_reply_to_user_id=None, 27 | retweet_count=0, 28 | id_str=u'751221344306491392', 29 | favorited=False, 30 | source_url=u'http://instagram.com', 31 | 32 | 33 | user= 34 | 35 | User( 36 | follow_request_sent=None, profile_use_background_image=False, 37 | 38 | _json={u'follow_request_sent': None, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 31076073, u'verified': False, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'profile_sidebar_fill_color': u'252429', u'profile_text_color': u'666666', u'followers_count': 751, u'profile_sidebar_border_color': u'FFFFFF', u'id_str': u'31076073', u'profile_background_color': u'000000', u'listed_count': 31, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme9/bg.gif', u'utc_offset': -25200, u'statuses_count': 8670, u'description': u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", u'friends_count': 895, u'location': u'Alameda, California ', u'profile_link_color': u'FF0000', u'profile_image_url': u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', u'following': None, u'geo_enabled': True, u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/31076073/1359676007', u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme9/bg.gif', u'name': u'Brotha Nero', u'lang': u'en', u'profile_background_tile': False, u'favourites_count': 885, u'screen_name': u'ddellamorte', u'notifications': None, u'url': u'http://devourthepodcast.com', u'created_at': u'Tue Apr 14 06:53:58 +0000 2009', u'contributors_enabled': False, u'time_zone': u'Pacific Time (US & Canada)', u'protected': False, u'default_profile': False, u'is_translator': False}, 39 | 40 | id=31076073, 41 | _api=, 42 | verified=False, 43 | profile_image_url_https=u'https://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', 44 | profile_sidebar_fill_color=u'252429', 45 | is_translator=False, 46 | geo_enabled=True, 47 | profile_text_color=u'666666', 48 | followers_count=751, 49 | protected=False, 50 | location=u'Alameda, California ', 51 | default_profile_image=False, 52 | id_str=u'31076073', 53 | utcoffset=-25200, 54 | statuses_count=8670, 55 | description=u"Creator\\Producer\\Host of Devour The Podcast\\Cantankerous Fuck\\Screenwriter and horror film reviewer\\Guitarist and vocalist for The Moors and Alchemilla's Ghost", 56 | friends_count=895, 57 | profile_link_color=u'FF0000', 58 | profile_image_url=u'http://pbs.twimg.com/profile_images/749991140129054720/K_ChBh-E_normal.jpg', 59 | notifications=None, 60 | profile_background_image_url_https=u'https://abs.twimg.com/images/themes/theme9/bg.gif', 61 | profile_background_color=u'000000', 62 | 
profile_banner_url=u'https://pbs.twimg.com/profile_banners/31076073/1359676007', 63 | profile_background_image_url=u'http://abs.twimg.com/images/themes/theme9/bg.gif', 64 | screen_name=u'ddellamorte', 65 | lang=u'en', profile_background_tile=False, favourites_count=885, name=u'Brotha Nero', url=u'http://devourthepodcast.com', created_at=datetime.datetime(2009, 4, 14, 6, 53, 58), contributors_enabled=False, time_zone=u'Pacific Time (US & Canada)', profile_sidebar_border_color=u'FFFFFF', default_profile=False, following=False, listed_count=31), 66 | 67 | 68 | geo={u'type': u'Point', u'coordinates': [37.764, -122.257]}, 69 | in_reply_to_user_id_str=None, 70 | possibly_sensitive=False, 71 | lang=u'und', 72 | created_at=datetime.datetime(2016, 7, 8, 1, 8, 20), 73 | filter_level=u'low', 74 | in_reply_to_status_id_str=None, 75 | 76 | place= 77 | Place( 78 | _api=, 79 | country_code=u'US', 80 | url=u'https://api.twitter.com/1.1/geo/id/000e96b4e9f8503f.json', 81 | country=u'United States', 82 | place_type=u'city', 83 | bounding_box=BoundingBox(_api=, type=u'Polygon', coordinates=[ 84 | [ 85 | [-122.332411, 37.720367], [-122.332411, 37.797229], [-122.224562, 37.797229], [-122.224562, 37.720367] 86 | ] 87 | ]), 88 | full_name=u'Alameda, CA', 89 | attributes={}, 90 | id=u'000e96b4e9f8503f', 91 | name=u'Alameda'), 92 | source=u'Instagram', 93 | retweeted=False) 94 | -------------------------------------------------------------------------------- /editor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # editor.py 3 | # Saito 2017 4 | 5 | 6 | """Creates a simple text editor 7 | """ 8 | import curses 9 | from curses.textpad import Textbox 10 | import locale 11 | 12 | 13 | def emacs_textbox(stdscr, initial_text): 14 | stdscr.clear() 15 | instructions = """ 16 | To Save and Exit hit Control-G 17 | 18 | This editing buffer uses Emacs commands (No Control-Y though) 19 | *** A command Control-G is == Control + g (don't capitalize) *** 20 | --------------------------------------------------------------- 21 | Movement: 22 | Use arrow keys 23 | 24 | OR: 25 | Start of line: Control-A 26 | End of line: Control-E 27 | Back Control-B 28 | Forward Control-F 29 | Down line Control-N Cursor down; move down one line. 30 | Previous line Control-P Cursor up; move up one line. 
31 | 32 | COPY + PASTE: Use mouse + keyboard shortcuts to copy and paste 33 | 34 | Deletion: 35 | Delete under cursor Control-D 36 | Delete backwards Control-H 37 | Kill line Control-K 38 | """ 39 | stdscr.addstr(instructions) 40 | stdscr.refresh() 41 | 42 | ending = """------------------------------------------------------\n 43 | EDIT BELOW ONLY 44 | ------------------------------------------------------\n""" 45 | stdscr.addstr(ending) 46 | stdscr.refresh() 47 | stdscr.addstr(initial_text) 48 | stdscr.refresh() 49 | box = Textbox(stdscr, insert_mode=False) # Inf recursion bug when True 50 | box.edit() 51 | message = box.gather() 52 | remove_index = len(ending) + len(instructions) 53 | return message[remove_index + 15:] 54 | 55 | 56 | def create_editor(initial_text): 57 | locale.setlocale(locale.LC_ALL, '') 58 | code = locale.getpreferredencoding() 59 | initial_text = initial_text.encode(code, 'replace') # or 'ignore' 60 | msg = curses.wrapper(emacs_textbox, initial_text) 61 | return msg 62 | 63 | 64 | def main(): 65 | initial_text = u""" 66 | This is my po\xe9m 67 | It is not very clever 68 | But I'm fond of it 69 | """ 70 | msg = create_editor(initial_text) 71 | print msg 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /geo_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # geo_converter.py 3 | # Saito 2016 4 | """This is a module for converting from the the geocoordinate, radius 5 | convention to the bounding box convention. It allows use of the same 6 | parameter file for the Twitter REST API and Streaming API. 7 | 8 | Usage: 9 | bounding_box = convert_geocoordinates( 10 | latitude_degrees, longitude_degrees, radius_miles) 11 | where 12 | bounding_box == [southwest corner, northeast corner] == 13 | [lon1, lat1, lon2, lat2] 14 | 15 | and can be used when calling the streaming API. Notice it's longitude 16 | then latitude! 17 | 18 | """ 19 | from __future__ import division 20 | 21 | import math 22 | import sys 23 | from geopy.geocoders import Nominatim 24 | 25 | 26 | def deg_to_rads(degs): 27 | rads = (degs / 360) * 2 * math.pi 28 | return rads 29 | 30 | 31 | def convert_geocoordinates(latitude_degrees, longitude_degrees, radius_miles): 32 | """latitude and longitude in degrees, radius in miles, are converted 33 | to a bounding box representation, where box = [lat1, lon1, lat2, 34 | lon2] 35 | This doesn't work near the poles! 36 | """ 37 | if latitude_degrees > 85 or latitude_degrees < -85: 38 | print "latitude is >85 or < -85. This won't work near poles!" 
39 | sys.exit(0) 40 | if longitude_degrees > 180 or longitude_degrees < -180: 41 | print "longitude is >180 or < -180" 42 | sys.exit(0) 43 | 44 | radius_km = radius_miles / 0.62137 45 | if radius_km > 100: 46 | print "bounding box may be inaccurate for large radii" 47 | # print radius_km 48 | circ_of_earth_km = 40075.1612 49 | lat_rads = deg_to_rads(latitude_degrees) 50 | circ_of_earth_km_at_lat = math.cos(abs(lat_rads)) * circ_of_earth_km 51 | # print circ_of_earth_km_at_lat 52 | lon_km_per_degree = circ_of_earth_km_at_lat / 360 53 | print "lon_km_per_degree = {} at {}".format(lon_km_per_degree, 54 | latitude_degrees) 55 | lon_delta = radius_km / lon_km_per_degree 56 | # print "longitudinal delta = {}".format(lon_delta) 57 | lon1 = longitude_degrees - lon_delta 58 | lon2 = longitude_degrees + lon_delta 59 | # print lon1 60 | # print lon2 61 | # check if within range: 62 | 63 | # equator to pole distance in km 64 | eq2pole = 10001.965729 65 | lat_km_per_degree = eq2pole / 90 66 | lat_delta = radius_km / lat_km_per_degree 67 | # print lat_delta 68 | lat1 = latitude_degrees - lat_delta 69 | lat2 = latitude_degrees + lat_delta 70 | 71 | # check all points and correct if possible 72 | lat1 = correct_latitude(lat1) 73 | lat2 = correct_latitude(lat2) 74 | lon1 = correct_longitude(lon1) 75 | lon2 = correct_longitude(lon2) 76 | bounding_box = [lon1, lat1, lon2, lat2] 77 | return bounding_box 78 | 79 | 80 | def correct_longitude(lon): 81 | if lon > 180: 82 | return -180 + (lon - 180) 83 | elif lon < -180: 84 | return 180 - (abs(lon) - 180) 85 | else: 86 | return lon 87 | 88 | 89 | def correct_latitude(lat): 90 | if lat > 90 or lat < -90: 91 | print "This doesn't work near the poles!!!!" 92 | sys.exit(0) 93 | return lat 94 | 95 | 96 | def get_bounding_box_from(GeoSearchClass): 97 | latitude = GeoSearchClass._latitude 98 | longitude = GeoSearchClass._longitude 99 | radius = GeoSearchClass._radius 100 | bounding_box = convert_geocoordinates(latitude, longitude, radius) 101 | return bounding_box 102 | 103 | 104 | def get_search_terms_from(GeoSearchClass): 105 | """parses search_term string of form "", "sf", "#sf+#tech" from the 106 | params file and returns as list for use with streaming class 107 | 108 | """ 109 | search_string = GeoSearchClass._search_term 110 | if search_string == "" or search_string is None: 111 | search_terms = None 112 | else: 113 | search_terms = search_string.split("+") 114 | return search_terms 115 | 116 | 117 | # want to get geocoordinates for a location and visa versa 118 | # to do: 119 | # test! 120 | 121 | 122 | def get_geocoords_from_address(address): 123 | """address is a string, like '555 5th Ave. NYC, NY, 12021'. 
This is 124 | searched and an approximate geocoordinate is returned, if possible 125 | in form (latitude, longitude) 126 | 127 | """ 128 | geolocator = Nominatim() # from geopy.geocoders.Nominatim 129 | location = geolocator.geocode(address) 130 | lat = location.latitude 131 | lon = location.longitude 132 | coords = (lat, lon) 133 | 134 | # do some check to see if coords were returned 135 | if not coords: 136 | return None 137 | 138 | # maybe do some coordinate conversion 139 | print "found these coords = {}".format(coords) 140 | back_projected_address = geolocator.reverse("{}, {}".format(lat, lon)) 141 | print "back_projected_address = {}".format(back_projected_address) 142 | return coords 143 | 144 | 145 | # def get_timezone_from_coordinates(latitude, longitude): 146 | # """given a latitude and a longitude, this returns the IANA Time Zone 147 | # Database (Olson database), which can be used to get a local time and 148 | # returns a pytz tzinfo timezone object""" 149 | 150 | # from geopy.geocoders import GoogleV3 151 | # g=GoogleV3(api_key=None, domain='maps.googleapis.com', scheme='https', client_id=None, secret_key=None, timeout=1, proxies=None) 152 | # timezone = g.timezone(latitude, longitude) 153 | # return timezone 154 | 155 | 156 | if __name__ == '__main__': 157 | print __doc__ 158 | 159 | # run some tests 160 | bounding_box = convert_geocoordinates(0, -122.4093, 0) 161 | print "longitudinal precision should be 111.32" 162 | print "should be same first and second" 163 | print "bounding_box = {}".format(bounding_box) 164 | 165 | bounding_box = convert_geocoordinates(37.7821, -122.4093, 1) 166 | print "should be ~ [-122.426, 37.771, -122.398, 37.790 ]" 167 | print "bounding_box = {}".format(bounding_box) 168 | 169 | bounding_box = convert_geocoordinates(45, -179.99, 10) 170 | print "longitudinal precision should be 78.84" 171 | print "should be sensible around Meridian" 172 | print "bounding_box = {}".format(bounding_box) 173 | 174 | bounding_box = convert_geocoordinates(84.7821, -122, 10) 175 | print "longitudinal precision should be ~9" 176 | print "should be sensible around pole" 177 | print "bounding_box = {}".format(bounding_box) 178 | 179 | get_geocoords_from_address('901 S Van Ness Ave., San Francisco, CA, 94110') 180 | -------------------------------------------------------------------------------- /geosearchclass.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import ast 3 | import codecs 4 | import os 5 | import sys 6 | 7 | import utils 8 | 9 | 10 | class GeoSearchClass(object): 11 | """Create a geo search with data validation 12 | 13 | For examples of usages, see geotweets.py 14 | 15 | Usage: 16 | g = GeoSearchClass() 17 | g.latitude =37.7821 18 | g.longitude =-122.4093 19 | g.radius =3 20 | g.search_term="#SF" 21 | g.result_type='mixed' 22 | g.count = 15 23 | 24 | Simple example: 25 | g = GeoSearchClass() 26 | g.search() 27 | g.print_search_results() 28 | 29 | OR to properly initialize: 30 | g = GeoSearchClass(params_filename, consumer_key_and_secret_filename) 31 | 32 | To initialize the geosearchclass with a parameter file and the 33 | consumer key and secret file: 34 | g = GeoSearchClass(params_filename,consumer_key_and_secret_filename) 35 | and use: 36 | g.search() 37 | g.print_search_results() 38 | """ 39 | 40 | def __init__(self, params_file='params.txt', 41 | keys_file="consumerkeyandsecret", 42 | api=None): 43 | if params_file: 44 | self.set_params_from_file(params_file) 45 | else: 46 | 
self.use_default_params() 47 | self.keys_file = keys_file 48 | if api: 49 | self.api = api 50 | self.credits_retrieved = True 51 | # elif self.get_creds(keys_file): 52 | # self.credits_retrieved = True 53 | else: 54 | self.credits_retrieved = False 55 | 56 | def use_default_params(self): 57 | self._search_term = None 58 | self._result_type = "mixed" 59 | self._count = 15 60 | self._latitude = 37.7821 61 | self._longitude = -122.4093 62 | self._radius = 3 63 | self._geo_string = None 64 | self.search_results = None 65 | 66 | def set_params_from_file(self, filename): 67 | with codecs.open(filename, encoding='utf-8', mode='rU') as f: 68 | params = dict() 69 | params.update(ast.literal_eval(f.read())) 70 | for key in params.keys(): 71 | print key + ' : ' + str(params[key]) 72 | self._latitude = params['latitude'] 73 | self._longitude = params['longitude'] 74 | self._radius = params['radius'] 75 | self._search_term = params['search_term'] 76 | self._result_type = params['result_type'] 77 | self._count = params['count'] 78 | self.tweet_text = params['tweet_text'] 79 | 80 | def search(self): 81 | '''Perform a geolocated search using the class attributes 82 | 'search_term', 'result_type', 'count', and 'geo_string'. 83 | 84 | Requires an api object as returned by the tweepy module. 85 | 86 | USAGE: 87 | search_results = search(api) 88 | 89 | See: http://docs.tweepy.org/en/v3.5.0/api.html#API.search 90 | ''' 91 | if not self.credits_retrieved: 92 | (self.api, __) = utils.get_credentials(self.keys_file, True) 93 | self.credits_retrieved = True 94 | geo_string = getattr(self, "geo_string") 95 | if self._geo_string is None: 96 | raise Exception("initialize geo string") 97 | search_results = self.api.search(q=self._search_term, 98 | geocode=geo_string, 99 | result_type=self._result_type, 100 | count=self._count) 101 | self.search_results = search_results 102 | return self.search_results 103 | 104 | def print_search_results(self): 105 | '''Pretty prints the list of SearchResult objects returned using the 106 | api.search method. 107 | 108 | The results are formated and give some info about the tweet. 109 | 110 | ''' 111 | 112 | # printSROInfo() #This is for SRO object investigation 113 | search_results = self.search_results 114 | print "Actual number of tweets returned from Twitter: " + str(len( 115 | search_results)) 116 | 117 | for sr in search_results: 118 | print 119 | print '@' + sr.user.screen_name 120 | if sr.geo: 121 | print 'coordinates = ' + str((sr.geo)['coordinates']) 122 | print "created_at = " + str(sr.created_at) 123 | print "tweet id: " + str(sr.id) 124 | print "retweet_count = " + str( 125 | sr.retweet_count) + "favorite_count = " + str( 126 | sr.favorite_count) 127 | print sr.text 128 | 129 | def write_search_results(self, output_file=u'output.txt'): 130 | '''Writes search results to output file, defaults to "output.txt". 131 | 132 | 133 | USAGE: 134 | write_results( output_file = 'output.txt') 135 | 136 | 137 | Details: It uses unicode encoding to capture all of the 138 | possible tweet characters. It gets the filesystemencoding for 139 | each OS. 
140 | 141 | ''' 142 | search_results = self.search_results 143 | tweet_text = u'' 144 | for sr in search_results: 145 | coords = u'' 146 | if sr.geo: 147 | coords = u' coordinates = ' + str((sr.geo)['coordinates']) 148 | s = u'\n\n\n@' + sr.user.screen_name + coords + u' : \n' + sr.text 149 | 150 | tweet_text = tweet_text + s 151 | 152 | # print tweet_text 153 | # print "tweet text type = " + str(type(tweet_text)) 154 | fileSystemEncoding = sys.getfilesystemencoding() 155 | # OUTPUT_FILE = os.path.expanduser(u'./output.txt') 156 | OUTPUT_FILE = os.path.expanduser(u'./' + output_file) 157 | # with codecs.open(OUTPUT_FILE, encoding='utf-8', mode="w") as f: 158 | with codecs.open(OUTPUT_FILE, 159 | encoding=fileSystemEncoding, 160 | mode="w") as f: 161 | f.write(tweet_text) 162 | return 163 | 164 | def json_search_results(self, output_file='search_results.json'): 165 | '''Writes search results as json to output file 'search_results.json 166 | 167 | 168 | USAGE: 169 | json_search_results( output_file = 'search_results.json') 170 | 171 | 172 | Details: It uses unicode encoding to capture all of the 173 | possible tweet characters. It gets the filesystemencoding for 174 | each OS. 175 | 176 | ''' 177 | import json 178 | print 'writing results to file {}'.format(output_file) 179 | fileSystemEncoding = sys.getfilesystemencoding() 180 | # OUTPUT_FILE = os.path.expanduser(u'./output.txt') 181 | OUTPUT_FILE = os.path.expanduser(u'./' + output_file) 182 | # with codecs.open(OUTPUT_FILE, encoding='utf-8', mode="w") as f: 183 | with codecs.open(OUTPUT_FILE, 184 | encoding=fileSystemEncoding, 185 | mode="w") as f: 186 | for sr in self.search_results: 187 | j = json.dumps(sr._json, indent=1) 188 | f.write(j) 189 | return 190 | 191 | def _print_SRO_info(self): 192 | ''' 193 | This gives a verbose amount of info about the SearchResult object 194 | 195 | USAGE: 196 | print_SRO_info() 197 | ''' 198 | search_results = self.search_results 199 | print '\n\n\n\n' 200 | print 'The methods of each SearchResult object :' 201 | print dir(search_results[0]) 202 | print '\n\n\n\n' 203 | print 'The methods of each User object in a SRO:' 204 | print dir(search_results[0].user) 205 | print '\n\n\n\n' 206 | print 'Example of the first SRO object:' 207 | sr1 = search_results[0] 208 | print sr1.created_at 209 | # print sr1.retweets 210 | print sr1.retweet_count 211 | # print sr1.favorite 212 | # print sr1.favorited 213 | print sr1.favorite_count 214 | 215 | @property 216 | def count(self): 217 | "Number of results to return" 218 | return self._count 219 | 220 | @count.setter 221 | def count(self, value): 222 | if isinstance(value, basestring): 223 | value = float(value) 224 | if isinstance(value, (float, int)): 225 | if not (value > 0 and value < 101 and value == int(value)): 226 | raise ValueError( 227 | "count is '" + str(value) + 228 | "' but count must be an integer and 0 < count < 101") 229 | self._count = value 230 | 231 | @property 232 | def result_type(self): 233 | "Type of results to return: mixed, popular or recent" 234 | return self._result_type 235 | 236 | @result_type.setter 237 | def result_type(self, rt): 238 | if not (rt == "mixed" or rt == "popular" or rt == "recent"): 239 | raise ValueError( 240 | "result_type must be 'mixed', 'recent', or 'popular' NOT '" + 241 | str(rt) + "'") 242 | self._result_type = rt 243 | 244 | @property 245 | def latitude(self): 246 | "90 > Latitude > -90" 247 | return self._latitude 248 | 249 | @latitude.setter 250 | def latitude(self, value): 251 | if (value == ''): 252 | raise 
ValueError("You must put in a value") 253 | value = float(value) 254 | if not (value > -90.0 and value < 90.0): 255 | raise ValueError("latitude must be in bounds: 90.0>latitude>-90.0") 256 | self._latitude = value 257 | 258 | @property 259 | def longitude(self): 260 | "180 > Longitude > -180" 261 | return self._longitude 262 | 263 | @longitude.setter 264 | def longitude(self, value): 265 | if (value == ''): 266 | raise ValueError("You must put in a value") 267 | value = float(value) 268 | if not (value > -180.0 and value < 180.0): 269 | raise ValueError( 270 | "longitude must be in bounds: 180.0>longitude>-180.0") 271 | self._longitude = value 272 | 273 | @property 274 | def radius(self): 275 | "Radius of search, must be >0" 276 | return self._radius 277 | 278 | @radius.setter 279 | def radius(self, value): 280 | if (value == ''): 281 | raise ValueError("You must put in a value") 282 | value = float(value) 283 | if not (value > 0): 284 | raise ValueError("radius must be > 0.0 miles") 285 | self._radius = value 286 | 287 | @property 288 | def geo_string(self): 289 | "Formats the geo string using latitude, longitude and radius" 290 | self._geo_string = str(self._latitude) + "," + \ 291 | str(self._longitude) + "," + str(self._radius) + "mi" 292 | return self._geo_string 293 | 294 | 295 | def att_test(obj, atr, val_list): 296 | ''' 297 | Perform a unit test on attributes of a class 298 | 299 | USAGE: 300 | att_test(this_object, attribute_name_as_string, values_to_test_as_list) 301 | ''' 302 | print "\n\nTesting " + atr + " validation" 303 | for val in val_list: 304 | try: 305 | print "trying to set attribute to " + str(val) 306 | setattr(obj, atr, val) 307 | except ValueError as e: 308 | print e 309 | 310 | 311 | def main(): 312 | c = GeoSearchClass() 313 | 314 | print c.__doc__ 315 | print c.__dict__ 316 | # att_test(c, "count", [1,35, 101, -1, 3.5, "hello", "15"]) 317 | # att_test(c, "result_type", ["mixed","popular","recent","other",15, " mIxEd"]) 318 | # att_test(c,"latitude",[0, -90, 90, 300, "-50", "hello", 1.3]) 319 | # att_test(c,"longitude",[0, -180, 180, 300, "-100", "hello", 1.3]) 320 | # att_test(c,"radius",[0, -1, 10, 100, 1000]) 321 | print "\n\ncurrent geo_string " + c.geo_string 322 | print c.result_type 323 | 324 | 325 | if __name__ == '__main__': 326 | main() 327 | -------------------------------------------------------------------------------- /ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # ngrams.py 3 | # Saito 2017 4 | 5 | import random 6 | 7 | import utils 8 | 9 | 10 | def make_ngram(text, n): 11 | ngram = dict() 12 | tokens = utils.tokenize_normal_words(text) 13 | i = 0 14 | while i < (len(tokens)-(n-1)): 15 | l = list() 16 | for j in range(n-1): 17 | token = tokens[i+j] 18 | token = token.lower() 19 | # print token 20 | l.append(token) 21 | key = tuple(l) 22 | # print key 23 | value = tokens[i+n-1] 24 | value = value.lower() 25 | if key in ngram: 26 | ngram[key].append(value) 27 | else: 28 | ngram[key] = list() 29 | ngram[key].append(value) 30 | i += 1 31 | return ngram 32 | 33 | 34 | def generate(ngram, seed): 35 | """given an ngram dictionary and a string or tuple of words, this \ 36 | returns a word. 
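    For example, with the bigram dictionary {('this',): ['is']},
    generate(ngram, 'this') returns 'is' (this is the case exercised in
    main() below); a seed not present in the dictionary returns the
    empty string.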
For efficiency, pass in all words as a list""" 37 | if type(seed) is not tuple: 38 | l = list() 39 | tokens = utils.tokenize_normal_words(seed) 40 | tokens = [t.lower() for t in tokens] 41 | l.extend(tokens) 42 | seed = tuple(l) 43 | 44 | word = "" 45 | if seed in ngram: 46 | word = random.choice(ngram[seed]) 47 | # print "found in dictionary" 48 | # print ngram[seed] 49 | 50 | # elif words is None: 51 | # print "Combining all dictionary values." 52 | # words = sum(ngram.values(), []) 53 | # word = random.choice(words) 54 | # else: 55 | # word = random.choice(words) 56 | return word 57 | 58 | 59 | def make_bigram_trigram_dictionary(text): 60 | bigram = make_ngram(text, 2) 61 | # print bigram 62 | trigram = make_ngram(text, 3) 63 | # print trigram 64 | bigram.update(trigram) 65 | # print "printing bigram" 66 | # print bigram 67 | return bigram 68 | 69 | 70 | def main(): 71 | initial_text = u""" 72 | This is my poem. 73 | It is not very clever, 74 | But I'm fond of it. 75 | """ 76 | 77 | print initial_text 78 | ngram = make_bigram_trigram_dictionary(initial_text) 79 | word = generate(ngram, 'this') 80 | print "response should be 'is'" 81 | print word 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /p_files/LA.txt: -------------------------------------------------------------------------------- 1 | #LA 2 | 3 | {"latitude" : 34.043931 4 | , 5 | "longitude": -118.243936 6 | , 7 | "radius" : 100 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/clive.txt: -------------------------------------------------------------------------------- 1 | #Clive, Iowa 2 | 3 | {"latitude" : 41.608593 4 | , 5 | "longitude": -93.788713 6 | , 7 | "radius" : 100 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/nyc.txt: -------------------------------------------------------------------------------- 1 | #NYC 2 | 3 | {"latitude" : 40.734073 4 | , 5 | "longitude": -73.990663 6 | , 7 | "radius" : 10 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /p_files/nyc.txt~: -------------------------------------------------------------------------------- 1 | #NYC 2 | 3 | {"latitude" : 40.734073 4 | , 5 | "longitude": -73.990663 6 | , 7 | "radius" : 10 8 | , 9 | "search_term" : "" 10 | , 11 | "result_type" : "recent" 12 | , 13 | "count" : 100} 14 | -------------------------------------------------------------------------------- /params.txt: -------------------------------------------------------------------------------- 1 | # This file contains the parameters for the geo located search. 2 | 3 | # You must list all parameters in python dictionary format as shown to 4 | # use this input method. Just follow the example below. Note the 5 | # commas after each entry. Note: latitude, longitude, radius and 6 | # count all take numbers. Radius is set in miles and can be a decimal 7 | # or whole number like 0.1 or 3. search_term, result_type are both 8 | # strings and must be in quotes. search_term can be set to, for 9 | # example "#SF+tech" to use one hash tag search terms and one normal 10 | # word. 
It can also just be a word in quotes, "sf" or the word None 11 | # without quotes for no term. Result_type can be either "mixed", 12 | # "popular", or "recent". Count must be an integer between 0 and 100 13 | # OR None. Finally, a term called "tweet_text" is optional and is for 14 | # posting tweets on your behalf using the scan_and_respond tool. It 15 | # should be a unicode string (hence the preceding 'u'. 16 | # 17 | # Example of params.txt: 18 | # {"latitude" : 37.7821, 19 | # "longitude": -122.4093, 20 | # "radius" : 10, 21 | # "search_term" : "#SF+tech", 22 | # "result_type" : "mixed", 23 | # "count" : 100} 24 | 25 | 26 | 27 | {"latitude" : 37.772296 28 | , 29 | "longitude": -122.412911 30 | , 31 | "radius" : 10 32 | , 33 | "search_term" : "" 34 | , 35 | "result_type" : "mixed" 36 | , 37 | "count" : 100 38 | , 39 | "tweet_text" : u'''WASSUP! This tweet was written using the saito 'geotweets' project on github! Check it out!''' 40 | } 41 | 42 | 43 | -------------------------------------------------------------------------------- /real_time_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # real_time_vis.py 3 | # Saito 2015 4 | 5 | """This grabs tweets and visualizes them in real time using params.txt. 6 | 7 | You can get the tweets using the streaming API or the REST API. The 8 | rest API requires 5 second pauses between successive calls to the 9 | twitter server. This is the default. Use the --stream or -s flag to 10 | enable the streaming API. The Streaming API gets all tweets that are 11 | geotagged within the bounding box. The geolocation is approximately 12 | converted, by inscribing a bounding box square in the circle around 13 | the geocoordinates. The tweets are also saved in JSON form to 14 | a file called 'tweets.json'. 15 | 16 | USAGE: 17 | $ python real_time_vis.py [-h][-d][-f FILENAME][-n NUMBER][-s][-a ADDRESS] 18 | OR for help, try: 19 | $ ./real_time_vis.py -h 20 | OR: 21 | $ python real_time_vis.py 22 | 23 | 24 | Example using default parameter file 'params.txt', with 20 top words 25 | to display, on a growing chart: 26 | 27 | $ ./real_time_vis --number 20 28 | Or using the streaming API with an address: 29 | $ ./real_time_vis -n 20 -s -a "175 5th Avenue NYC" 30 | 31 | 32 | TO EXIT: 33 | To exit one of these multithreaded programs, use a keyboard interrupt 34 | like CTRL+C. 
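The bounding-box conversion mentioned above is roughly of the following
form (an illustrative sketch only, not the exact code; the real logic
lives in geo_converter.get_bounding_box_from and may differ in detail):

    half_side = radius / sqrt(2)      # square inscribed in the circle
    dlat = half_side / 69.0           # ~69 miles per degree of latitude
    dlon = half_side / (69.0 * cos(radians(latitude)))
    bounding_box = [longitude - dlon, latitude - dlat,   # SW corner
                    longitude + dlon, latitude + dlat]   # NE corner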
35 | 36 | """ 37 | from __future__ import division 38 | 39 | import Queue 40 | import argparse 41 | import sys 42 | 43 | import matplotlib.pyplot as plt 44 | 45 | import geo_converter 46 | import geosearchclass 47 | import streamer 48 | import utils 49 | 50 | 51 | global stream # so that CTRL + C kills stream 52 | 53 | 54 | def update_fdist(fdist, new_words): 55 | for word in new_words: 56 | if word in fdist: 57 | fdist[word] += 1 58 | else: 59 | fdist[word] = 1 60 | return fdist 61 | 62 | 63 | def remove_infrequent_words(samples, fdist): 64 | trimmed_samples = [] 65 | for item in samples: 66 | if fdist[item] > 2: 67 | trimmed_samples.append(item) 68 | return trimmed_samples 69 | 70 | 71 | def updating_plot(geosearchclass, number_of_words, grow=True): 72 | search_results = geosearchclass.search() 73 | filtered_words = utils.tokenize_and_filter(search_results) 74 | fdist = utils.get_freq_dist(filtered_words) 75 | # set up plot 76 | samples = [item for item, _ in fdist.most_common(number_of_words)] 77 | freqs = [fdist[sample] for sample in samples] 78 | plt.grid(True, color="silver") 79 | plt.plot(freqs, range(len(freqs))) 80 | plt.yticks(range(len(samples)), [s for s in samples]) 81 | plt.ylabel("Samples") 82 | plt.xlabel("Counts") 83 | plt.title("Top Words Frequency Distribution") 84 | plt.ion() 85 | plt.show() 86 | 87 | # set up loop 88 | old_ids = set([s.id for s in search_results]) 89 | for i in xrange(100): 90 | plt.pause(5) 91 | # use mixed above, change to recent here 92 | geosearchclass.result_type = "recent" 93 | # perturbation study 94 | # if i%2: # for testing purposes 95 | # # #change location every odd time to nyc 96 | # # geosearchclass.latitude =40.734073 97 | # # geosearchclass.longitude =-73.990663 98 | # # perturb latitude 99 | # geosearchclass.latitude =geosearchclass.latitude + .001 100 | 101 | # else: 102 | # #now back to sf 103 | # # geosearchclass.latitude = 37.7821 104 | # # geosearchclass.longitude = -122.4093 105 | # geosearchclass.longitude =geosearchclass.longitude + .001 106 | 107 | search_results = geosearchclass.search() 108 | new_search_results = utils.new_tweets(search_results, old_ids) 109 | if new_search_results: 110 | filtered_words = utils.tokenize_and_filter(new_search_results) 111 | fdist = update_fdist(fdist, filtered_words) 112 | if grow: 113 | newsamples = [item 114 | for item, _ in fdist.most_common(number_of_words) 115 | ] 116 | s1 = set(newsamples) 117 | s2 = set(samples) 118 | s1.difference_update(s2) 119 | if s1: 120 | print "New words: " + str(list(s1)) 121 | newsamples = list(s1) 122 | samples.extend(newsamples) 123 | plt.yticks(range(len(samples)), [s for s in samples]) 124 | freqs = [fdist[sample] for sample in samples] 125 | plt.plot(freqs, range(len(freqs))) 126 | if grow: 127 | plt.draw() 128 | print '%d new tweet(s)' % len(new_search_results) 129 | old_ids.update(set([s.id for s in new_search_results])) 130 | else: 131 | print "no updates" 132 | 133 | # g = geosearchclass.GeoSearchClass() 134 | # g.set_params_from_file('params.txt') 135 | # search_results = g.search() 136 | 137 | 138 | def updating_stream_plot(q, number_of_words=30): 139 | """This plot uses the streaming API to get real time twitter 140 | information from a given region, determined by a geo-coordinate 141 | bounding box. The upper left and lower right determine the 142 | bounding box. 143 | 144 | q is a queue instance, which holds tweets 145 | 146 | number_of_words determines the average number of words in the 147 | plot. 
Once the plot reaches 2 x number_of_words, it is shrunk down 148 | to the new set of words and starts growing again 149 | 150 | To exit the program early, hit CTRL + Z to stop the python script 151 | and then CTRL + D twice to kill the terminal process and close the 152 | window. 153 | 154 | """ 155 | setup = False 156 | fdist = None 157 | samples = None 158 | draw_time = 0.1 159 | samples = [] 160 | plt.ion() 161 | plt.grid(True, color="silver") 162 | 163 | for i in range(100000): 164 | status = q.get() 165 | search_results = [status] 166 | while not q.empty(): 167 | print "getting another tweet" 168 | status = q.get() 169 | search_results.append(status) 170 | 171 | if not setup: 172 | print "Gathering enough data to begin plotting" 173 | while len(samples) < 1: 174 | status = q.get() 175 | search_results.append(status) 176 | filtered_words = utils.tokenize_and_filter(search_results) 177 | if fdist is None: 178 | fdist = utils.get_freq_dist(filtered_words) 179 | else: 180 | fdist = update_fdist(fdist, filtered_words) 181 | n_words = min(10, len(fdist)) 182 | samples = [item for item, _ in fdist.most_common(n_words)] 183 | # print "len(samples) = {}".format(len(samples)) 184 | samples = remove_infrequent_words(samples, fdist) 185 | freqs = [fdist[sample] for sample in samples] 186 | plt.plot(freqs, range(len(freqs))) 187 | plt.yticks(range(len(samples)), [s for s in samples]) 188 | plt.ylabel("Samples") 189 | plt.xlabel("Counts") 190 | plt.title("Top Words Frequency Distribution") 191 | plt.show() 192 | plt.pause(draw_time) 193 | setup = True 194 | 195 | else: 196 | filtered_words = utils.tokenize_and_filter(search_results) 197 | fdist = update_fdist(fdist, filtered_words) 198 | newsamples = [item 199 | for item, _ in fdist.most_common(number_of_words)] 200 | newsamples = remove_infrequent_words(newsamples, fdist) 201 | s1 = set(newsamples) 202 | s2 = set(samples) 203 | s1.difference_update(s2) 204 | if s1: 205 | print "New words: " + str(list(s1)) 206 | newsamples = list(s1) 207 | samples.extend(newsamples) 208 | if len(samples) > 2*number_of_words: 209 | samples = newsamples 210 | plt.close() 211 | plt.yticks(range(len(samples)), [s for s in samples]) 212 | freqs = [fdist[sample] for sample in samples] 213 | plt.plot(freqs, range(len(freqs))) 214 | plt.draw() 215 | plt.pause(draw_time) 216 | kill_plot() 217 | return 218 | 219 | 220 | def kill_plot(): 221 | print "turning interactive off" 222 | plt.ioff() 223 | print "closing plot" 224 | plt.close() 225 | return 226 | 227 | 228 | def get_parser(): 229 | """ Creates a command line parser 230 | 231 | --doc -d 232 | --help -h 233 | --filename -f 234 | --grow -g 235 | --number -n 236 | """ 237 | # Create command line argument parser 238 | parser = argparse.ArgumentParser( 239 | description='Create an updating word frequency distribution chart.') 240 | 241 | parser.add_argument('-d', 242 | '--doc', 243 | action='store_true', 244 | help='print module documentation and exit') 245 | parser.add_argument( 246 | '-f', 247 | '--filename', 248 | help='''specify a FILENAME to use as the parameter file. 249 | If not specified, will use 'params.txt'.''') 250 | parser.add_argument( 251 | '-a', 252 | '--address', 253 | help='''give an ADDRESS to get geocoordinates for. 
254 | Put the address in quotes''') 255 | # parser.add_argument('-r', 256 | # '--rest', 257 | # action='store_true', 258 | # help='Use the REST API to create a growing chart\ 259 | # as new words arrive.') 260 | parser.add_argument('-n', 261 | '--number', 262 | help='specify NUMBER of words to display. The\ 263 | streaming plot will grow to twice this number\ 264 | before shrinking again') 265 | parser.add_argument('-s', 266 | '--stream', 267 | action='store_true', 268 | help='Use streaming API to update a growing plot. \ 269 | Otherwise, results will be batched.\ 270 | Use Interrupt signal, like CTRL + C to exit. \ 271 | This uses the LOCATION and SEARCH_TERM from\ 272 | parameter file. The tweets are saved to tweets.json.') 273 | return parser 274 | 275 | 276 | def main(): 277 | parser = get_parser() 278 | args = parser.parse_args() 279 | # print args 280 | # print args.help 281 | 282 | if args.doc: 283 | print __doc__ 284 | import sys 285 | sys.exit(0) 286 | 287 | if args.number: 288 | number = int(args.number) 289 | else: 290 | number = 30 291 | 292 | g = geosearchclass.GeoSearchClass() 293 | 294 | if args.filename: 295 | print 'Using parameters from ' + str(args.filename) 296 | g.set_params_from_file(args.filename) 297 | else: 298 | print "Using search values from params.txt" 299 | g.set_params_from_file('params.txt') 300 | 301 | if args.address: 302 | print "Finding geocoordates for address:\n{}".format(args.address) 303 | coords = geo_converter.get_geocoords_from_address(args.address) 304 | if coords: 305 | g.latitude = coords[0] 306 | print "Found this latitude:" 307 | print g.latitude 308 | g.longitude = coords[1] 309 | print "Found this longitude:" 310 | print g.longitude 311 | else: 312 | print "Failed to find coordinates. Exiting." 313 | sys.exit() 314 | 315 | if args.stream: 316 | print "using streaming queue" 317 | q = Queue.Queue() 318 | bounding_box = geo_converter.get_bounding_box_from(g) 319 | search_terms = geo_converter.get_search_terms_from(g) 320 | print "bounding_box = {}".format(bounding_box) 321 | print "search_terms = {}".format(search_terms) 322 | global stream 323 | fn = 'tweets.json' 324 | stream = streamer.start_stream(q, bounding_box, fn, search_terms) 325 | updating_stream_plot(q, number) 326 | else: 327 | print "using REST API updating plot" 328 | updating_plot(g, number, True) # set grow flag to True 329 | 330 | 331 | if __name__ == '__main__': 332 | try: 333 | main() 334 | except KeyboardInterrupt: 335 | print "Main function interrupted" 336 | if "stream" in globals(): 337 | streamer.kill_stream(stream) 338 | kill_plot() 339 | sys.exit() 340 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # sample.py 3 | # Saito 2015 4 | 5 | """This program is for grabbing and saving a sample of geo-located tweets 6 | 7 | 8 | USAGE: 9 | $ ./sample.py [-h][-d][-v][-f FILENAME][-o OUTPUT][-vis] 10 | 11 | Print command line help: 12 | $ ./sample.py --help (or just -h) 13 | 14 | Example: This uses parameter file 'params.txt', prints results to 15 | command line and writes them to 'out.txt': 16 | $ ./sample.py --verbose --filename params.txt --output out.txt 17 | 18 | The program requires a file in this folder called consumerkeyandsecret 19 | which contains only a consumer key on the first line and consumer 20 | secret (the longer one) on the second line. See README. 
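Note that searching only needs those first two lines (application-only
authorization). Tools that post tweets or use the streaming API also
need an access token on the third line and an access token secret on
the fourth line; see utils.get_credentials for the exact format.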
21 | 22 | The program can optionally take a parameter file as input. Please see 23 | the file "params.txt" for an example. 24 | 25 | Example of params.txt: 26 | {"latitude" : 37.7821, 27 | "longitude": -122.4093, 28 | "radius" : 10, 29 | "search_term" : "#SF+tech", 30 | "result_type" : "mixed", 31 | "count" : 15} 32 | 33 | """ 34 | 35 | import sys 36 | import argparse 37 | import geosearchclass 38 | 39 | 40 | def get_parser(): 41 | """ Creates a command line parser 42 | 43 | --doc -d 44 | --help -h 45 | --filename -f 46 | --verbose -v 47 | --output -o 48 | --visualize -vis 49 | --default 50 | """ 51 | 52 | parser = argparse.ArgumentParser( 53 | description='Perform a geo-located search.') 54 | 55 | parser.add_argument( 56 | '-d', '--doc', action='store_true', 57 | help='print module documentation and exit') 58 | parser.add_argument( 59 | '-f', '--filename', 60 | help='''specify a FILENAME to use as the parameter file. 61 | If not specified, will use 'params.txt'.''') 62 | parser.add_argument( 63 | '-v', '--verbose', action='store_true', 64 | help='additionally print output to command line') 65 | parser.add_argument( 66 | '--default', action='store_true', 67 | help="""ignore parameter file and use default search 68 | terms from geosearchclass""") 69 | parser.add_argument( 70 | '-o', '--output', 71 | help='''specify an OUTPUT file to write to. 72 | Default is output.txt''') 73 | parser.add_argument( 74 | '-j', '--json', 75 | help='''specify an OUTPUT JSON file to write to.''') 76 | parser.add_argument('-vis', '--visualize', 77 | action='store_true', help='visualize using nlp tools') 78 | 79 | # automatically grabs arguments from sys.argv[] 80 | 81 | return parser 82 | 83 | 84 | def main(): 85 | 86 | parser = get_parser() 87 | args = parser.parse_args() 88 | 89 | if args.doc: 90 | print __doc__ 91 | sys.exit() 92 | 93 | g = geosearchclass.GeoSearchClass() 94 | 95 | if args.filename: 96 | print 'Using parameters from ' + str(args.filename) 97 | # turn parameter file into dictionary 98 | g.set_params_from_file(args.filename) 99 | else: 100 | if args.default: 101 | print 'Using default search terms' 102 | else: 103 | print 'Using parameters from params.txt' 104 | g.set_params_from_file('params.txt') 105 | 106 | g.search() 107 | # print formatted results with extra info to terminal 108 | if args.verbose: 109 | g.print_search_results() 110 | 111 | if args.output: 112 | g.write_search_results(args.output) 113 | else: 114 | g.write_search_results() 115 | 116 | if args.json: 117 | g.json_search_results(args.json) 118 | 119 | if args.visualize: 120 | import utils 121 | filtered_words = utils.tokenize_and_filter(g.search_results) 122 | utils.visualize(filtered_words) 123 | 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /scan_and_respond.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # real_time_vis.py 3 | # Saito 2016 4 | 5 | """ 6 | Scans tweets and asks the user to verify them before sending a tweet response. 7 | 8 | A queue is created of tweets as they arrive via the REST API. The user 9 | is then asked to look over these tweets and decide if they are 10 | relevant. If they are, the relevant parts are saved to a JSON file. If 11 | they respond flag -r was passed, a public tweet is sent out with the 12 | user tagged in it. 
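Each saved entry is the simplified tweet returned by
utils.get_simplified_tweet, i.e. a list of the form
[user, isotime, id_string, text, loc_name, loc]; responder() later
reads the user, tweet id and text back out of these entries when
composing replies.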
13 | 14 | """ 15 | 16 | from __future__ import division 17 | 18 | import Queue 19 | import argparse 20 | import codecs 21 | import json 22 | import threading 23 | import sys 24 | import time 25 | 26 | import geo_converter 27 | import geosearchclass 28 | import tweeter 29 | import utils 30 | from utils import new_tweets 31 | 32 | 33 | def scan(geosearchclass, q): 34 | global keep_scanning 35 | search_results = geosearchclass.search() 36 | old_ids = [sr.id for sr in search_results] 37 | for s in search_results: 38 | q.put(s) 39 | while keep_scanning: 40 | for i in range(5): 41 | if keep_scanning: 42 | time.sleep(1) 43 | else: 44 | return 45 | geosearchclass.result_type = "recent" 46 | search_results = geosearchclass.search() 47 | new_search_results = new_tweets(search_results, old_ids) 48 | if new_search_results: 49 | for nsr in new_search_results: 50 | q.put(nsr) 51 | return 52 | 53 | 54 | def verify(geosearchclass, filename): 55 | q = Queue.Queue() 56 | global keep_scanning 57 | keep_scanning = True 58 | thread = threading.Thread(target=scan, args=(geosearchclass, q)) 59 | thread.daemon = True 60 | thread.start() 61 | respond = False 62 | with codecs.open(filename, encoding='utf-8', mode='a') as json_file: 63 | json_file.seek(0) 64 | json_file.truncate() 65 | 66 | print """\n\n\tThis program will present a series of tweets and ask for you to 67 | verify if they should be responded to. If so, they will be saved 68 | to the JSON file. When you quit scanning, the public tweets will 69 | be sent out.\n""" 70 | 71 | print """Would you like to send tweet responses at the end of this verification 72 | session?""" 73 | response = "" 74 | while response != 'y' and response != 'n': 75 | response = raw_input("[y for Yes, n for No] : ") 76 | print response 77 | if response == 'y': 78 | respond = True 79 | elif response == 'n': 80 | respond = False 81 | 82 | first = True 83 | while True: 84 | if q.empty(): 85 | time.sleep(5) 86 | continue 87 | status = q.get() 88 | print "\n\nVerify if this tweet is what you want:" 89 | simplified_tweet = utils.get_simplified_tweet(status) 90 | response = "" 91 | while response != 'y' and response != 'n' and response != 'q': 92 | response = raw_input("[y for Yes, n for No, q for Quit] : ") 93 | if response == 'y': 94 | j = json.dumps(simplified_tweet, indent=1) 95 | if first: 96 | json_file.write('[\n') 97 | json_file.write(j) 98 | first = False 99 | continue 100 | json_file.write(',\n') 101 | json_file.write(j) 102 | elif response == 'n': 103 | continue 104 | elif response == 'q': 105 | keep_scanning = False 106 | thread.join() 107 | json_file.write('\n]') 108 | break 109 | responder(geosearchclass, respond, filename) 110 | return 111 | 112 | 113 | def responder(geosearchclass, respond, filename): 114 | if not respond: 115 | print "No responses sent!" 116 | return 117 | with codecs.open(filename, encoding='utf-8', mode='rU') as json_file: 118 | json_string = json_file.read() 119 | tweets = json.loads(json_string) 120 | for tweet in tweets: 121 | user = tweet[0] 122 | response_text = geosearchclass.tweet_text + u" @" + user 123 | if len(response_text) > 140: 124 | raise ValueError("Tweet text is > 140 characters. Can't post. 
\ 125 | Shorten the tweet text in the params file") 126 | id = int(tweet[2]) 127 | users_text = tweet[3] 128 | 129 | print "\n\n\nPlease confirm you want to respond to this tweet" 130 | print user 131 | print users_text 132 | print "with this text: " 133 | print response_text 134 | response = raw_input("[y for Yes, anything for No] : ") 135 | if response == 'y': 136 | status = tweeter.tweet(geosearchclass.api, response_text, id) 137 | print "This tweet was posted: " 138 | utils.get_simplified_tweet(status) 139 | 140 | return 141 | 142 | 143 | def get_parser(): 144 | """ Creates a command line parser 145 | 146 | --doc -d 147 | --help -h 148 | --filename -f 149 | --respond -r 150 | """ 151 | # Create command line argument parser 152 | parser = argparse.ArgumentParser( 153 | description='Create an updating word frequency distribution chart.') 154 | 155 | parser.add_argument('-d', 156 | '--doc', 157 | action='store_true', 158 | help='print module documentation and exit') 159 | parser.add_argument( 160 | '-f', 161 | '--filename', 162 | help='''specify a FILENAME to use as the parameter file. 163 | If not specified, will use 'params.txt'.''') 164 | parser.add_argument('-a', 165 | '--address', 166 | help='''give an ADDRESS to get geocoordinates for.''') 167 | parser.add_argument( 168 | '-o', '--output', 169 | help='''specify an OUTPUT file to write to. 170 | Default is tweets.json''') 171 | return parser 172 | 173 | 174 | def main(): 175 | parser = get_parser() 176 | args = parser.parse_args() 177 | 178 | if args.doc: 179 | print __doc__ 180 | import sys 181 | sys.exit(0) 182 | 183 | # pass in an API to GeoSearchClass to get full access for posting 184 | (api, __) = utils.get_credentials('consumerkeyandsecret', False) 185 | g = geosearchclass.GeoSearchClass('params.txt', None, api) 186 | 187 | if args.filename: 188 | print 'Using parameters from ' + str(args.filename) 189 | g.set_params_from_file(args.filename) 190 | else: 191 | print "Using search values from params.txt" 192 | g.set_params_from_file('params.txt') 193 | 194 | if args.output: 195 | fn = str(args.output) 196 | else: 197 | fn = 'tweets.json' 198 | print 'Output file: ' + fn 199 | 200 | if args.address: 201 | print "Finding geocoordates for address:\n{}".format(args.address) 202 | coords = geo_converter.get_geocoords_from_address(args.address) 203 | if coords: 204 | g.latitude = coords[0] 205 | g.longitude = coords[1] 206 | else: 207 | print "Failed to find coordinates" 208 | sys.exit() 209 | 210 | verify(g, fn) 211 | 212 | 213 | if __name__ == '__main__': 214 | try: 215 | main() 216 | except KeyboardInterrupt: 217 | print "Main function interrupted" 218 | print "JSON file may be in incomplete format" 219 | sys.exit() 220 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # scraper.py 3 | # Saito 2017 4 | 5 | import argparse 6 | import sys 7 | import urllib 8 | 9 | from bs4 import BeautifulSoup 10 | 11 | import utils 12 | 13 | 14 | def scrape(url): 15 | html = urllib.urlopen(url).read() 16 | soup = BeautifulSoup(html, "html.parser") 17 | [x.extract() for x in soup.find_all('script')] 18 | text = soup.get_text(" ", strip=True) 19 | # ftext = text.split(" ") 20 | return text 21 | 22 | 23 | def scrape_and_save_to_file(url, filename="scraped_text.txt"): 24 | text = scrape(url) 25 | utils.save_file(filename, text) 26 | return text 27 | 28 | 29 | def get_parser(): 30 | """ Creates a command 
line parser 31 | 32 | --doc -d 33 | --help -h 34 | --url -u 35 | --output -o 36 | 37 | This automatically grabs arguments from sys.argv[] 38 | """ 39 | 40 | parser = argparse.ArgumentParser( 41 | description='Scrape a website.') 42 | 43 | parser.add_argument( 44 | '-d', '--doc', action='store_true', 45 | help='print module documentation and exit') 46 | parser.add_argument( 47 | '-u', '--url', 48 | help='''specify a url to scrape. Use the full name like 49 | http://www.cnn.com''') 50 | parser.add_argument( 51 | '-o', '--output', 52 | help='''specify an OUTPUT file to write to. 53 | Default is scraped_text.txt''') 54 | 55 | return parser 56 | 57 | 58 | def main(): 59 | parser = get_parser() 60 | args = parser.parse_args() 61 | 62 | if args.doc: 63 | print __doc__ 64 | sys.exit() 65 | 66 | if args.url: 67 | url = args.url 68 | else: 69 | url = "http://chrisnovello.com/teaching/risd/computer-utopias/" 70 | 71 | if args.output: 72 | print '\nwriting file to ' + str(args.output) 73 | output_file = args.output 74 | else: 75 | print "\nwriting to scraped_text.txt" 76 | output_file = "scraped_text.txt" 77 | text = scrape_and_save_to_file(url, output_file) 78 | 79 | # # Example 80 | # url = "http://chrisnovello.com/teaching/risd/computer-utopias/" 81 | # # text = scrape(url) 82 | # text = scrape_and_save_to_file(url) 83 | print text 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | 89 | -------------------------------------------------------------------------------- /streamer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ This is a utility that allows tweets to be read off in real time 3 | 4 | To stop, use a KeyboardInterrupt like CTRL + C""" 5 | 6 | 7 | import Queue 8 | import json 9 | import sys 10 | import threading 11 | import time 12 | 13 | import tweepy 14 | 15 | import utils 16 | 17 | 18 | global stream # so that CTRL + C kills stream 19 | 20 | 21 | class ListenerQueue(tweepy.streaming.StreamListener): 22 | """A StreamListener implementation for accessing Twitter Streaming API 23 | that writes to a queue object sent on initialization. 
24 | 25 | Usage: myListener = ListenerQueue(queue) 26 | Stream(authorization, myListener) 27 | 28 | """ 29 | 30 | def __init__(self, queue, filename, search_terms): 31 | super(ListenerQueue, self).__init__() 32 | self.queue = queue 33 | self.search_terms = search_terms 34 | self.json_file = open(filename, 'a') 35 | self.json_file.seek(0) 36 | self.json_file.truncate() 37 | 38 | def has_all_search_terms(self, text): 39 | for term in self.search_terms: 40 | if text.find(term) > -1: 41 | continue 42 | else: 43 | return False 44 | return True 45 | 46 | def on_status(self, status): 47 | text = status.text 48 | if self.search_terms: 49 | if not self.has_all_search_terms(text): 50 | return True 51 | 52 | self.queue.put(status) 53 | # sj = status._json 54 | sj = utils.get_simplified_tweet(status) 55 | # filter_lev = status.filter_level 56 | # print filter_lev 57 | j = json.dumps(sj, indent=1) 58 | self.json_file.write(j) 59 | return True 60 | 61 | def on_error(self, status): 62 | # error codes: https://dev.twitter.com/overview/api/response-codes 63 | print status 64 | if status == 420: 65 | print "Too many attempts made to contact the Twitter server" 66 | print "Wait awhile to use the tool again" 67 | return False # returning False in on_data disconnects the stream 68 | 69 | def on_disconnect(self): 70 | super(ListenerQueue, self).on_disconnect() 71 | print "stream disconnected" 72 | self.json_file.close() 73 | if self.json_file.closed: 74 | print "json file closed successfully" 75 | 76 | 77 | # def stream_to_json_file(fn='tweets.json'): 78 | # auth = get_creds() 79 | # L = ListenerJSON(fn) 80 | # stream = Stream(auth, L) 81 | # stream.filter(locations=[-122.75, 36.8, -121.75, 37.8], async=True) 82 | # # can find terms: by adding track=['python'] 83 | # print "waiting 15s" 84 | # time.sleep(15) 85 | # print "terminating" 86 | # stream.disconnect() 87 | # L.json_file.close() 88 | 89 | 90 | def get_tweets_from_q(queue): 91 | while True: 92 | status = queue.get(True, 5) 93 | print u"Tweet Message : {}\n\n".format(status.text) 94 | queue.task_done() 95 | 96 | 97 | def start_stream(q, bounding_box, fn='tweets.json', search_terms=None): 98 | '''Takes in a Queue object, a bounding_box of [lon, lat, lon, lat] for 99 | SW and NE corners, a filename and a search term list. Examples in: 100 | bounding_box = geo_converter.get_bounding_box_from(g) 101 | search_terms = geo_converter.get_search_terms_from(g) 102 | ''' 103 | global stream 104 | (__, auth) = utils.get_credentials("consumerkeyandsecret", False) 105 | L = ListenerQueue(q, fn, search_terms) 106 | stream = tweepy.Stream(auth, L) 107 | stream.filter(locations=bounding_box, filter_level='none', async=True) 108 | # if search_terms: 109 | # # OR semantics: 110 | # stream.filter(locations=bounding_box, track=search_terms, async=True) 111 | # else: 112 | # stream.filter(locations=bounding_box, async=True) 113 | return stream 114 | 115 | 116 | def kill_stream(stream): 117 | if stream: 118 | print "attempting to disconnect stream from kill_stream" 119 | stream.disconnect() 120 | print "closing file in 1 second..." 
121 | time.sleep(1) 122 | stream.listener.json_file.close() 123 | else: 124 | print "stream not set" 125 | 126 | 127 | def main(): 128 | print __doc__ 129 | 130 | q = Queue.Queue() 131 | bounding_box = [-122.75, 36.8, -121.75, 37.8] 132 | global stream 133 | stream = start_stream(q, bounding_box) 134 | 135 | # t = threading.Thread(target=start_stream, args=(q, bounding_box)) 136 | # t.daemon = True 137 | # t.start() 138 | # t.join() 139 | # print "waiting 15s" 140 | # time.sleep(15) 141 | # kill_stream(stream) 142 | 143 | # stream_to_json_file() 144 | 145 | # get_tweets_from_q(q) 146 | # now read in the files 147 | # https://dev.twitter.com/streaming/overview/request-parameters 148 | 149 | 150 | if __name__ == '__main__': 151 | try: 152 | main() 153 | except KeyboardInterrupt: 154 | print "Main function interrupted" 155 | if "stream" in globals(): 156 | print "trying to kill stream" 157 | kill_stream(stream) 158 | sys.exit() 159 | 160 | 161 | 162 | 163 | # class ListenerJSON(StreamListener): 164 | # """A StreamListener implementation for accessing Twitter Streaming API 165 | # that writes to a JSON file 166 | 167 | # """ 168 | 169 | # def __init__(self, filename): 170 | # super(ListenerJSON, self).__init__() 171 | # self.json_file = open(filename, 'a') 172 | 173 | # def on_status(self, status): 174 | # # print data 175 | # # print u"Tweet Message : {}\n\n".format(status.text) 176 | # print type(status) 177 | # sj = status._json 178 | # j = json.dumps(sj, indent=1) 179 | # self.json_file.write(j) 180 | # return True 181 | 182 | # def on_error(self, status): 183 | # # error codes: https://dev.twitter.com/overview/api/response-codes 184 | # print status 185 | # if status == 420: 186 | # return False # returning False in on_data disconnects the stream 187 | 188 | # def on_disconnect(self): 189 | # super(ListenerJSON, self).on_disconnect() 190 | # print "made it to disconnector" 191 | # self.json_file.close() 192 | -------------------------------------------------------------------------------- /suggest_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # suggest_bot.py 3 | # Saito 2017 4 | 5 | """This creates robot assisted poems! 6 | 7 | 8 | """ 9 | import argparse 10 | import random 11 | import sys 12 | 13 | import geo_converter 14 | import editor 15 | import geosearchclass 16 | import ngrams 17 | import utils 18 | import write 19 | 20 | 21 | def create_poem(g=None, default_words=None, ngram=None): 22 | """ This creates a poem with user input by suggesting from the words supplied. 23 | 24 | A user can use the word, decline the word, or add their own input. 25 | g is for geosearchclass. It is none by default. 26 | default_words is a list of words that can be enabled by default. 27 | """ 28 | words = [] 29 | formatted_poem = '''''' 30 | # for no, yes and finish (print poem) 31 | options = ['y', 'n', 's', 'd', 'r', 'e', 'f', '\n'] 32 | keep_adding = True 33 | added_default = False 34 | use_phrases = False 35 | random_word = False 36 | print "\n\n\n" 37 | print """ 38 | 39 | This robot poet will present a series of suggestions. You can 40 | either use these suggestions, edit them, or type your own 41 | input. You may also add more words from geolocated tweets to 42 | your word corpus. The words you choose or add will be 43 | succeessively added to a poem, which will be printed and saved 44 | to an output file. To add a new line, type '\\n'. To finish 45 | writing type f (for finish). 
46 | 47 | y: yes use this word 48 | n: no, skip this and give me a new phrase 49 | s: search: add more geolocated terms from twitter 50 | d: default words added to corpus 51 | r: get random word, when running markov model 52 | e: edit the text 53 | \\n: enter line 54 | f: finish 55 | 56 | """ 57 | 58 | if ngram: 59 | print "Populating seed words from markov chain ngram" 60 | values = sum(ngram.values(), []) 61 | words.extend(values) 62 | chosen = "" 63 | while keep_adding: 64 | if len(words) == 0: 65 | print "Nothing in corpus. Type d for default words or s to search\ 66 | twitter" 67 | if ngram and formatted_poem and not random_word: 68 | tokens = utils.tokenize_normal_words(formatted_poem) 69 | num = random.random() 70 | potential_word = "" 71 | if len(tokens) > 0: 72 | # This is for trigrams 73 | if num > 0.66 and len(tokens) > 1: # 50% of time get trigram 74 | potential_word = tokens_to_word(tokens, ngram, 2) 75 | if potential_word: 76 | chosen = potential_word 77 | else: 78 | potential_word = tokens_to_word(tokens, ngram, 1) 79 | if potential_word: 80 | chosen = potential_word 81 | else: 82 | chosen = random.choice(words) 83 | elif num > 0.33: # 30% of time get bigram 84 | potential_word = tokens_to_word(tokens, ngram, 1) 85 | if potential_word: 86 | chosen = potential_word 87 | else: 88 | chosen = random.choice(words) 89 | else: # 20% of time get random word 90 | chosen = random.choice(words) 91 | else: 92 | chosen = random.choice(words) 93 | elif words: 94 | chosen = random.choice(words) 95 | random_word = False 96 | else: 97 | pass 98 | if chosen: 99 | print chosen, 100 | response_string = " " + str(options) + " or your own :" 101 | response = raw_input(response_string) 102 | # include the chosen word: 103 | if response == "y": 104 | if len(words) == 0: 105 | continue 106 | formatted_poem = formatted_poem + ''' ''' + chosen 107 | print 108 | print formatted_poem 109 | continue 110 | elif response == "n": 111 | continue 112 | elif response == "r": 113 | random_word = True 114 | elif response == "s": 115 | print "Searching geo-located tweets to add to vocab" 116 | print "This can only be used once every 5 seconds" 117 | if g is None: 118 | g = geosearchclass.GeoSearchClass() 119 | search_results = g.search() 120 | 121 | phrase_response = "" 122 | while phrase_response not in ["y", "n"]: 123 | phrase_response = raw_input("\nWould you like to use phrases (\ 124 | (otherwise, just words)? 
[y/n]: ") 125 | if phrase_response == "y": 126 | list_of_info_dicts = write.parse_tweets(search_results) 127 | filtered_words = [] 128 | if len(list_of_info_dicts) < 1: 129 | filtered_words = utils.tokenize_and_filter( 130 | search_results) 131 | else: 132 | for d in list_of_info_dicts: 133 | filtered_words.append(d['phrase']) 134 | elif phrase_response == "n": 135 | filtered_words = utils.tokenize_and_filter(search_results) 136 | else: 137 | continue 138 | print "\n\n\nAdding these Twitter words: " 139 | print filtered_words 140 | print "\n" 141 | words.extend(filtered_words) 142 | continue 143 | elif response == "d": 144 | if not added_default: 145 | print "\nadding in these words to corpus:" 146 | print default_words 147 | print "\n\n\n" 148 | words.extend(default_words) 149 | options.remove('d') 150 | added_default = True 151 | elif response == "e": 152 | formatted_poem = editor.create_editor(formatted_poem) 153 | print formatted_poem 154 | elif response not in options: 155 | response = response.replace('\\n', '\n') 156 | formatted_poem = formatted_poem + ''' ''' + response 157 | print 158 | print formatted_poem 159 | continue 160 | elif response == "f": 161 | print 162 | print formatted_poem 163 | keep_adding = False 164 | return formatted_poem 165 | 166 | 167 | def tokens_to_word(tokens, ngram, n): 168 | seed = tokens[-n:] 169 | t = tuple(seed) 170 | word = ngrams.generate(ngram, t) 171 | return word 172 | 173 | 174 | def get_parser(): 175 | """ Creates a command line parser 176 | 177 | --doc -d 178 | --help -h 179 | --params -p 180 | --input -i 181 | --markov -m 182 | --output -o 183 | --address -a 184 | 185 | This automatically grabs arguments from sys.argv[] 186 | """ 187 | 188 | parser = argparse.ArgumentParser( 189 | description='Create a robot assisted poem.') 190 | 191 | parser.add_argument( 192 | '-d', '--doc', action='store_true', 193 | help='print module documentation and exit') 194 | parser.add_argument( 195 | '-m', '--markov', 196 | help='''specify a TEXT file to mimic. This will\ 197 | Train a markov chain word predictor\ 198 | using this text. This will basically create a robotic poet!''') 199 | parser.add_argument( 200 | '-p', '--params', 201 | help='''specify a PARAMS file to use as the parameter file. 202 | If not specified, will use 'params.txt' for searches.''') 203 | parser.add_argument( 204 | '-i', '--input', 205 | help='''specify an input file to use as word seed file. 206 | ''') 207 | parser.add_argument( 208 | '-o', '--output', 209 | help='''specify an OUTPUT file to write to. 210 | Default is output.txt''') 211 | parser.add_argument( 212 | '-a', 213 | '--address', 214 | help='''give an ADDRESS to get geocoordinates for.''') 215 | 216 | return parser 217 | 218 | 219 | def main(): 220 | parser = get_parser() 221 | args = parser.parse_args() 222 | 223 | if args.doc: 224 | print __doc__ 225 | sys.exit() 226 | 227 | g = geosearchclass.GeoSearchClass() 228 | 229 | if args.params: 230 | print 'Using parameters from ' + str(args.params) 231 | # turn parameter file into dictionary 232 | g.set_params_from_file(args.params) 233 | 234 | if args.address: 235 | print "Finding geocoordates for address:\n{}".format(args.address) 236 | coords = geo_converter.get_geocoords_from_address(args.address) 237 | if coords: 238 | g.latitude = coords[0] 239 | print "Found this latitude:" 240 | print g.latitude 241 | g.longitude = coords[1] 242 | print "Found this longitude:" 243 | print g.longitude 244 | else: 245 | print "Failed to find coordinates. Exiting." 
246 | sys.exit() 247 | 248 | if args.input: 249 | text = utils.load_file(args.input) 250 | tokens = utils.tokenize_normal_words(text) 251 | for_poem = utils.filter_words(tokens) 252 | else: 253 | for_poem = get_default_words() 254 | 255 | if args.markov: 256 | if args.input: 257 | raise StandardError("Can only input a single text file. \ 258 | use --markov ") 259 | else: 260 | text = utils.load_file(args.markov) 261 | # ngram = ngrams.make_ngram(text, 2) 262 | ngram = ngrams.make_bigram_trigram_dictionary(text) 263 | formatted_poem = create_poem(g, for_poem, ngram) 264 | else: 265 | formatted_poem = create_poem(g, for_poem) 266 | 267 | if args.output: 268 | print '\nwriting formatted poem to ' + str(args.output) 269 | output_file = args.output 270 | else: 271 | print "\nwriting formatted poem to poem.txt" 272 | output_file = "poem.txt" 273 | 274 | utils.save_file(output_file, formatted_poem) 275 | 276 | 277 | def get_default_words(): 278 | # These are some good default words used in the poem creator above 279 | for_poem = [ # emerging tech shit 280 | 'Agricultural', 'ecological', 'systems', 'meat', 'genetically', 281 | 'modified', 'precision', 'vertical', 'farming', 'printing', 'contour', 282 | 'crafting', 'artificial', 'uterus', 'transplant', 'cryonics', 283 | 'vitrification', 'suspended animation', 'de-extinction', 284 | 'genetic engineering', 'gene therapy', 'life extension', 285 | 'engineered negligible senescence', 286 | 'nanomedicine', 'nanosensors', 'regenerative', 'medicine', 287 | 'stem-cell', 'tissue engineering', 'robot assisted surgery', 288 | 'synthetic biology', 'synthetic genomics', 'virus', 289 | 'whole genome sequencing', 'bionic contact lens', 290 | 'head-mounted display', 'virtual', 291 | 'retinal', 'e-textiles', 'molecular', 'electronics', 'thermal', 292 | 'copper', 'pillar', 'airborne wind turbine', 'artificial', 293 | 'photosynthesis', 'biofuels', 'solar', 'power', 'fusion', 'fuel cell', 294 | 'molten salt', 'photovoltaic', 'translation', 'machine vision', 295 | 'speech recognition', 'fourth-generation', 'optical discs', 'storage', 296 | 'holographic data', 'millipede', 'optical computing', 297 | 'quantum computing', 'quantum cryptography', 'RFID', 'software-defined', 298 | 'three-dimensional', 'integrated', 'circuit', 'artificial muscle', 299 | 'superconductivity', 'superfluidity', 'metamaterials', 'cloaking', 300 | 'metal', 'multi-function', 'superalloy', 'synthetic diamond', 301 | 'weapon', 'laser', 'particle-beam', 'coilgun', 'plasma', 'stealth', 302 | 'brain computer interface', 'retinal implant', 303 | 'self reconfiguring modular robot', 'swarm robotics', 'pulse', 304 | 'solar sail', 'backpack', 305 | 'helicopter', 'delivery drone', 'detonation', 'engine', 'driverless\ 306 | car', 'automated', 'vacuum', 'collection', 'cloak', 'immersive', 307 | 'dilemma', 308 | # japanese shit 309 | 'august', 'black', 'chinese', 'Gaugin', 'heaven', 'illusion', 310 | 'island', 'Kibune', 'Michinoku', 'milky', 'Mogami', 'mother', 311 | 'mount', 'mountain', 'Musashi', 'night', 'observe', 'October', 312 | 'portrait', 'river', 'Roman', 'SUNSHINE', 'should', 'submit', 313 | 'tangled', 'Tokiwa', 'washing', 'watching', 'world', 'Yoshino', 314 | 'actual', 'admires', 'after', 'afterlife', 'again', 'against', 315 | 'alive', 'almost', 'always', 'amidah', 'ancient', 'another', 316 | 'armor', 'armored', 'arrayed', 'arrows', 'autumn', 'autumns', 317 | 'awakening', 'bamboo', 'bathe', 'beads', 'become', 'becoming', 318 | 'begins', 'behind', 'between', 'beyond', 'birth', 'blade', 319 | 'blind', 
'bloom', 'blooming', 'blossoms', 'break', 'breaks', 320 | 'breeze', 'bridge', 'brings', 'brother', 'brush', 'buried', 321 | 'burning', 'butterfly', 'calligraphy', 'calling', 'camellia', 322 | 'cancer', 'candle', 'canyon', 'caress', 'carry', 'ceaseless', 323 | 'cedars', 'center', 'certain', 'change', 'chanted', 'chases', 324 | 'cherries', 'cherry', 'child', 'chill', 'chorus', 'chrysanthemum', 325 | 'chrysanthemums', 'cicada', 'clock', 'closer', 'color', 'combing', 326 | 'compare', 'completely', 'content', 'continent', 'corona', 327 | 'could', 'crest', 'crossing', 'curve', 'dancers', 'darkens', 328 | 'darkness', 'death', 'deepens', 'delusions', 'deserted', 329 | 'destitute', 'distance', 'dream', 'dreaming', 'dreams', 'drips', 330 | 'drops', 'drums', 'dying', 'early', 'eclipse', 'egret', 'ended', 331 | 'entangling', 'escaped', 'evening', 'every', 'exhausted', 332 | 'faintly', 'falling', 'falls', 'feeling', 'field', 'finished', 333 | 'fireflies', 'firefly', 'fireworks', 'first', 'flash', 'flesh', 334 | 'flies', 'float', 'flowers', 'flowing', 'flows', 'follow', 335 | 'forever', 'forlorn', 'forth', 'fragile', 'frozen', 'garden', 336 | 'gates', 'gauntlet', 'gauzy', 'gazing', 'geese', 'giant', 337 | 'glances', 'going', 'grapes', 'grass', 'grasses', 'guards', 338 | 'guided', 'gunshots', 'harbor', 'heart', 'heaven', 'hillside', 339 | 'holding', 'horse', 'house', 'houses', 'hundred', 'hydrangea', 340 | 'idling', 'image', 'insane', 'interrogation', 'invisible', 341 | 'irrevocable', 'itself', 'journey', 'juice', 'karma', 'killed', 342 | 'knotty', 'knowing', 'knowledge', 'later', 'leave', 'leaving', 343 | 'letting', 'light', 'lightning', 'lilacs', 'limit', 'little', 344 | 'lodging', 'longing', 'looks', 'loving', 'making', 'mantle', 345 | 'marshes', 'memories', 'messengers', 'meteor', 'midnight', 346 | 'might', 'mirror', 'mirrored', 'missed', 'month', 'moonlight', 347 | 'mother', 'motorcycle', 'mouth', 'moving', 'myself', 'night', 348 | 'nightingale', 'nights', 'north', 'nothing', 'nowhere', 'ocean', 349 | 'octopus', 'opening', 'orchid', 'other', 'paradise', 'parting', 350 | 'passes', 'passions', 'pattern', 'pealing', 'pears', 'people', 351 | 'period', 'petal', 'place', 'plain', 'planters', 'playing', 352 | 'poems', 'poppy', 'press', 'primal', 'primeval', 'purple', 353 | 'quivered', 'rabbits', 'radiation', 'radio', 'rapids', 'reaches', 354 | 'reality', 'really', 'recklessly', 'reconciled', 'relax', 355 | 'remember', 'replies', 'returning', 'right', 'ripple', 'ripples', 356 | 'rising', 'river', 'riverbank', 'rocky', 'rowing', 'running', 357 | 'saying', 'seals', 'seeing', 'serpent', 'shadow', 'shall', 358 | 'shaped', 'shattered', 'shell', 'shelves', 'shift', 'shining', 359 | 'shore', 'short', 'shower', 'sided', 'silkworm', 'silkworms', 360 | 'single', 'sleep', 'slept', 'slightest', 'slowly', 'smell', 361 | 'snail', 'soiled', 'soldiers', 'solitary', 'somehow', 'something', 362 | 'sometimes', 'sound', 'speak', 'spill', 'spilling', 'spray', 363 | 'spreads', 'spring', 'squid', 'stable', 'stars', 'station', 364 | 'steel', 'stirrups', 'stolen', 'stomach', 'stone', 'storm', 365 | 'straighten', 'strands', 'strange', 'straw', 'streaming', 366 | 'stripes', 'study', 'submit', 'summer', 'sunlight', 'sunrise', 367 | 'sunset', 'sutra', 'sweet', 'swimsuit', 'tangled', 'taste', 368 | 'temple', 'tethered', 'their', 'there', 'these', 'thighs', 369 | 'thing', 'things', 'think', 'thought', 'thousand', 'throat', 370 | 'through', 'throughout', 'tiger', 'tight', 'tossing', 'total', 371 | 'toward', 'trace', 'transferred', 
'traps', 'truth', 'turning', 372 | 'turns', 'twilight', 'unborn', 'under', 'utterly', 'vanished', 373 | 'village', 'visible', 'waiting', 'wandering', 'warrior', 374 | 'warriors', 'washed', 'water', 'waves', 'weight', 'where', 375 | 'which', 'whistling', 'white', 'whitecaps', 'willow', 'wings', 376 | 'winter', 'wisteria', 'without', 'woman', 'world', 'yanking', 377 | 'years', 'yesterday', 'yielded', 'young'] 378 | return for_poem 379 | 380 | if __name__ == '__main__': 381 | main() 382 | 383 | -------------------------------------------------------------------------------- /test_real_time_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # test_real_time_vis.py 3 | # Saito 2015 4 | """ Test unit for real_time_vis """ 5 | 6 | import unittest 7 | import time 8 | from utils import new_tweets 9 | from real_time_vis import update_fdist 10 | import geosearchclass 11 | import utils 12 | 13 | 14 | class TestRTV(unittest.TestCase): 15 | 16 | def setUp(self): 17 | self.g = geosearchclass.GeoSearchClass() 18 | self.g.latitude = 37.7821 19 | self.g.longitude = -122.4093 20 | self.g.radius = 100 21 | self.g.search_term = "" 22 | self.g.result_type = 'mixed' 23 | self.g.count = 100 24 | self.sr = self.g.search() 25 | 26 | def test_new_tweets(self): 27 | sr2 = self.sr[0:10] # 10 old same one 28 | old = [s.id for s in sr2] 29 | old = set(old) 30 | print 'len(sr) = %d' % len(self.sr) 31 | print 'len(sr2) = %d' % len(sr2) 32 | self.assertEqual( 33 | len(new_tweets(self.sr, old)), 90) 34 | 35 | sr2 = self.sr 36 | old = [s.id for s in sr2] 37 | old = set(old) 38 | self.assertEqual( 39 | len(new_tweets(self.sr, old)), 0) 40 | 41 | self.g.latitude = 40.734073 42 | self.g.longitude = -73.990663 43 | self.g.radius = 10 44 | self.g.search_term = "" 45 | self.g.result_type = 'mixed' 46 | self.g.count = 10 47 | sr2 = self.g.search() # all different (15 old different ones) 48 | old = [s.id for s in sr2] 49 | old = set(old) 50 | self.assertEqual( 51 | len(new_tweets(self.sr, old)), 100) 52 | 53 | def test_update_fdist(self): 54 | filtered_words = utils.tokenize_and_filter(self.sr) 55 | fdist = utils.get_freq_dist(filtered_words) 56 | # take distribution and send it empty list 57 | fdist2 = update_fdist(fdist, []) 58 | self.assertEqual(fdist, fdist2) 59 | 60 | time.sleep(5) 61 | self.g.latitude = 40.734073 62 | self.g.longitude = -73.990663 63 | self.g.count = 100 64 | self.sr = self.g.search() 65 | filtered_words = utils.tokenize_and_filter(self.sr) 66 | # updating with entirely new word set -> should be longer 67 | old_len_fdist = len(fdist) 68 | fdist = update_fdist(fdist, filtered_words) 69 | self.assertTrue(len(fdist) > old_len_fdist) 70 | 71 | def tearDown(self): 72 | pass 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /test_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # test_write.py 3 | # Saito 2015 4 | 5 | """ Test unit for write """ 6 | 7 | import unittest 8 | # import time 9 | # import geosearchclass 10 | import nltk 11 | import logging 12 | from write import traverse_tree_grab_phrase 13 | from write import traverse_tree_grab_phrases 14 | from write import parse_sentence 15 | from write import get_grammar 16 | 17 | 18 | class TestWrite(unittest.TestCase): 19 | # def __init__(self): 20 | # super(TestWrite, self).__init__() 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | pass 
25 | # self.g = geosearchclass.GeoSearchClass() 26 | # self.g.latitude = 37.7821 27 | # self.g.longitude = -122.4093 28 | # self.g.radius = 10 29 | # self.g.search_term = "" 30 | # self.g.result_type = 'mixed' 31 | # self.g.count = 2 32 | # self.sr = self.g.search() 33 | 34 | def setUp(self): 35 | # set to DEBUG, INFO, WARNING, ERROR, CRITICAL : 36 | logging.basicConfig( 37 | format='%(levelname)s: %(message)s', level=logging.INFO) 38 | self.tokens = nltk.word_tokenize( 39 | 'Numerous passing references to the phrase have occurred in movies') 40 | self.grammar = get_grammar('grammar_20ids_HM0VM0.pickle') 41 | self.tree = parse_sentence(self.tokens, self.grammar) 42 | 43 | def test_traverse_tree_grab_phrase(self): 44 | print 'printing tree!!!' 45 | print self.tree 46 | 47 | label = 'VP' 48 | phrase = traverse_tree_grab_phrase(self.tree, label) 49 | print "For label {} returned this phrase: {}".format(label, phrase) 50 | self.assertEqual(phrase, 'have occurred in movies') 51 | 52 | label = 'NP' 53 | phrase = traverse_tree_grab_phrase(self.tree, label) 54 | print "For label {} returned this phrase: {}".format(label, phrase) 55 | self.assertEqual(phrase, 'Numerous passing references') 56 | 57 | label = 'PP' 58 | phrase = traverse_tree_grab_phrase(self.tree, label) 59 | print "For label {} returned this phrase: {}".format(label, phrase) 60 | self.assertEqual(phrase, 'to the phrase') 61 | 62 | def test_traverse_tree_grab_phrases(self): 63 | # # Now testing other function 64 | labels = [u'VP', u'NP', u'PP'] 65 | phrases = dict.fromkeys(labels) 66 | for k in phrases.keys(): 67 | phrases[k] = [] 68 | phrases = traverse_tree_grab_phrases(self.tree, phrases) 69 | for k, v in phrases.items(): 70 | print '{} : {}'.format(k, v) 71 | self.assertEqual( 72 | phrases['NP'], ['Numerous passing references', 73 | 'the phrase', 'movies']) 74 | self.assertEqual( 75 | phrases['VP'], ['have occurred in movies', 'occurred in movies']) 76 | self.assertEqual(phrases['PP'], ['to the phrase']) # maybe 'in movies' 77 | 78 | def tearDown(self): 79 | pass 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | # suite = unittest.TestLoader().loadTestsFromTestCase(TestWrite) 84 | # unittest.TextTestRunner(verbosity=2).run(suite) 85 | 86 | # tw = TestWrite() 87 | # tw.setUp() 88 | # tw.test_traverse_tree_grab_phrases() 89 | -------------------------------------------------------------------------------- /tweeter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """This is a utility module that allows a user to send tweets and 4 | read timelines""" 5 | 6 | import geosearchclass 7 | import utils 8 | 9 | 10 | def tweet(api, text, in_reply_to_status_id=None): 11 | """Send a tweet, possibly in response to another tweet 12 | 13 | REF: http://docs.tweepy.org/en/v3.5.0/api.html#API.update_status 14 | """ 15 | if len(text) > 140: 16 | raise ValueError("Text is over 140 Characters. 
Can\'t tweet") 17 | return 18 | if in_reply_to_status_id: 19 | status = api.update_status( 20 | status=text, in_reply_to_status_id=in_reply_to_status_id) 21 | else: 22 | status = api.update_status(status=text) 23 | return status 24 | 25 | 26 | def get_user_timeline(api, screen_name, count=20): 27 | """ 28 | This returns a users timeline 29 | 30 | REF: http://docs.tweepy.org/en/v3.5.0/api.html#API.user_timeline 31 | """ 32 | statuses = api.user_timeline( 33 | screen_name=screen_name, count=count) 34 | return statuses 35 | # API.user_timeline( 36 | # [id/user_id/screen_name][, since_id][, max_id][, count][, page]) 37 | 38 | 39 | def main(): 40 | print __doc__ 41 | print tweet.__name__ 42 | print tweet.__doc__ 43 | print get_user_timeline.__name__ 44 | print get_user_timeline.__doc__ 45 | 46 | # TESTING 47 | # (api, __) = utils.get_credentials('consumerkeyandsecret', False) 48 | # g = geosearchclass.GeoSearchClass('params.txt', None, api) 49 | 50 | # Robotic Tweet: 51 | # print g.tweet_text 52 | # tweet_text = g.tweet_text + " @SaitoGroup" 53 | # print tweet_text 54 | # api = g.api 55 | # status = tweet(api, tweet_text, 745399390219739137) 56 | # utils.get_simplified_tweet(status) 57 | 58 | 59 | # Get user timeline: 60 | # screen_name = "SaitoGroup" 61 | # print "returning user timeline for {}".format(screen_name) 62 | # statuses = get_user_timeline(g, screen_name, 50) 63 | # for status in statuses: 64 | # utils.get_simplified_tweet(status) 65 | # print "\n NEXT TWEET \n" 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # NLTK stuff 2 | 3 | """This is a utils file for the other programs. 4 | 5 | It contains Natural language processing tools from NLTK, some basic 6 | visualizer, a tweet status object info extractor and a new tweet 7 | identifier. 8 | 9 | """ 10 | 11 | 12 | import codecs 13 | import os 14 | import re 15 | import sys 16 | 17 | import nltk 18 | import tweepy 19 | from nltk.corpus import stopwords 20 | 21 | 22 | def get_credentials(keys_file="consumerkeyandsecret", app_only=True): 23 | '''This function gives credentials to the API. 24 | 25 | When app_only is true, application only authorization level 26 | credentials are supplied. This is sufficient for searching tweet 27 | history. It must be False for streaming access and to post tweets. 28 | 29 | It requires that your consumerkeyandsecret have 4 lines, with the 30 | consumer key on the first line, the secret on the next and then an 31 | access token on the 3rd and the access token secret on the 32 | 4th. You can get these by logging on to your twitter account and 33 | creating an app. 
34 | 35 | USAGE: (api, auth) = get_creds(keys_file, [app_only=[True/False]]) 36 | The second argument is optional 37 | 38 | ''' 39 | with open(keys_file, 'rU') as myfile: 40 | auth_data = [line.strip() for line in myfile] 41 | CONSUMER_KEY = auth_data[0] 42 | CONSUMER_SECRET = auth_data[1] 43 | auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) 44 | if not app_only: 45 | ACCESS_TOKEN = auth_data[2] 46 | ACCESS_TOKEN_SECRET = auth_data[3] 47 | auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET) 48 | api = tweepy.API(auth) 49 | return (api, auth) 50 | 51 | 52 | def get_simplified_tweet(status): 53 | """ Takes in a tweet status object and parses it""" 54 | user = status.user.screen_name 55 | print user 56 | d = status.created_at 57 | isotime = d.isoformat() 58 | print isotime 59 | id_string = status.id_str 60 | print id_string 61 | loc_name = None 62 | loc = None 63 | if status.place: 64 | if status.place.full_name: 65 | loc_name = status.place.full_name 66 | print loc_name 67 | if status.place.bounding_box: 68 | loc = status.place.bounding_box.origin() 69 | print loc 70 | text = status.text 71 | print text 72 | simplified_tweet = [user, isotime, id_string, text, loc_name, loc] 73 | return simplified_tweet 74 | 75 | 76 | def new_tweets(new_sr, old_ids): 77 | '''returns only search_results that do not have ids listed in old_ids 78 | new_sr is the new search results, old_ids is a set of ids 79 | 80 | ''' 81 | new_tweets = [] 82 | if old_ids: 83 | new_tweets = [sr for sr in new_sr if sr.id not in old_ids] 84 | else: 85 | new_tweets = new_sr 86 | return new_tweets 87 | 88 | 89 | def get_freq_dist(word_list): 90 | """Returns a frequency distribution for a list of words""" 91 | fdist = nltk.probability.FreqDist(word_list) 92 | return fdist 93 | 94 | 95 | def tokenize_and_filter(search_results): 96 | """Tokenizes and then filters search results""" 97 | tokens = tokenize_results(search_results) 98 | filtered_words = filter_words(tokens) 99 | return filtered_words 100 | 101 | 102 | def tokenize_results(search_results): 103 | """This takes in search_results i.e. status return from a twitter 104 | search and tokenizes the results""" 105 | tweet_text = u'' 106 | for sr in search_results: 107 | tweet_text = tweet_text + sr.text 108 | tokenizer = nltk.tokenize.casual.TweetTokenizer() 109 | tokens = tokenizer.tokenize(tweet_text) 110 | # tokens = nltk.tokenize.word_tokenize(tweet_text) 111 | return tokens 112 | 113 | 114 | def tokenize_normal_words(text_string): 115 | """ This takes in a normal string and tokenizes it into a word list """ 116 | tokens = nltk.word_tokenize(text_string) 117 | return tokens 118 | 119 | 120 | def filter_words(word_list): 121 | """remove stop words and do some basic filtering""" 122 | tokens = [word.lower() for word in word_list] 123 | filtered_words = [ 124 | word for word in tokens if word not in stopwords.words('english')] 125 | # remove urls with another filter using reg expressions 126 | p = re.compile(r'//t.co/') 127 | filtered_words = [word for word in filtered_words if not p.match(word)] 128 | p2 = re.compile(r'https') 129 | filtered_words = [word for word in filtered_words if not p2.match(word)] 130 | filtered_words = [word for word in filtered_words if len(word) > 2] 131 | return filtered_words 132 | 133 | 134 | def visualize(word_list): 135 | """Takes in a word list and visualizes the distribution of the top 30 words. 
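
    A minimal usage sketch (assuming search_results came from a
    GeoSearchClass.search() call elsewhere in this project):

        words = tokenize_and_filter(search_results)
        fdist = visualize(words)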
136 | 137 | This works well when combined with tokenize_and_filter(search_results).""" 138 | # import matplotlib 139 | # matplotlib.use('qt4agg') # workaround for virtual environments 140 | import matplotlib.pyplot as plt 141 | 142 | fdist = get_freq_dist(word_list) 143 | textOb = nltk.text.Text(word_list) 144 | print "\nCollocations: " 145 | print textOb.collocations() 146 | # fdist.plot(30) 147 | samples = [item for item, _ in fdist.most_common(30)] 148 | freqs = [fdist[sample] for sample in samples] 149 | 150 | plt.grid(True, color="silver") 151 | plt.plot(freqs, range(1, 1+len(freqs))) 152 | plt.yticks(range( 153 | 1, 1 + len(samples)), [s for s in samples], rotation=0) 154 | plt.ylabel("Samples") 155 | plt.xlabel("Counts") 156 | plt.show() 157 | return fdist 158 | 159 | 160 | def save_file(filename, text): 161 | fileSystemEncoding = sys.getfilesystemencoding() 162 | OUTPUT_FILE = os.path.expanduser(u'./' + filename) 163 | with codecs.open(OUTPUT_FILE, 164 | encoding=fileSystemEncoding, 165 | mode="w") as f: 166 | f.write(text) 167 | 168 | 169 | def load_file(filename): 170 | fileSystemEncoding = sys.getfilesystemencoding() 171 | # with codecs.open(filename, encoding='utf-8', mode='rU') as f: 172 | with codecs.open(filename, encoding=fileSystemEncoding, mode='rU') as f: 173 | text = f.read() 174 | return text 175 | -------------------------------------------------------------------------------- /write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # write.py 3 | # Saito 2015 4 | 5 | """ 6 | This program classifies tweets into phrase type. 7 | 8 | It produces a JSON array, "phrases.json" with properties: 9 | phrase 10 | tweeter 11 | type 12 | geolocation 13 | """ 14 | # TODO: 15 | # Try a faster parser, like chart parser or something 16 | 17 | 18 | import nltk 19 | import json 20 | import cPickle 21 | import re 22 | import types 23 | import logging 24 | from nltk.corpus import treebank 25 | from nltk import treetransforms 26 | #from nltk.grammar import WeightedProduction, Nonterminal 27 | from nltk.grammar import ProbabilisticProduction, Nonterminal 28 | 29 | 30 | class PCFGViterbiParser(nltk.ViterbiParser): 31 | 32 | def __init__(self, grammar, trace=0): 33 | super(PCFGViterbiParser, self).__init__(grammar, trace) 34 | 35 | def parse(self, tokens): 36 | tagged = nltk.pos_tag(tokens) 37 | missing = False 38 | for tok, pos in tagged: 39 | if not self._grammar._lexical_index.get(tok): 40 | missing = True 41 | self._grammar._productions.append( 42 | ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001)) 43 | # WeightedProduction(Nonterminal(pos), [tok], prob=0.000001)) 44 | if missing: 45 | self._grammar._calculate_indexes() 46 | 47 | # returns a generator, so call 'next' to get the ProbabilisticTree 48 | tree = super(PCFGViterbiParser, self).parse(tokens) 49 | if issubclass(tree.__class__, nltk.tree.Tree): 50 | print 'returning a tree' 51 | return tree 52 | elif isinstance(tree, types.GeneratorType): 53 | try: 54 | return next(tree) 55 | except(StopIteration): 56 | tweet = ' '.join(tokens) 57 | print u'Couldn\'t parse {}'.format(tweet) 58 | return None 59 | else: 60 | error("Type of tree is: {}".format(type(tree))) 61 | 62 | 63 | def train_pcfg(): 64 | print 'training grammar' 65 | productions = [] 66 | # print len(treebank.fileids()) 67 | trees = [] 68 | # up to 199 less for shorter grammar for quicker training 69 | for fileid in treebank.fileids()[0:20]: 70 | for tree in treebank.parsed_sents(fileid): 71 | # 
perform optional tree transformations, e.g.:
72 |             # Remove branches A->B->C into A->B+C so we can avoid infinite
73 |             # productions
74 |             tree.collapse_unary(collapsePOS=False)
75 |             # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
76 |             # horizontal and vertical Markovization: remember parents and siblings in tree
77 |             # This gives a performance boost, but makes the grammar HUGE
78 |             # If we use these we would need to implement a tag forgetting method
79 |             #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
80 |             tree.chomsky_normal_form()
81 |             productions += tree.productions()
82 |     S = nltk.Nonterminal('S')
83 |     grammar = nltk.induce_pcfg(S, productions)
84 |     print "grammar trained!"
85 |     return grammar
86 | 
87 | 
88 | def traverse_tree_grab_phrases(tree, phrases):
89 |     """Finds all examples of each label and returns the phrases dictionary.
90 | 
91 |     Usage: phrases = traverse_tree_grab_phrases(tree, phrases)
92 | 
93 |     Phrases is a dictionary with a key for each label you wish to
94 |     find, and each value is a list.
95 |     """
96 | 
97 |     for subtree in tree:
98 |         logging.debug('type of subtree= {}'.format(type(subtree)))
99 |         if issubclass(subtree.__class__, nltk.tree.Tree):
100 |             logging.debug('this subtree has label {}'.format(subtree.label()))
101 |             if subtree.label() in phrases.keys():
102 |                 logging.debug('found {} label'.format(subtree.label()))
103 |                 tokens = subtree.leaves()
104 |                 phrase = ' '.join(tokens)
105 |                 logging.debug(u'which has this phrase \n {}'.format(phrase))
106 |                 phrases[subtree.label()].append(phrase)
107 |             logging.debug('going one deeper')
108 |             phrases = traverse_tree_grab_phrases(subtree, phrases)
109 |         elif type(subtree) == unicode:
110 |             logging.debug(subtree)
111 |     return phrases
112 | 
113 | 
114 | def traverse_tree_grab_phrase(tree, label):
115 |     """Finds the first example of the label and returns the phrase.
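    For example, with the demo sentence 'Numerous passing references to
    the phrase have occurred in movies' parsed by parse_sentence, the
    label 'NP' yields 'Numerous passing references' (test_write.py
    checks 'VP' and 'PP' the same way).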
116 | 117 | Usage: phrase = traverse_tree_grab_phrase(tree, 'VP') 118 | 119 | For exhaustive search try the sister function 120 | traverse_tree_grab_phrases(tree,phrases) 121 | """ 122 | phrase = None 123 | logging.debug("tree type: {}".format(type(tree))) 124 | 125 | for subtree in tree: 126 | logging.debug('type of subtree= {}'.format(type(subtree))) 127 | if issubclass(subtree.__class__, nltk.tree.Tree): 128 | logging.debug('this subtree has label {}'.format(subtree.label())) 129 | logging.debug('subtree {} == label {} : {}'.format( 130 | subtree.label(), label, subtree.label() == label)) 131 | if subtree.label() == label: 132 | logging.debug('found {} label'.format(label)) 133 | tokens = subtree.leaves() 134 | phrase = ' '.join(tokens) 135 | logging.debug(u'which has this phrase \n {}\n'.format(phrase)) 136 | return phrase 137 | else: 138 | phrase = traverse_tree_grab_phrase(subtree, label) 139 | if phrase != None: 140 | return phrase 141 | return phrase 142 | 143 | 144 | def get_phrases_from_tree(tree, exhaustive=False): 145 | labels = [u'VP', u'NP', u'PP'] 146 | phrases = dict.fromkeys(labels) 147 | for k in phrases.keys(): 148 | phrases[k] = [] 149 | if exhaustive: 150 | phrases = traverse_tree_grab_phrases(tree, phrases) 151 | else: 152 | for label in phrases.keys(): 153 | # print '\n\n\n\nlooking for {}'.format(label) 154 | 155 | phrase = traverse_tree_grab_phrase(tree, label) 156 | 157 | if phrase is not None: 158 | phrases[label].append(phrase) 159 | return phrases 160 | 161 | 162 | def parse_sentence(tokenized_sentence, grammar): 163 | """ Parses a tokenized sentence and returns a tree 164 | """ 165 | # parser = nltk.parse.ViterbiParser(grammar) 166 | parser = PCFGViterbiParser(grammar, trace=0) 167 | tree = parser.parse(tokenized_sentence) 168 | return tree 169 | 170 | 171 | def json_phrases(phrases, filename): 172 | with open(filename, 'w') as f: 173 | j = json.dumps(phrases, indent=1) 174 | f.write(j) 175 | return 176 | 177 | 178 | def pickle_grammar(grammar, fn): 179 | """ Write grammar to file (serialized, marshalled) 180 | """ 181 | with open(fn, 'w') as f: 182 | #cPickle.dump(grammar, f, protocol=cPickle.HIGHEST_PROTOCOL) 183 | cPickle.dump(grammar, f, protocol=0) 184 | 185 | 186 | def unpickle_grammar(fn): 187 | """ Read grammar from a file and return it""" 188 | with open(fn, 'rU') as f: 189 | grammar = cPickle.load(f) 190 | return grammar 191 | 192 | 193 | def get_grammar(fn='grammar.pickle'): 194 | 195 | try: 196 | grammar = unpickle_grammar(fn) 197 | print 'Loaded grammar' 198 | return grammar 199 | except IOError: 200 | print 'No grammar file, gotta train' 201 | grammar = train_pcfg() 202 | pickle_grammar(grammar, fn) 203 | return grammar 204 | 205 | 206 | def create_info_phrase_add_to_list(phrases, status, dict_list): 207 | keys = ['phrase', 'phrase_type', 'tweet', 208 | 'coordinates', 'time', 'screen_name'] 209 | 210 | for pos in phrases: 211 | for phrase in phrases[pos]: 212 | print u'phrase: {}'.format(phrase) 213 | d = dict.fromkeys(keys) 214 | d['phrase_type'] = pos 215 | d['phrase'] = phrase 216 | d['tweet'] = status.text 217 | d['screen_name'] = status.user.screen_name 218 | d['time'] = str(status.created_at) 219 | if status.geo: 220 | d['coordinates'] = status.geo['coordinates'] 221 | dict_list.append(d) 222 | del d 223 | 224 | return 225 | 226 | 227 | def parse_tweets(search_results): 228 | grammar = get_grammar('grammar_20ids_HM0VM0.pickle') 229 | list_of_info_dicts = [] 230 | for sr in search_results: 231 | print u'tweet text: {}'.format(sr.text) 232 | 
# nltk.tree.Tree.draw(tree) 233 | sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle') 234 | sentences = sentence_detector.tokenize(sr.text) 235 | tokenizer = nltk.tokenize.casual.TweetTokenizer() 236 | for sent in sentences: 237 | if not sent: 238 | logging.debug('sent is None') 239 | continue 240 | tokens = tokenizer.tokenize(sent) 241 | logging.debug(tokens) 242 | p = re.compile(r'https.*') 243 | # tokens = [word for word in tokens if not word == u'\xe2'] 244 | tokens = [word for word in tokens if not p.match(word)] 245 | logging.debug(tokens) 246 | if not tokens: 247 | continue 248 | tree = parse_sentence(tokens, grammar) 249 | if not tree: 250 | logging.debug('tree was None') 251 | continue 252 | print tree 253 | phrases = get_phrases_from_tree(tree, exhaustive=True) 254 | print 'printing phrases dictionary for this tweet' 255 | for k, v in phrases.items(): 256 | print u'{} : {}'.format(k, v) 257 | 258 | create_info_phrase_add_to_list(phrases, sr, list_of_info_dicts) 259 | 260 | i = 1 261 | for d in list_of_info_dicts: 262 | print '\n\n\n printing dictionary {}'.format(i) 263 | for k, v in d.items(): 264 | print u'{} : {}'.format(k, v) 265 | i += 1 266 | 267 | json_phrases(list_of_info_dicts, 'phrases.json') 268 | return list_of_info_dicts 269 | 270 | 271 | def main(): 272 | # set to DEBUG, INFO, WARNING, ERROR, CRITICAL : 273 | logging.basicConfig( 274 | format='%(levelname)s: %(message)s', level=logging.INFO) 275 | import geosearchclass 276 | g = geosearchclass.GeoSearchClass() 277 | print "Using search values from params.txt" 278 | g.set_params_from_file('params.txt') 279 | search_results = g.search() 280 | parse_tweets(search_results) 281 | # grammar = get_grammar() 282 | # #sentences = treebank.sentences()[34:35] 283 | # sentences = [nltk.word_tokenize('Numerous passing references to the phrase have occurred in movies')] 284 | # #print sentences 285 | 286 | # sentence_trees = parse_sentences(sentences, grammar) 287 | # phrases = get_phrases(sentence_trees) 288 | # print 'Now printing the phrases: ' 289 | # for k,v in phrases.items(): 290 | # print '{} : {}'.format(k,v) 291 | # json_phrases(phrases, 'phrases.json') 292 | 293 | 294 | if __name__ == '__main__': 295 | main() 296 | --------------------------------------------------------------------------------
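A minimal end-to-end sketch of the phrase-extraction pipeline in write.py,
using only names that appear in this repository (it assumes params.txt and a
consumerkeyandsecret file are present; the first run will train and pickle a
grammar before any parsing happens):

    import geosearchclass
    import write

    g = geosearchclass.GeoSearchClass()
    g.set_params_from_file('params.txt')
    search_results = g.search()
    # parse_tweets writes phrases.json and returns the list of phrase dicts
    phrase_dicts = write.parse_tweets(search_results)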