├── bash-201 ├── README.md ├── grep_rules.txt ├── prac ├── twitter.agg.piped ├── rules_to_file_name.py └── data_structures.bash ├── k-means └── README.md ├── count-min ├── README.md ├── count_min_1.png ├── count_min_2.png ├── CM_small.json └── CountMinSketch.ipynb ├── pandas-101 ├── README.md └── data │ └── twitter_sample.csv ├── max-likelihood └── README.md ├── topic-modeling-101 ├── README.md ├── image.png ├── table.png ├── vector_corpus.mm.index ├── vector_corpus.mm ├── text_corpus.txt └── topic_modeling_part1.ipynb ├── logistic-regression └── README.md ├── python-oop ├── life │ ├── __init__.py │ ├── beast.py │ └── human.py ├── simple_script.py ├── data.csv ├── simple_module.py └── README.md ├── regex-101 ├── panda.jpg ├── small.log └── README.md ├── sklearn-101 ├── iris_knn.png └── README.md ├── classical-stats-and-social-data-101 ├── p_val.jpeg ├── README.md ├── num_boulder_50k.txt └── classical-stats-and-social-data-101.ipynb ├── .gitignore ├── python-unittest ├── test_foo.py ├── mathy.py ├── test_mathy.py └── README.md ├── LICENSE └── README.md /bash-201/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /k-means/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /count-min/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas-101/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /max-likelihood/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /topic-modeling-101/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logistic-regression/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bash-201/grep_rules.txt: -------------------------------------------------------------------------------- 1 | grep_stmt='grep -i -E "cat|dog"' 2 | grep_stmt='grep -i -E "cat|bull dog"' 3 | -------------------------------------------------------------------------------- /bash-201/prac: -------------------------------------------------------------------------------- 1 | cat 2 | bull dog 3 | cAt and buLL dog 4 | bull 5 | dog 6 | cat's 7 | cat's bull DoG 8 | -------------------------------------------------------------------------------- /python-oop/life/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "beast" 3 | , "human" 4 | ] 5 | -------------------------------------------------------------------------------- /regex-101/panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/regex-101/panda.jpg -------------------------------------------------------------------------------- /count-min/count_min_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/count-min/count_min_1.png -------------------------------------------------------------------------------- /count-min/count_min_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/count-min/count_min_2.png -------------------------------------------------------------------------------- /sklearn-101/iris_knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/sklearn-101/iris_knn.png -------------------------------------------------------------------------------- /topic-modeling-101/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/topic-modeling-101/image.png -------------------------------------------------------------------------------- /topic-modeling-101/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/topic-modeling-101/table.png -------------------------------------------------------------------------------- /topic-modeling-101/vector_corpus.mm.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/topic-modeling-101/vector_corpus.mm.index -------------------------------------------------------------------------------- /classical-stats-and-social-data-101/p_val.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/Data-Science-45min-Intros/master/classical-stats-and-social-data-101/p_val.jpeg -------------------------------------------------------------------------------- /classical-stats-and-social-data-101/README.md: -------------------------------------------------------------------------------- 1 | This tutorial looks at the application of the Binomial and Poisson distributions to variables in social media data. 2 | The $p$-value is used to explore the topic of hypothesis testing.
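For a flavor of the kind of calculation involved, here is a minimal sketch (not taken from the notebook; it assumes ``numpy`` and ``scipy`` are available, and the threshold of 7 is purely illustrative):

    import numpy as np
    from scipy import stats

    # observed counts, one integer per line (see num_boulder_50k.txt)
    counts = np.loadtxt("num_boulder_50k.txt")
    lam = counts.mean()              # maximum-likelihood estimate of the Poisson rate
    # one-sided p-value: probability of a count of 7 or more under a Poisson(lam) null
    p_val = stats.poisson.sf(6, lam)
    print(p_val)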
3 | -------------------------------------------------------------------------------- /topic-modeling-101/vector_corpus.mm: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | 9 12 28 3 | 1 6 1 4 | 1 9 1 5 | 1 11 1 6 | 2 3 1 7 | 2 6 1 8 | 2 7 1 9 | 2 8 1 10 | 2 10 1 11 | 2 12 1 12 | 3 3 1 13 | 3 5 1 14 | 3 8 1 15 | 3 11 1 16 | 4 3 2 17 | 4 5 1 18 | 4 9 1 19 | 5 8 1 20 | 5 10 1 21 | 5 12 1 22 | 6 4 1 23 | 7 2 1 24 | 7 4 1 25 | 8 1 1 26 | 8 2 1 27 | 8 4 1 28 | 9 1 1 29 | 9 2 1 30 | 9 7 1 31 | -------------------------------------------------------------------------------- /topic-modeling-101/text_corpus.txt: -------------------------------------------------------------------------------- 1 | Human machine interface for lab abc computer applications 2 | A survey of user opinion of computer system response time 3 | The EPS user interface management system 4 | System and human system engineering testing of EPS 5 | Relation of user perceived response time to error measurement 6 | The generation of random binary unordered trees 7 | The intersection graph of paths in trees 8 | Graph minors IV Widths of trees and well quasi ordering 9 | Graph minors A survey 10 | -------------------------------------------------------------------------------- /python-oop/simple_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | # import the sys module from the standard library (ie no need to pip install) 7 | import sys 8 | 9 | # data comes in via stdin, results are sent to stdout 10 | for cnt, line in enumerate(sys.stdin): 11 | body = line.split('|')[2] 12 | sys.stdout.write("line number: {}, tweet body: {}\n".format(cnt, body)) 13 | # ^ this is ~equivalent to "print(stuff)" 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.swp 3 | sklearn-101/lin-reg.pkl 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | 23 | # Installer logs 24 | pip-log.txt 25 | 26 | # Unit test / coverage reports 27 | .coverage 28 | .tox 29 | nosetests.xml 30 | 31 | # Translations 32 | *.mo 33 | 34 | # Mr Developer 35 | .mr.developer.cfg 36 | .project 37 | .pydevproject 38 | 39 | # silly osx 40 | .DS_Store 41 | -------------------------------------------------------------------------------- /python-unittest/test_foo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | import unittest 7 | 8 | class Test_Foo(unittest.TestCase): 9 | """ 10 | Test module to go along with test discovery example 11 | """ 12 | 13 | def setUp(self): 14 | pass 15 | 16 | def tearDown(self): 17 | pass 18 | 19 | def test_baz(self): 20 | self.assertEqual(0,0) 21 | 22 | def test_bar(self): 23 | self.assertIsInstance("josh", str) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | 29 | -------------------------------------------------------------------------------- /bash-201/twitter.agg.piped: 
-------------------------------------------------------------------------------- 1 | tag:search.twitter.com,2005:1111111111111111111|2014-01-24T23:53:25.000Z|dog Sun is out, but it's still cold. Some ol' boooooshit|None|None|None|['en']|en|en|None|None|None|None|None|None|None|SOMEONE @_DUDER|None|None|None|None|None|None|None|None|JIM|COOL_JIM|1111111111|None|256|165|0|6249|Tweet|None|None|None 2 | tag:search.twitter.com,2005:1111111111111111112|2014-01-24T23:56:47.000Z|RT @SOMEDUDE: Looks like tomorrow's plunge weather will be flurtastic, a warm 27 with a chance of snow! Don't forget you earmuffs when you bull dog|None|None|None|['en']|en|en|None|None|None|None|None|None|-18000|None|None|None|None|None|None|None|None|None|THE DUDE|DUDE_theDUDE|11111122|None|492|259|0|7801|Retweet|None|tag:search.twitter.com,2005:1111111111111111113|2014-01-24T23:10:46.000Z 3 | -------------------------------------------------------------------------------- /python-unittest/mathy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | 7 | class Calcs(object): 8 | """ 9 | Objects of type Calcs have two methods for numerical calculations, and 10 | an attribute of 'zero' which is set to 0 11 | """ 12 | 13 | def __init__(self): 14 | self.zero = 0 15 | 16 | def add_one(self, x): 17 | """Return the argument incremented by 1""" 18 | try: 19 | return x + 1 20 | except TypeError: # be gentle 21 | return "And what, exactly would it mean to add one to {}?".format(x) 22 | 23 | def square(self, x): 24 | """Return the square of the argument""" 25 | return x**2 # no exception handling 26 | 27 | -------------------------------------------------------------------------------- /regex-101/small.log: -------------------------------------------------------------------------------- 1 | [2014-05-19 20:59:21,303][storm.petrel][DEBUG]Got chunk 2 | [2014-05-19 20:59:21,499][storm.petrel][DEBUG]Got chunk 3 | 1111111111111111111111111111111111 4 | 1111111111111111511111111111111111 5 | 1111111111111111551111111111111111 6 | 1111111111111111555111111111111111 7 | ggplot accepts colour as acceptable spelling 8 | thankfully, it also accepts color 9 | 4346238410471012988888834874629873446918327 10 | [2014-05-19 20:59:21,995][storm.petrel][DEBUG]Got chunk 11 | [2014-05-19 20:59:21,996][storm.petrel][DEBUG]Roll: duration (1>=1) 12 | foobarbaZ 13 | [2014-05-19 20:59:21,996][storm.petrel][DEBUG]Processes triggered 14 | [2014-05-19 20:59:22,001][storm.petrel][DEBUG]recsize=1746308, twitter-decahose, ts=1400533161, dur=1 15 | [2014-05-19 20:59:22,006][storm.petrel][DEBUG]Roll: duration (1>=1) 16 | -------------------------------------------------------------------------------- /classical-stats-and-social-data-101/num_boulder_50k.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 2 4 | 1 5 | 4 6 | 1 7 | 1 8 | 4 9 | 2 10 | 1 11 | 2 12 | 0 13 | 0 14 | 1 15 | 4 16 | 0 17 | 3 18 | 3 19 | 7 20 | 3 21 | 5 22 | 4 23 | 2 24 | 3 25 | 1 26 | 1 27 | 3 28 | 1 29 | 2 30 | 0 31 | 7 32 | 2 33 | 2 34 | 4 35 | 2 36 | 3 37 | 0 38 | 3 39 | 1 40 | 0 41 | 0 42 | 0 43 | 0 44 | 4 45 | 1 46 | 0 47 | 3 48 | 0 49 | 4 50 | 1 51 | 1 52 | 3 53 | 3 54 | 0 55 | 7 56 | 3 57 | 1 58 | 1 59 | 5 60 | 1 61 | 2 62 | 2 63 | 2 64 | 3 65 | 2 66 | 2 67 | 2 68 | 0 69 | 4 70 | 1 71 | 3 72 | 2 73 | 2 74 | 0 75 | 2 76 | 2 77 | 4 78 | 0 79 | 1 80 | 0 81 | 0 82 | 3 83 | 0 84 | 1 85 | 2 86 | 
1 87 | 1 88 | 2 89 | 0 90 | 3 91 | 0 92 | 2 93 | 1 94 | 2 95 | 1 96 | 2 97 | 1 98 | 0 99 | 2 100 | 1 101 | 1 102 | 3 103 | 1 104 | 0 105 | 1 106 | 1 107 | 2 108 | 1 109 | 0 110 | 0 111 | 2 112 | 2 113 | 1 114 | 1 115 | 1 116 | 1 117 | 2 118 | 0 119 | 0 120 | -------------------------------------------------------------------------------- /bash-201/rules_to_file_name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import codecs 5 | # globals 6 | 7 | # --sans regex w/ order 8 | str_replacements=[ #(search,replacement) 9 | ('grep_stmt=',''), 10 | ('| grep','_AND_grep'), 11 | ('|','_OR_'), 12 | ('\\b','_BG_'), 13 | ("'",''), 14 | (',',''), 15 | ('?','_QM_'), 16 | (' ','_'), 17 | ('/','_'), 18 | ('<','_LT_'), 19 | ('%','_PT_'), 20 | ('@','_AT_'), 21 | ('#','_SH_'), 22 | ('["]','_LQT_'), 23 | ('"','_QT_'), 24 | ('&','_AMP_'), 25 | ('$','_DOL_'), 26 | ("\\","") 27 | ] 28 | 29 | # --regex 30 | re_dict={ #'search':'replacement' 31 | '[__]+':'_' 32 | } 33 | 34 | # unicode 35 | reload(sys) 36 | sys.stdout = codecs.getwriter('utf-8')(sys.stdout) 37 | 38 | # replacements set 39 | for file_name in sys.stdin: 40 | for search,replacement in str_replacements: 41 | file_name=file_name.replace(search,replacement) 42 | for key in re_dict: 43 | for item in re.findall(key,file_name): 44 | file_name=file_name.replace(item,re_dict[key]) 45 | 46 | # out 47 | print file_name 48 | 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /python-oop/life/beast.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | 7 | class Animal(object): 8 | """Create a generic Animal object.""" 9 | def __init__(self): # generic Animal takes no arguments 10 | self.hungry = True # born hungry 11 | self.name = None 12 | self.speak = "" 13 | 14 | def eat(self): 15 | """Set hungry attribute to False""" 16 | self.hungry = False 17 | 18 | def run(self): 19 | """Set hungry attibute to True.""" 20 | self.hungry = True 21 | 22 | def talk(self): 23 | """Return the 'vocal' representation of this object.""" 24 | hunger = "hungry" if self.hungry else "not hungry" 25 | return "{} I'm {} and {}".format(self.speak, self.name, hunger) 26 | 27 | class Dog(Animal): 28 | """Create a Dog object, inherits from Animal object""" 29 | def __init__(self, name): # at the risk of anthropomorphizing, a dog must have a name 30 | super(Dog, self).__init__() 31 | #nb: in this case, the use of "super" is ~ as saying "Animal.__init__()" 32 | # but for multiple inheritance, becomes more obviously valuable 33 | self.name = name 34 | self.speak = "woof!" # dogs should woof by default 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science 45-min Intros 2 | 3 | ~45-minute lessons and tutorials on topics that help our data team ([@Gnip](http://gnip.com)) learn new things and become more efficient in our day-to-day work: basic programming, language-specific packages, statistics, machine learning topics, whatever helps! 4 | 5 | For each session, someone puts together the lesson/walk-through and leads the discussion. Presentation platform can include IPython notebooks, interactive code sessions, or anything else. *The more hands-on, the better.* 6 | 7 | Feel free to use these for your own (or your team's) growth, and do submit pull requests if you have something to add! 
8 | 9 | ## Current topics 10 | 11 | 12 | ### Bash 13 | 14 | - [Data structures](bash-201) 15 | 16 | - [Regular Expressions](regex-101) 17 | 18 | 19 | ### Python 20 | 21 | - [Object oriented programming concepts + modules/packaging](python-oop) 22 | 23 | - [Unit testing with ``unittest``](python-unittest) 24 | 25 | - [Introduction to ``pandas``](pandas-101) 26 | 27 | 28 | ### Statistics 29 | 30 | - [Maximum Likelihood Estimation](max-likelihood) 31 | 32 | - [Count-Min algorithm](count-min) 33 | 34 | 35 | ### Machine Learning 36 | 37 | - [Intro to ``scikit-learn``](sklearn-101)) 38 | 39 | - [Intro to topic modeling](topic-modeling-101) 40 | 41 | - [Introduction to K-means clustering](k-means) 42 | 43 | - [Logistic Regression](logistic-regression) 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /count-min/CM_small.json: -------------------------------------------------------------------------------- 1 | {"id":"tag:search.twitter.com,2005:1","postedTime":"2013-01-01T00:00:00.000Z","actor":{"preferredUsername":"user1","followersCount":11}} 2 | {"id":"tag:search.twitter.com,2005:2","postedTime":"2013-01-01T00:10:00.000Z","actor":{"preferredUsername":"user2","followersCount":22},"inReplyTo":{"link":"http://450"}} 3 | {"id":"tag:search.twitter.com,2005:3","postedTime":"2013-01-01T00:20:00.000Z","actor":{"preferredUsername":"HPSupport","followersCount":33}} 4 | {"id":"tag:search.twitter.com,2005:4","postedTime":"2013-01-01T00:30:00.000Z","actor":{"preferredUsername":"user1","followersCount":11},"inReplyTo":{"link":"http://3"}} 5 | {"id":"tag:search.twitter.com,2005:5","postedTime":"2013-01-01T00:40:00.000Z","actor":{"preferredUsername":"HPSupport","followersCount":33},"inReplyTo":{"link":"http://4"}} 6 | {"id":"tag:search.twitter.com,2005:6","postedTime":"2013-01-01T00:50:00.000Z","actor":{"preferredUsername":"user4","followersCount":44},"inReplyTo":{"link":"http://460"}} 7 | {"id":"tag:search.twitter.com,2005:7","postedTime":"2013-01-01T01:00:00.000Z","actor":{"preferredUsername":"user1","followersCount":11},"inReplyTo":{"link":"http://5"}} 8 | {"id":"tag:search.twitter.com,2005:8","postedTime":"2013-01-01T01:10:00.000Z","actor":{"preferredUsername":"user2","followersCount":22}} 9 | {"id":"tag:search.twitter.com,2005:9","postedTime":"2013-01-01T01:20:00.000Z","actor":{"preferredUsername":"HPSupport","followersCount":33},"inReplyTo":{"link":"http://8"}} 10 | {"id":"tag:search.twitter.com,2005:10","postedTime":"2013-01-01T02:30:00.000Z","actor":{"preferredUsername":"user2","followersCount":22},"inReplyTo":{"link":"http://9"}} 11 | -------------------------------------------------------------------------------- /sklearn-101/README.md: -------------------------------------------------------------------------------- 1 | # Intro to ``scikit-learn`` 2 | 3 | 2014-04-04, Josh Montague 4 | 5 | 6 | A short and basic introduction to the ``sklearn`` API interface and a couple of very simple examples of using an estimator on some built-in sample data (k-nearest neighbors and linear regression). 7 | 8 | This session was built using: 9 | 10 | - Python 2.7 11 | - IPython 1.2 12 | - matplotlib 1.3 13 | - numpy 1.8 14 | - sklearn 0.14 15 | 16 | ----- 17 | 18 | 19 | The capability of the full [``sklearn`` package](http://scikit-learn.org/stable/index.html) is pretty mind-blowing; this Notebook aims for the lowest hanging fruit, because the same framework is used for the advanced use-cases. This is certainly one of the strengths of ``sklearn``. 
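For a feel of that shared API, here is a minimal sketch in the same spirit as the notebook (it assumes only that ``scikit-learn`` is installed, and is not lifted from the notebook itself):

    from sklearn.datasets import load_iris
    from sklearn.neighbors import KNeighborsClassifier

    iris = load_iris()                          # built-in sample data
    knn = KNeighborsClassifier(n_neighbors=3)   # choose an estimator...
    knn.fit(iris.data, iris.target)             # ...every estimator exposes fit()
    print(knn.predict(iris.data[:5]))           # ...and predict()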
Note that these materials do not go into explaining *what* the various estimators are doing or how the algorithm works. For those discussions, definitely see the other materials in [this repository](https://github.com/DrSkippy27/Data-Science-45min-Intros) and the [official documentation](http://scikit-learn.org/stable/documentation.html). 20 | 21 | The majority of this material was collected by combining pieces of the official docs (which are possibly the pinnacle of package documentation) and assorted other online materials. Instead of replicating a bunch of awesome information here, I'll suggest you read the [Quick Start](http://scikit-learn.org/stable/tutorial/basic/tutorial.html) and as much of the [tutorial](http://scikit-learn.org/stable/tutorial/statistical_inference/index.html) as you like before getting started with this. 22 | 23 | If you want to explore the IPython Notebook without running Python on your own machine, you can also view it at [nbviewer](http://nbviewer.ipython.org/github/DrSkippy27/Data-Science-45min-Intros/blob/master/sklearn-101/sklearn-101.ipynb). 24 | 25 | Enjoy! 26 | 27 | -------------------------------------------------------------------------------- /python-oop/life/human.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | 7 | class Person(object): 8 | """Create a general Person object.""" 9 | def __init__(self): 10 | self.gender = None 11 | self.name = None 12 | self.eyes = None 13 | self.word = "uhnnnnn" 14 | 15 | def talk(self): 16 | """Use the Person's specific attributes to 'talk'.""" 17 | s = "{}, my name is {}. my gender is {}, and i have {} eyes.".format( 18 | self.word 19 | , self.name 20 | , self.gender 21 | , self.eyes 22 | ) 23 | return s 24 | 25 | class Woman(Person): 26 | """ 27 | Create a Woman object, derived from a Person object. Requires a name, and 28 | takes optional keyword arguments 'eyes' (color), and 'word'. 29 | """ 30 | def __init__(self, name, eyes="blue", word="yo"): 31 | super(Woman, self).__init__() # inherit attrs & methods from Person 32 | self.name = name # overwrite attrs for these objects 33 | self.eyes = eyes 34 | self.word = word 35 | self.gender = "female" # set this one attr 36 | 37 | def high_five(self): 38 | """Return a badass high-five.""" 39 | return "High-five!" 40 | 41 | 42 | class AmericanWoman(Woman): 43 | """Create an American Woman object, derived from a Woman object.""" 44 | def __init__(self, name, eyes="brown", word="holla", **kwargs): 45 | super(AmericanWoman, self).__init__(name, eyes, word) 46 | [ setattr(self, k, v) for k,v in kwargs.iteritems() ] 47 | 48 | def talk(self): 49 | """Override the talk method in the Person class""" 50 | s = "\nDon't come hangin' 'round my door\nI don't wanna see your shadow no more." 51 | print s 52 | 53 | def lenny_kravitz(self): 54 | """Make Lenny Kravitz proud.""" 55 | return "guitar solo! 
( http://www.youtube.com/watch?v=UzWHE32IxUc ) " 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /python-oop/data.csv: -------------------------------------------------------------------------------- 1 | tag:search.twitter.com,2005:351835317671690241|2013-07-01T22:50:51.000Z|kavga edelim ama konuşalım|None|None|None|['tr']|en|tr|None|None|['[25.663883, 35.817497]', '[25.663883, 42.109993]', '[44.822762, 42.109993]', '[44.822762, 35.817497]']|Polygon|Türkiye|TR|7200|None|None|None|None|None|None|None|None|None|Rümeysa Özdemir|uykugibisiyok|248312738|35|178|129|0|2028|Tweet|None|None|None 2 | tag:search.twitter.com,2005:351835317604593666|2013-07-01T22:50:51.000Z|@shane_joersz wooooow|None|None|None|['en']|en|es|[47.29088246, -101.0379045]|Point|['[-101.043785, 47.275933]', '[-101.043785, 47.306601]', '[-101.01285, 47.306601]', '[-101.01285, 47.275933]']|Polygon|Washburn, ND|US|-21600|sevenohone|None|None|None|None|None|None|None|None|cori▲alex.|CoBerg_|48025164|32|144|215|0|4071|Reply|http://twitter.com/shane_joersz/statuses/351828999086940160|None|None 3 | tag:search.twitter.com,2005:351835317747191808|2013-07-01T22:50:51.000Z|お前との肌のふれあいなんぞ求めてない。自重しろ。|None|None|None|['ja']|en|ja|[35.70675048, 139.84273005]|Point|['[139.8332175, 35.6345694444444]', '[139.8332175, 35.7507544444444]', '[139.919876666667, 35.7507544444444]', '[139.919876666667, 35.6345694444444]']|Polygon|江戸川区, 東京都|JP|-36000|ちば|None|None|None|None|None|None|None|None|黒い恋人|yamasyoyamasyo|217987801|18|37|54|0|3505|Tweet|None|None|None 4 | tag:search.twitter.com,2005:351835317608792064|2013-07-01T22:50:51.000Z|@Gabo_navoficial yo tambien creo en ti mi charro bello:))|None|None|None|['en']|en|es|None|None|['[-80.248663, 25.986366]', '[-80.248663, 26.093192]', '[-80.102066, 26.093192]', '[-80.102066, 25.986366]']|Polygon|Hollywood, FL|US|-14400|hollywood florida|None|None|None|None|None|None|None|None|MARIA|maria_e_pena|461188787|50|438|174|1|17636|Reply|http://twitter.com/Gabo_navoficial/statuses/351835075786186752|None|None 5 | tag:search.twitter.com,2005:351835317755592705|2013-07-01T22:50:51.000Z|только ты об этом не знаешь... http://t.co/MOH8pcKyJY|['http://twitter.com/ElkaAlb/status/351835317755592705/photo/1']|None|None|['ru']|en|ru|None|None|['[23.179216999999998, 51.2626423]', '[23.179216999999998, 56.1717339]', '[32.794200000000004, 56.1717339]', '[32.794200000000004, 51.2626423]']|Polygon|Belarus|BY|None|None|None|None|None|None|None|None|None|None|Элька Алб|ElkaAlb|1433828712|21|12|6|0|145|Tweet|None|None|None 6 | -------------------------------------------------------------------------------- /python-oop/simple_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | import sys 7 | 8 | 9 | ###################### 10 | # part 1 11 | # 12 | 13 | my_int = 4 14 | my_s = "hello world!" 15 | 16 | def square(number): 17 | """Return the squared value of 'number'.""" 18 | try: 19 | return number**2 20 | except TypeError as e: 21 | sys.stderr.write("Can't square something that's not a number! ({})".format(e)) 22 | 23 | 24 | ###################### 25 | # part 2 26 | # 27 | 28 | class Dog(object): 29 | """Create a general Dog object. 
Takes no arguments.""" 30 | def __init__(self): 31 | self.name = "rex" 32 | self.legs = 4 33 | self.owner = "jane" 34 | self.word = "woof" 35 | 36 | def talk(self): 37 | """Return a statement about the attributes of this Dog.""" 38 | s = "{}, my name is {}. i have {} legs and belong to {}.".format( 39 | self.word 40 | , self.name 41 | , self.legs 42 | , self.owner 43 | ) 44 | return s 45 | 46 | 47 | ###################### 48 | # part 3 49 | # 50 | 51 | class Cat(object): 52 | """ 53 | Create a general Cat object. Requires a name and optional count of legs 54 | and owner name. 55 | """ 56 | def __init__(self, name, legs=4, owner="john"): 57 | self.name = name 58 | self.legs = legs # note that we're not doing any type checking... 59 | self.owner = owner 60 | self.word = "meow" 61 | 62 | def talk(self): 63 | """Return a statement about the attributes of this Dog.""" 64 | s = "{}, my name is {}. i have {} legs and belong to {}.".format( 65 | self.word 66 | , self.name 67 | , self.legs 68 | , self.owner 69 | ) 70 | return s 71 | 72 | 73 | 74 | ################################################## 75 | if __name__ == '__main__': 76 | print # cheap carriage returns in stdout 77 | sys.stdout.write("Now creating a Cat named Sue, with 84 legs, belonging to Jeff.") 78 | sys.stdout.write(".\n"*10) 79 | c = Cat("Sue", legs=84, owner="jeff") 80 | sys.stdout.write("SPEAK, CAT!") 81 | print 82 | print 83 | sys.stdout.write('"' + c.talk() + '"') 84 | print 85 | print 86 | -------------------------------------------------------------------------------- /python-unittest/test_mathy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | import mathy 7 | import random 8 | import string 9 | import unittest 10 | 11 | 12 | class Test_mathy(unittest.TestCase): 13 | """ 14 | Test module for the practice mathy.py module 15 | """ 16 | 17 | def setUp(self): 18 | """ 19 | Create an instance of the Calcs class for use with all test methods 20 | """ 21 | # assign some instance vars so we can use them in subsequent tests 22 | # in lieu of global vars 23 | self.f = random.normalvariate(0, 1) # random float 24 | self.i = int( random.uniform(0, 10) ) # random int 25 | self.s = random.choice( [x for x in string.lowercase] ) # random string char 26 | # create a new object of type Calcs for testing 27 | self.calcs = mathy.Calcs() # use this object for the tests 28 | 29 | def tearDown(self): 30 | """ 31 | Clean up anything e.g. 
database connections that needs to be taken care of 32 | after each test 33 | """ 34 | pass 35 | 36 | def test_init(self): 37 | """ 38 | Test that the constructor is behaving as expected 39 | """ 40 | self.assertIsInstance(self.calcs, mathy.Calcs) 41 | self.assertIsInstance(self.calcs.zero, int) 42 | self.assertEqual(self.calcs.zero, 0) 43 | 44 | def test_add_one(self): 45 | """ 46 | Test that the add_one method is behaving as expected 47 | """ 48 | # floats and ints should be happy 49 | # check the value 50 | self.assertEqual(self.calcs.add_one(self.f), self.f + 1) 51 | self.assertEqual(self.calcs.add_one(self.i), self.i + 1) 52 | # check the type 53 | self.assertIsInstance(self.calcs.add_one(self.i), int) 54 | self.assertIsInstance(self.calcs.add_one(self.f), float) 55 | # check the exception 56 | self.assertIsInstance(self.calcs.add_one(self.s), str) 57 | 58 | def test_square(self): 59 | """ 60 | Test that the square method is behaving as expected 61 | """ 62 | # check the values 63 | self.assertEqual(self.calcs.square(self.f), self.f**2) 64 | self.assertEqual(self.calcs.square(self.i), self.i**2) 65 | # check the type 66 | self.assertIsInstance(self.calcs.square(self.f), float) 67 | self.assertIsInstance(self.calcs.square(self.i), int) 68 | # check the exception 69 | # 70 | # because of the way the expression is evaluated, 71 | # there are two ways to check the exception handling... 72 | # ==> lambda fnc 73 | self.assertRaises(TypeError, lambda: self.calcs.square(self.s)) 74 | # ==> contextmanager 75 | with self.assertRaises(TypeError): 76 | self.calcs.square(self.s) 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | 82 | -------------------------------------------------------------------------------- /bash-201/data_structures.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################# 3 | echo $(date) 4 | dt=$(date +%Y%m%d_%H%M) 5 | 6 | ################# 7 | # list 8 | ################# 9 | 10 | #--create: 11 | declare -a pub_list=(twitter tumblr disqus fsq wp-com wp-org) 12 | 13 | #--assign: 14 | list_items=$(echo ${pub_list[@]}) 15 | my_item=${pub_list[5]} 16 | 17 | #--reference: 18 | echo ${pub_list[@]} 19 | echo ${pub_list[5]} 20 | 21 | #--loop: 22 | for pub in "${pub_list[@]}"; do 23 | echo "---------------------------------" 24 | echo "$pub" 25 | done 26 | 27 | ################# 28 | # dictionary 29 | ################# 30 | 31 | #--create: 32 | declare -A twitter_handle 33 | 34 | #--assign: 35 | twitter_handle["jrmontague"]=jrmontag 36 | twitter_handle["jkobl"]=JeffAKolb 37 | twitter_handle["shendrickson"]=DrSkippy27 38 | twitter_handle["blehman"]=WordCrank 39 | 40 | #--reference: 41 | echo ${twitter_handle["jrmontague"]} 42 | echo ${!twitter_handle[@]} 43 | echo ${twitter_handle[@]} 44 | 45 | #--loop: 46 | for i in "${!twitter_handle[@]}"; do 47 | echo "key: $i" 48 | echo "value: ${twitter_handle[$i]}" 49 | done 50 | 51 | ################# 52 | # date object 53 | ################# 54 | 55 | #--create range: 56 | start_date="2014-01-09 22:00:00"; 57 | end_date="2014-01-10 22:00:00"; 58 | 59 | #--create date objects: 60 | current=$(date -d "${start_date:0:4}${start_date:5:2}${start_date:8:2} ${start_date:11:2}") 61 | end=$(date -d "${end_date:0:4}${end_date:5:2}${end_date:8:2} ${end_date:11:2}") 62 | 63 | #--loop: 64 | while [ "$end" != "$current" ];do 65 | path=$(date -d "$current" +%Y%m%d.%H) 66 | year="${path:0:4}" 67 | mnth="${path:4:2}" 68 | day="${path:6:2}" 69 | hour="${path:9:2}" 70 
| echo $path 71 | current=$(date -d "$current +1 hours") 72 | done 73 | 74 | ################# 75 | # strings (quote hell) 76 | ################# 77 | 78 | #--hard vs soft quote 79 | var='$USER' 80 | echo $var 81 | 82 | var="$USER" 83 | echo $var 84 | 85 | #--combine quotes 86 | var='$USER='"$USER" 87 | 88 | #--Example: 89 | #--create globals 90 | grep_cmd1='grep -i -E "cat|bull dog"' # cat 91 | grep_cmd2='grep -i -E "cat'"'"'s|bull dog"' # cat's 92 | cmd1="cat prac | ${grep_cmd1}" 93 | cmd2="cat prac | ${grep_cmd2}" 94 | 95 | #--eval 96 | eval $cmd1 97 | eval $cmd2 98 | 99 | #--back tic 100 | echo `eval $cmd1` # be careful with back tics 101 | 102 | #--back tic vs eval 103 | pwd 104 | eval pwd 105 | echo `pwd` 106 | `pwd` #notice error 107 | 108 | echo $USER 109 | eval "$USER" #notice error 110 | echo `$USER` #notice error 111 | `$USER` #notice error 112 | $USER #notice error 113 | 114 | var=`echo $USER` 115 | echo $var 116 | 117 | var=$USER 118 | echo $var 119 | 120 | #-- quiz #1 121 | tmp=twitter.agg.piped 122 | if [ -f grep_rules.txt ]; then 123 | while read line; do 124 | echo "file: $tmp for rule: $line" 125 | eval "$line" 126 | 127 | # -- fix line below -- 128 | rname="${grep_stmt} | rules_to_file_name.py" 129 | # -- fix line above -- 130 | 131 | cmd="cat $tmp | $grep_stmt > twitter.agg.piped.${rname}.filter.piped &" 132 | eval "$cmd" 133 | done < grep_rules.txt 134 | else 135 | echo " No grep_rules.txt found." 136 | fi 137 | 138 | #-- quiz #2 139 | # The following string resulted in a rule with value: "from:$USER" ; instead of value: "from:compston" 140 | curl -v -X POST -ustephen@gnip.com "https://api.gnip.com/replay/rules.json" -d '{"rules":[{"value":"from:$USER"}]}' 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | #--------------------------------------------------------------- 156 | #--------------------------------------------------------------- 157 | 158 | #-- quiz #1 solution 159 | rname=$(echo "${grep_stmt}" |./rules_to_file_name.py) 160 | 161 | #-- quiz #2 solution 162 | curl -v -X POST -ustephen@gnip.com "https://api.gnip.comreplay/rules.json" -d '{"rules":[{"value":"from:'"$USER"'"}]}' 163 | 164 | #--------------------------------------------------------------- 165 | #--------------------------------------------------------------- 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /python-unittest/README.md: -------------------------------------------------------------------------------- 1 | # Unit Testing In Python 2 | 3 | Generally, unit testing is a style of testing that ensures the correct mapping of inputs to outputs at the most atomic level of the code. The ``unittest`` module ([docs](http://docs.python.org/2/library/unittest.html)) provides a bunch of helpful methods for checking that the output of your code is as as expected, in addition to convenient test discovery, automated "setup" and "tear down", and bunch of other functionality that I've never used! Python's ``unittest`` framwork is modeled after ``jUnit`` (Java), which is an implementation of the general ``xUnit`` framework for unit testing. 4 | 5 | It's often easier to express the desired functionality of the code through tests of what it should and shouldn't do, than to actually get it to execute properly. So, ideally, your collection of unit testing code should grow faster than your application code; making sure you've tested and confirmed that all the edge cases behave correctly. 
Further, through diligent maintainance of your test code, you can ensure that any new features don't break the fundamental blocks you've built (or, if they do, you're at least informed right away). 6 | 7 | 8 | ## Concepts 9 | 10 | - **test fixture**: preparation for one or more tests, e.g. temporary databases, directories, servers 11 | 12 | - **test case**: smallest unit of testing; specific response to a specific set of inputs 13 | 14 | - **test suite**: collection of test cases, suites, or both. can be used to aggregate tests that are logically related 15 | 16 | - **test runner**: organizes the execution of tests (ie through GUI or CLI) 17 | 18 | 19 | Instances of the ``TestCase`` class are the smallest testable units. Project- and code-specific test classes can (should) inherit from ``TestCase``. This session won't dive into the full functionality of ``unittest``, but rather see it in action in a simple case, write some tests, then see it in action in a separate codebase. 20 | 21 | ---- 22 | 23 | ## Code Example 24 | 25 | Defined here is a module called ``mathy.py`` (nb: ``math`` is a real thing, so we want to avoid clobbering it in the namespace). It has some very simple functionality, demonstrated below: 26 | 27 | $ ipython 28 | In [1]: import mathy as m 29 | 30 | In [2]: c = m.Calcs() # create a calculations object 31 | 32 | In [3]: c.zero # return an instance attribute 33 | Out[3]: 0 34 | 35 | In [4]: c.square(4) # use the instance methods 36 | Out[4]: 16 37 | 38 | In [5]: c.add_one(84.2) 39 | Out[5]: 85.2 40 | 41 | Open up the code and have a look. There are a couple of methods and an example of a simple error handling case (try block) in one method to avoid having the code crash in a fiery death. In a production environment, you'd probably be sad if your entire application failed because some smarty-pants passed a string to your numerical calculation. The other method is left without exception handling for an example of testing for errors. 42 | 43 | 44 | ## Test Example 45 | 46 | Now that we have some code, we can build a test suite to make sure that it still behaves the way we want with every new feature or modification. Ideally, you develop the test code in parallel with your actual application; it may often be easier to express how you'd *like* the code to run than to actually get it to run that way. 47 | 48 | First, just run the ``test_mathy.py`` module from the command line so you can see what the output looks like. Ideally, it will say that all the tests have passed and also tell you how many it ran. Now have a look at the code in the test module. 49 | 50 | The first thing we have to do is import the module we're testing, in this case the ``mathy`` module. Then you can import any other modules that will help / be needed. Additionally, you have to import the actual ``unittest`` module. Next, we define a test class that will test all the moving parts of the particular module in which we're interested. There could be many classes defined in ``mathy``, and but we can (and will) test them all in the ``test_mathy`` test suite. The test class needs to inherit from the ``TestCase`` module and this allows us to use all of the ``assert*`` methods seen in the rest of the code. There are all sorts of assertion tests - for types, values, etc; for a list of all the possible methods, check [the documentation](http://docs.python.org/2/library/unittest.html). 
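To make that concrete, the skeleton of a test class looks roughly like the following (a stripped-down sketch of what you'll find in ``test_mathy.py``):

    import unittest
    import mathy                # the module under test

    class Test_mathy(unittest.TestCase):

        def setUp(self):
            self.calcs = mathy.Calcs()                        # fresh object before every test

        def test_square(self):
            self.assertEqual(self.calcs.square(2), 4)         # check the value
            self.assertIsInstance(self.calcs.square(2), int)  # check the type

    if __name__ == '__main__':
        unittest.main()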
51 | 52 | 53 | Each test method defined in the class will run independently, and the ``setUp`` and ``tearDown`` methods will run, respectively before and after each of the test methods. This ensures the tests have consistency (or randomness if you design it that way e.g. with the ``random`` module). In this ``setUp`` method, we're creating a handful of test arguments. Each ``test_*`` method is independent of the others, so as soon as one of the tests (``self.assert...``) fails, that method stops evaluating and the test runner moves to another test. *Importantly*, the tests do not necessarily execute in the order they are listed. This is part of the value of (and need for) the ``setUp`` and ``tearDown`` methods. 54 | 55 | Finally, if you have a full package (e.g. our RST on [OO and packaging **fix link**](deleteme)) which includes many modules and many test suites, you can run them all by using the built-in command line test discovery. From the top-level directory in your project, run: 56 | 57 | ``$ python -m unittest discover`` 58 | 59 | There are options associated with ``discover`` that allow you to specify the naming convention of your test code, but the default is to execute any and all modules that look like ``test*.py``. If you run the discover statement above, you'll now see that more tests have been run, because ``discover`` has also run the two (trivial) tests found in ``test_foo.py`` 60 | 61 | 62 | ## Your turn 63 | 64 | Add some new functionality to the ``mathy`` class: an attribute, a new method, whatever you like, but only add one thing. Then start writing the test code for the feature you've added; you should think hard about *all* the various ways you can test that the rest of the code is interacting as you expected. Boolean logic, checking for type, None-ness, equality... The real wins from using ``unittest`` come from knowing that you can always re-run your test suite after each edit to the code to ensure all the smallest pieces are still working as expected. Your test code should grow much faster than your running code. And if you find yourself adding a feature for which there isn't a good test, consider whether you can reframe the purpose and implementation of the code into a deterministic, test-able form. 65 | 66 | ## Demo of ``unittest`` in larger application 67 | 68 | .... 69 | 70 | 71 | -------------------------------------------------------------------------------- /regex-101/README.md: -------------------------------------------------------------------------------- 1 | # Regex 101: Building Blocks 2 | 3 | Josh Montague, 2014-05-23 4 | 5 | ### Build notes 6 | 7 | These examples were designed using ``grep`` 2.10 and ``sed`` 4.2.1 on Ubuntu 12.04. If using something else (especially OS X), ymmv. This walk-through assumes approximately zero previous knowledge of regular expressions or command line tools like ``grep``, or ``sed``. We'll use them to learn stuff, but you can read the ``man`` pages for more intimate details on how they work. 8 | 9 | ## Background 10 | 11 | There are many places to use regular expressions ("regex"). Most environments and programs (Python, vim, \*sh commands like ``grep``, ...) include the concept, but may introduce subtlties in the handling of expressions (paricularly escaping characters). For consistency, we're going to use ``grep`` for most of our regex pattern matching. This approach is pretty readable which is a win: ``$ grep "[PATTERN]" [FILE]``. 
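(As a quick aside, to underline that the same building blocks carry over to other environments, the equivalent check in Python's standard-library ``re`` module looks like the sketch below; the rest of this walk-through sticks with ``grep``.)

    import re
    line = "[2014-05-19 20:59:21,303][storm.petrel][DEBUG]Got chunk"   # a line from small.log
    print(re.findall(r"[0-9]{2}:[0-9]{2}:[0-9]{2}", line))             # ['20:59:21']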
12 | 13 | Following the [Zed Shaw](http://learncodethehardway.org/) philosophy of learning, you're advised to actually smash your fingers onto the appropriate keys to recreate the examples here. And though we'll cruise through this the first time, revisit it occasionally for ideal retention. 14 | 15 | Use this __Hack Button__ [![Hack DrSkippy/Data-Science-45min-Intros.git directly on Nitrous.IO](https://d3o0mnbgv6k92a.cloudfront.net/assets/hack-s-v1-7475db0cf93fe5d1e29420c928ebc614.png)](https://www.nitrous.io/hack_button?source=embed&runtime=nodejs&repo=DrSkippy%2FData-Science-45min-Intros.git&file_to_open=regex-101%2FREADME.md) to access a Linux instance with grep and sed already loaded and ready to go! 16 | 17 | 18 | ### Start with the bad news 19 | 20 | Though extremely powerful, regular expressions ("regex") can be ungodly awful to read. Regex is a mighty sword that can also be wielded for evil. Here's an example; these are the first few lines of the famous [RFC822 email validation regular expression](http://www.ex-parrot.com/pdw/Mail-RFC822-Address.html) (check the link to see the whole thing): 21 | 22 | (?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:(?:\r\n)?[ \t] 23 | )+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?: 24 | \r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+(?:(?:( 25 | ?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ 26 | \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\0 27 | 31]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\ 28 | ](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \000-\031]+ 29 | ... 30 | 31 | This is basically a crime against humanity in terms of the [Unix philosophy](http://en.wikipedia.org/wiki/Unix_philosophy). By continuously appending edge cases to this regex, it's now completely intractable. However, when used with care, tiny pieces of regex are incredibly powerful. Below is an introduction to some of the fundamental units of a regex. The beauty of the regex is that once you learn the building blocks, you can combine them in infinitely-extensible ways. Just don't build a single-expression, 100-line email address parser, please. 32 | 33 | 34 | ### Vocabulary 35 | 36 | *literal*: a character used to match in a search e.g. the ``a`` in ``bat``, or the ``og`` in ``dog`` can both be considered literal strings 37 | 38 | *metacharacters*: *anchors* that deal with line position, and *modifiers* that deal with counting and specifying ranges, e.g ``*``, ``[``, ``]``. 39 | 40 | *target string*: string to be searched by the pattern 41 | 42 | *escape sequence*: combination of escape metacharacter ``\`` and literal(s). Each metacharacter desired to be treated as a literal gets escaped 43 | 44 | 45 | ## Simple matching 46 | 47 | Some simple literal character matching examples. Have a look at the ``small.log`` file, and check that the matching makes sense. 48 | 49 | $ grep "DEBUG" small.log 50 | $ grep "20:59:22" small.log 51 | $ grep "]" small.log 52 | $ grep "[" small.log # note the difference; [ is a metacharacter! (more to come below) 53 | $ grep "1>" small.log 54 | 55 | 56 | ## Metacharacters 57 | 58 | ### Brackets, ranges, negation 59 | 60 | For these examples, really take a second to really analyze the resulting match, so the pattern matching concepts become as clear as possible. Some of them are subtle! 
61 | 62 | ``[ ]`` - (L and R square brackets) used together and denote a list of characters to match against each character position in the target (once and only once). 63 | 64 | $ grep "[01]" small.log 65 | 66 | ``-`` - (dash) within brackets, specifies a range of characters to match against the target (there are additional "[special ranges](http://www.zytrax.com/tech/web/regex.htm#special)" that may (or may not) be available on your system). The next three lines should all return the same matches: 67 | 68 | $ grep "[0123456789]" small.log 69 | $ grep "[0-9]" small.log # (I think this one is the most intelligible) 70 | $ grep "[[:digit:]]" small.log 71 | 72 | $ grep "[a-z]" small.log # note the differences b/w these three 73 | $ grep "[A-Z]" small.log 74 | $ grep "[a-Z]" small.log 75 | 76 | $ grep "[0-9]]" small.log # R square bracket is now *outside* the range character, so it's literal 77 | $ grep "[0-9\]]" small.log # same matches as ^, but *not* identical pattern -- can you see why? 78 | 79 | ``^`` - (caret / circumflex) *within* brackets, negates the expression 80 | 81 | $ grep "[^a-m]" small.log 82 | $ grep "[^a-m0-5]k" small.log # combined range (*for a single character match*) is specified 83 | # w/o spaces 84 | # --note: this range matches any *one* character that is not in 85 | # a-m *or* any digit not in 0-5. not one followed by the other. 86 | 87 | ### Anchors 88 | 89 | When you need to make your match pattern include physical positions within the line, we can use the following three characters: 90 | 91 | ``^`` - the caret makes an encore appearance, but this time it is *outside* of the square brackets introduced above for denoting a list of characters. In this context, it refers to the *beginning* of the line. 92 | 93 | $ grep "^\[" small.log # [ is a metacharacter 94 | $ grep "^[^[]" small.log # "opposite" of the previous pattern 95 | $ grep "^[^[:punct:]]" small.log 96 | 97 | ``$`` - the dollar sign means the *end* of the line 98 | 99 | $ grep "[0-9]$" small.log 100 | $ grep "[a-Z]$" small.log 101 | 102 | ``.`` - the period refers to *any* one character in a position 103 | 104 | $ grep "E.U" small.log 105 | $ grep "m.p" small.log 106 | $ grep "m\.p" small.log # what's the difference b/w this pattern & the previous one? 107 | 108 | See also: ``\<``, ``\>``, ``\b`` 109 | 110 | 111 | ### Iteration / repetition 112 | 113 | To control the *number of times* a character is matched, it should be followed by an iteration (or repetition) metacharacter. Note that the output coloring for matched patterns is off for this section. Pay attention to the lines returned by ``grep`` rather than the characters that it highlights in the output. 114 | 115 | ``*`` - the asterisk matches *0 or more* occurrences of the preceding character 116 | 117 | $ grep "0*" small.log 118 | $ grep "s.*D" small.log 119 | 120 | ``+`` - the plus sign matches *1 or more* occurrences of the preceding character 121 | 122 | NB: the ``+`` metacharacters (and, really, the whole set: ``?``, ``+``, ``{``, ``|``, ``(``, ``)`` ) lose their special meaning in *basic* regular expressions e.g. the default behavior of ``$ grep PATTERN FILE``. In order to use these as iteration metacharacters in the context of basic ``grep``, they must be escaped. Alternatively, ``grep`` can be called in *extended* regular expression mode with the ``-E`` option: ``$ grep -E PATTERN FILE`` (there was once an ``egrep`` command that was synonymous but is deprecated). 
123 | 124 | $ grep "0+" small.log # matches a literal 0 followed by a literal + 125 | $ grep "0\+" small.log # + becomes iteration metachar 126 | $ grep -E "0+" small.log # equivalent pattern to ^ 127 | 128 | ``?`` - the question mark matches *0 or 1* occurrences of the preceding character 129 | 130 | $ grep -E "15?1" small.log # (note about my confusion here) 131 | $ grep -E "colou?r" small.log # English spelling check 132 | 133 | ``{ }`` - the curly brackets specify a count or a range or occurences to match the preceding pattern. If used with a single number eg ``{n}``, pattern matches preceding character exactly ``n`` times. With a range eg ``{n,m}``, pattern matches at least ``n`` times, but not more than ``m`` times. With a single number + comma eg ``{n,}``, pattern matches preceding character at least ``n`` times 134 | 135 | $ grep "15\{2\}1" small.log 136 | $ grep -E "15{2}1" small.log 137 | $ grep -E "15{1,2}1" small.log 138 | $ grep -E "15{2,}1" small.log 139 | 140 | 141 | ### Grouping 142 | 143 | *Change-up!* 144 | 145 | So far, we've been using ``grep`` as a way to both match regex and, conveniently (most of the time), highlight the matches thanks to the default Ubuntu ``.bashrc`` settings. ``grep`` is fundamentally about matching pattern and returning the matching *lines*. What we're about to work through is a little more subtle and is better suited for demonstration with ``sed`` than ``grep``. Thankfully, we're working on a beautiful Linux system that has both of these ready to rock. If you want to, you can go ahead and read the ``man`` page for ``sed``, but for the sake of this little demonstration, all you need to know is the following syntax: 146 | 147 | $ sed 's/[REGEX PATTERN]/[REPLACEMENT PATTERN]/g' FILE 148 | 149 | ``sed``'s ``s/thing1/thing2/g`` pattern is often used to search for ``thing1`` and replace it with ``thing2`` (everywhere, in this case). So we'll continue to use small.log as our ``FILE`` and we'll explore the results of our regex by adding the pattern between the first two slashes, and then selecting groups (or subexpressions) into the replacement pattern (between the second and third slashes). A quick example to make the use of ``sed`` super clear: 150 | 151 | $ sed 's/[0-9]/Q/g' small.log 152 | 153 | One unfortunate gotcha: ``sed`` - like just about every environment where you want to use regex - has it's own rules about escaping. The escape character is still ``\`` but you may need to use one where you didn't have to in the previous ``grep`` examples. The example below illustrates this. 154 | 155 | ``( )`` - the open and close parantheses ("parens") group parts of the expression into subexpressions (or submatches, or groups). Awesomely, these matches are actually captured into variables so you can reuse them. This is called back-referencing, and the variables are accessed via ``\N`` where ``N`` is the numbered order of the matching subexpression (1, 2, ...). 156 | 157 | To be slightly more explicit, I'll build up to this example. The mission is as follows: 158 | 159 | > You have a log file where the dates were stored in ``yyyy-mm-dd`` format. You needed them to be in ``dd-mm-yyyy`` format using only your shell command line tools. 
160 | 161 | Recall the grep approach to capturing a certain number of characters that match a range: 162 | 163 | [0-9]{4} # matches four digits in a row 164 | [0-9]{4}-[0-9]{2}-[0-9]{2} # matches a typical date format 165 | 166 | Now we want to group each set of numbers into a subexpression so we have access to the year, month, and day in separate variables. Remember to escape the parens: 167 | 168 | \([0-9]{4}\)-\([0-9]{2}\)-\([0-9]{2}\) 169 | 170 | When this matches a valid date, ``\1`` stores the year, ``\2`` the month, and ``\3`` the day the line above is *almost* our finished regex, which we can drop into the ``sed`` command. The only remaining sadness is that the curly brackets need to be escaped within a ``sed`` expression. Escaping makes for a sad panda 171 | 172 | \([0-9]\{4\}\)-\([0-9]\{2\}\)-\([0-9]\{2\}\) 173 | 174 | Now drop this regex into the ``sed`` expression and use back-references to change the order. Don't forget to put the hyphens back in, too! 175 | 176 | $ sed 's/\([0-9]\{4\}\)-\([0-9]\{2\}\)-\([0-9]\{2\}\)/\3-\2-\1/g' small.log # BOOM 177 | 178 | Yes, the escaping is terrible. But once you do it a bit, you start to see through the escaping and identify the underlying structures: the groups, the ranges, the anchors, etc. Sad panda is sad, but sad panda is very powerful. 179 | 180 | ![powerful panda](./panda.jpg "escaping. deal with it.") 181 | 182 | 183 | ### OR-ing (alternation) 184 | 185 | *Back to ``grep``!* 186 | 187 | ``|`` - the pipe (or vertical bar) is an "or" and "alternation" and will match either the expression on the left or right side of the symbol. This can be combined with parens within a character string, or also used to OR entire expressions. 188 | 189 | $ grep "05\|22" small.log 190 | $ grep -E "[0-9]{8}|[a-Z]{10}" small.log # recall -E escaping 191 | $ grep -E "o(l|r)" small.log 192 | $ grep "o\(l\|r\)" small.log 193 | 194 | 195 | *the end* 196 | 197 | ------- 198 | 199 | ### Notes 200 | 201 | There are a lot of avenues to further explore the capabilities of regular expressions. We covered most of the building blocks, but here are a handful of possible next steps and assorted other notes to learn from while you're waiting for the 201 session of the class: 202 | 203 | I used these two resources for getting this outline laid out. The TLDP link has far more than is possible to fit in a 101 class: 204 | - [zytrax](http://www.zytrax.com/tech/web/regex.htm) 205 | - [TLDP](http://www.tldp.org/LDP/abs/html/x17129.html) 206 | 207 | Some other assorted things if you're interested in taking all of this a few steps further: 208 | 209 | **backreferences** 210 | - there are even more ways to use these... for example, define a group so that you can reuse that group in your *matching expression* instead of just the result. 211 | 212 | **more character classes** 213 | - there are other [character classes](http://www.zytrax.com/tech/web/regex.htm#special) that I didn't mention. 214 | 215 | **``grep`` vs. ``egrep`` vs. ``fgrep``** 216 | - Though ``egrep`` is deprecated, it is the same as using ``grep -E``. This extends ``grep`` to "extended ``grep``". Additionally, there is ``grep -F`` which does not evaluate the expression being used for matching. That is, if you're searching for plain text (literal characters), ``fgrep`` should be faster to finish searching and possibly matching. 
217 | 218 | **don't reinvent the wheel** 219 | - for commonly-used pattern matching, there are [inventories](http://regexlib.com/DisplayPatterns.aspx) of expressions to use 220 | 221 | **don't use regex for all of your parsing** 222 | - particularly, don't try to [parse HTML](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454?stw=2#1732454) 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /python-oop/README.md: -------------------------------------------------------------------------------- 1 | # OOP in Python (with a dash of packaging) 2 | 3 | An overview illustrating some aspects of Object Oriented Programming (OOP) in Python including: classes, functions, methods, inheritance. And, while we're at it, some discussion of modules and packages. 4 | 5 | - Built on Python 2.7.6, OS X 6 | 7 | - This walk-through will also involve learning about modules and packages in Python, so here are some great reference materials (to both have open and bookmarked for later): 8 | 9 | - [Introduction to Modules](http://docs.python.org/2/tutorial/modules.html) from official Python tutorial 10 | 11 | - [Introduction to Packages](http://www.network-theory.co.uk/docs/pytut/Packages.html) from the creator, himself 12 | 13 | ### Introduction 14 | 15 | There are a lot of words in this README. Sorry about that. It's not so much a README as it is a walk-through of the materials; this was my attempt to lay things out logically for presentation, but hopefully you can also just read through it and get the gist. With conversation and questions, this is more appropriately two 45-minute sessions. One reasonable way to break it down would consider the first two major pieces (Scripts and Modules) as the first installment, and Packages as the second installment. 16 | 17 | **n.b.**: I actually learned many of the details presented here while preparing this, so please do submit pull requests with any corrections you might have. Just try to keep it at the level of a beginning- to moderate-experience Python programmer. 18 | 19 | 20 | ## (Executable) Scripts 21 | 22 | First, look at the data file. Should look like standard `gnacs`-ified Twitter Activity Streams. No surprises up that sleeve. 23 | 24 | We'll start with an example of the simple Python script functionality with which many folks (myself included) begin using Python. Send some data to the shell's `stdout`, pipe it to an executable Python script, and read / do something with it via `sys.stdin` in our Python code. Run the data through the script first, and then we'll dive deeper: 25 | 26 | $ cat data.csv | ./simple_script.py # ideally, you get something like this... 27 | line number: 0, tweet body: kavga edelim ama konuşalım 28 | line number: 1, tweet body: @shane_joersz wooooow 29 | line number: 2, tweet body: お前との肌のふれあいなんぞ求めてない。自重しろ。 30 | line number: 3, tweet body: @Gabo_navoficial yo tambien creo en ti mi charro bello:)) 31 | line number: 4, tweet body: только ты об этом не знаешь... http://t.co/MOH8pcKyJY 32 | 33 | Now, let's look at the code in `simple_script.py`. If you're coming from another OO language, your first inclination may be to seek out things that match the pattern ``A.B``. In many OO languages, this suggests that ``A`` is an object and ``B`` is something defined within the scope of that object e.g. a function (often called a method in this case) or variable. 
The same is true in Python, only it has a subtle variation when it comes to modules and packages. On line 7 we import the standard-library module ``sys`` ('standard-library' means it's built-in, no ``pip install`` needed). Then, on line 10, we write a version of the most common Python scripting line ever written: ``for line in sys.stdin: ....``. Here, we use the dot notation to reach into the ``sys`` module and address an attribute defined therein, the ``stdin`` file object. 34 | 35 | It's not quite as obvious an implementation of object creation as ``foo = Thing()`` followed by e.g. ``foo.bar()`` (if that comparison doesn't make sense yet, hold tight... we're getting there). But, the ``import useful_module`` statement does actually instantiate an object that lets you use it later on. Ok, so let's just let that simmer for a bit. In line 10 we make use of the ``stdin`` file object to buffer lines from the shell, and we've also done two other object creation steps: created an integer called ``cnt`` and a string object ``line``. As a result, we can use all of the methods and attributes that belong to these variables, which we quickly do: in line 11, we use the ``split()`` method, which is defined in the string class. This one line actually has another object-specific attribute use: ``split()`` returns an object of type ``list``, and we use the ability to index the items in that list to extract the thing we wanted (the square bracket notation is actually shorthand for the list method ``__getitem__()``). 36 | 37 | Ok, so that's a quick example of how OOP is involved even when we're using "scripted" Python. But the power of OOP is yet to come. First, something completely different... (not really). 38 | 39 | ### *A Diversion On Python Namespace* 40 | 41 | Let's be a little more specific about the ``import sys`` line above; what we've actually done is added the ``sys`` module to the namespace of the current Python session. Every time you create a Python session, whether opening the interpreter (``$ python``) or running an executable script from the command line, a whole host of functions, variables, and modules are entered into the current namespace. This means you can use them by calling them directly, without any special techniques. For example, you can always do the following: 42 | 43 | $ ipython 44 | In [1]: f = float(34) 45 | 46 | And you can use the ``float()`` function because it's prepopulated in the namespace of the session. When you say ``import module_a``, you can subsequently use any attribute e.g. ``var1`` defined within the module scope as ``module_a.var1``. But if ``var2`` happened to be included in the namespace by default, you can go straight to addressing ``var2`` (getting or setting), no module prefix needed. More on this topic as we go along. 47 | 48 | ## Modules 49 | 50 | Even though we call `simple_script.py` a "script", any file that ends in `.py` and contains valid Python is technically a module. Often, modules are defined such that they can be imported into *other* code and used. (note: `simple_script.py` won't actually play nice if you open a Python interpreter and `import simple_script` because - I think - it's waiting for the "end" of the `sys.stdin` buffer). Instead, let's use the `simple_module.py` example. Open the file and have a look. 51 | 52 | ### Part 1 53 | 54 | In "part 1" of ``simple_module.py`` we define a couple of variables (at the outer-most level, so these are within the scope of the whole module), and one function which returns the square of its input.
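For orientation, here's a rough sketch of what "part 1" might contain. The values come from the interpreter sessions shown below and in the Appendix; the actual file may define things a bit differently:

    # sketch of "part 1" of simple_module.py (illustrative; check the real file)
    my_int = 4                # a module-level integer
    my_s = "hello world!"     # a module-level string

    def square(x):
        """Return the square of the input."""
        return x * x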
Start a CLI Python session (nb: if you use `ipython` you will have tab autocompletion and general merriment) from this project directory and import our module - this time with an alias. The alias allows you to avoid typing out the name of the module every time, but otherwise behaves the same as discussed before. See the Appendix at the end for more specific examples of ``import`` statements. Check that you can access the things defined in "part 1" of the module: 55 | 56 | In [1]: import simple_module as sm 57 | 58 | In [2]: sm.my_int # get the value stored in the my_int attribute 59 | Out[2]: 4 60 | 61 | In [3]: sm.square(6) 62 | Out[3]: 36 63 | 64 | 65 | ### Part 2 66 | 67 | In "part 2" of ``simple_module.py``, we're starting to use some of the more rich OO structures. This part of the module defines a ``Dog`` class, which has two methods: ``__init__()`` ("dunder init"), and ``talk()``. ``__init__()`` is referred to as the constructor or the initializer, depending on what other languages one has learned; this is the method that gets called automatically when you instantiate a new object of this particular class. The definition of the class often includes a base class from which we inherit things, and in Python 2.x, this is typically ``object`` (allows us to use newer features of Python). In Python 3, ``object`` doesn't have to be passed explicitly. We'll use inheritance again later; for now, just remember that you probably want to add ``object`` to your base class definition. 68 | 69 | When we create a new object based on a class, the constructor assigns that particular object all of the attributes defined in the constructor, and we can again address all the methods and variables (collectively, called attributes): 70 | 71 | In [1]: import simple_module as sm 72 | 73 | In [2]: d1 = sm.Dog() # create a Dog object 74 | 75 | In [3]: d1 # without a __repr__() defined in the class, returns the object's module, class, and memory location 76 | Out[3]: 77 | 78 | In [4]: vars(d1) # display all the internal variables of this object 79 | Out[4]: {'legs': 4, 'name': 'rex', 'owner': 'jane', 'word': 'woof'} 80 | 81 | In [5]: d1.legs # get the value of a particular variable 82 | Out[5]: 4 83 | 84 | In [6]: d1.talk() # have d1 use its talk() method 85 | Out[6]: 'woof, my name is rex. i have 4 legs and belong to jane.' 86 | 87 | In [7]: d1.name = "lichtenstien" # overwrite the name attribute in d1 88 | 89 | In [8]: d1.talk() # note the different output 90 | Out[8]: 'woof, my name is lichtenstien. i have 4 legs and belong to jane.' 91 | 92 | In [9]: d2 = sm.Dog() # create a new Dog 93 | 94 | In [10]: d2 # note that this is in a different memory location 95 | Out[10]: 96 | 97 | In [11]: d2.talk() # d2 has the same talk() method, but the original variable values 98 | Out[11]: 'woof, my name is rex. i have 4 legs and belong to jane.' 99 | 100 | 101 | Above, ``d1`` and ``d2`` are two unique instances of the same ``Dog`` class. As we've defined the ``Dog`` class, there's never going to be any variation from one ``Dog`` to another (at least upon creation - we could still reach in and overwrite the attributes). Sadness. But never fear, Part 3 brings hope... 102 | 103 | ### Part 3 104 | 105 | Finally, in "part 3", we have a ``Cat`` class definition similar to that of the ``Dog`` class, except note that the constructor takes arguments. There are a few ways to require or use arguments in the class definition for the creation of a new object. 
The first argument in any class constructor is ``self`` and this allows Python to use and access the attributes and methods that are unique to this particular instance. Later, when we want to manipulate or access these internal things, we can use the ``self.`` notation to do so. 106 | 107 | The first argument that we created is ``name`` and this is a required argument. If you try to create a ``Cat`` without passing in at least this one argument, you'll get a ``TypeError``. The second argument is ``legs``, but notice that in the constructor, we're already assigning it a default value. If you create a ``Cat`` and only give it a name, it will be created with four legs and an owner named John. However, since these arguments are in the constructor, you can override the default values. Importantly, the default arguments are positional, but if you pass them as keyword pairs the order doesn't matter (subtle!). Some examples: 108 | 109 | In [1]: import simple_module as sm 110 | 111 | In [2]: c = sm.Cat("fuzzball") # gets defaults for other attributes 112 | 113 | In [3]: vars(c) 114 | Out[3]: {'legs': 4, 'name': 'fuzzball', 'owner': 'john', 'word': 'meow'} 115 | 116 | In [4]: c = sm.Cat("fuzzball", owner="josh", legs=56) # as explicit keywords, order doesn't matter 117 | 118 | In [5]: c.talk() 119 | Out[5]: 'meow, my name is fuzzball. i have 56 legs and belong to josh.' 120 | 121 | In [6]: c = sm.Cat("garfield", "john", 4) # without keywords, arguments are positional! 122 | 123 | In [7]: c.talk() 124 | Out[7]: 'meow, my name is garfield. i have john legs and belong to 4.' # d'oh! 125 | 126 | 127 | One final point on using a module like ``simple_module.py``. Whenever a module (say, ``A``) is imported into another one (``B``), a bunch of magic happens behind the scenes. One such thing is that the imported module's ``__name__`` attribute gets set to the module's own name. If, however, the module is executed from the command line, the ``__name__`` attribute is assigned the value ``__main__``, which is why many modules have a test for this at the bottom. This is a way to show some default behavior, or a quick way to test the behavior of a module, but it isn't really adequate for a real test suite. Run ``simple_module.py`` from the command line to see an example, and check that the code that runs at the bottom of the file makes sense. 128 | 129 | 130 | ## Packages 131 | 132 | Often, there is a bunch of code (contained in many separate modules) that is logically related and should be carried around together. Think of the way ``gnacs`` contains many separate modules (``twacs.py``, ``fsqacs.py``, ...), contained within subdirectories of e.g. a Github repo. In a typical Python package, these directories will each have an ``__init__.py`` module with a single line of code in it that names all of the submodules (see, for example, the version in [``gnacs``](https://github.com/DrSkippy27/Gnacs/blob/master/acscsv/__init__.py)). The ``__init__.py`` is how Python knows that the files inside this directory are modules and that they can also be imported. Many of the programs that we use frequently (nearly all of the OSS Python code Scott has written, the "big" Python libraries like ``numpy``) are actually parts of packages, because they contain a lot of code. It seems that the word "package" is the proper term, but "library" is often used to mean the same thing.
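To make that layout concrete, here's a minimal, hypothetical package skeleton (the directory and module names below are invented for illustration): a directory containing an ``__init__.py`` whose ``__all__`` list names the submodules, which is also what a ``from package import *`` pulls in.

    # hypothetical package layout (names are made up):
    #
    #   critters/
    #       __init__.py      # marks the directory as a package
    #       feline.py
    #       canine.py
    #
    # critters/__init__.py can be as small as one statement:
    __all__ = ["feline", "canine"]   # submodules exposed by "from critters import *"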
Generally, packages from any language or operating system are installed on your system by a package manager, e.g.: ``npm``, node.js; ``gem``, Ruby; ``rpm``, RPM-based Linux; ``apt-get``, Debian-based Linux; ``easy_install``, Python (largely superseded by ``pip``); ``conda``, Python [Anaconda]; ``homebrew, fink, macports``, OS X. The most common package manager for Python is ``pip``, which goes out to get code from [PyPI](https://pypi.org/). 133 | 134 | Ideally, the naming of directories follows a logical hierarchy for the modules contained therein. In this repo, you'll find a directory called ``life`` which is perhaps haughty, but nonetheless representative of the submodules contained in it: ``beast.py`` and ``human.py`` (please wait until after the tutorial to have philosophical discussions about the (non-)orthogonality of "beast" and "human"). These two modules represent different, specialized versions of ``life``. Open ``beast.py`` first and have a look. The ``Animal`` class definition and all of its methods should look familiar from the earlier examples. The new hotness comes when we create another class ``Dog``, which is derived from the more general ``Animal`` class. The first line of the constructor then allows us to use all of the ``Animal`` methods and attributes from a ``Dog``. Note that the ``Dog`` constructor also overrides a couple of the ``Animal`` attributes. 135 | 136 | If you stay at the top level of this project (the ``python-oop`` directory), you can still import the ``beast.py`` module because we've made ``life`` a package. Some example behavior: 137 | 138 | 139 | In [1]: from life import beast as b 140 | 141 | In [2]: a = b.Animal() 142 | 143 | In [3]: a.talk() 144 | Out[3]: " I'm None and hungry" 145 | 146 | In [4]: vars(a) 147 | Out[4]: {'hungry': True, 'name': None, 'talk': ''} 148 | 149 | In [5]: d = b.Dog("rex") 150 | 151 | In [6]: d.talk() 152 | Out[6]: "woof! I'm rex and hungry" 153 | 154 | In [7]: d.eat() 155 | 156 | In [8]: d.talk() 157 | Out[8]: "woof! I'm rex and not hungry" 158 | 159 | 160 | The last section of this little walk-through combines many of the above concepts into the ``human.py`` module in the ``life`` package. Open that up and have a look. The most basic class is that of a ``Person`` where a few attributes are set to default, null-ish values. In fact, we don't need to assign any attributes here, but I wanted to include a base ``talk()`` method that provides some default behavior. When you create a ``Woman`` object (which, note, has required arguments in addition to some keyword arguments) you get to override the boring defaults of the ``Person``, and you get a new method ``high_five()`` that isn't callable by a general ``Person`` object. 161 | 162 | Finally, in the last class definition in ``human.py``, the constructor has a combination of required arguments, default arguments that get passed to the parent constructor, and the keyword argument wildcard ``**kwargs``. This last one allows you to pass an arbitrarily-long list of ``key=value`` pairs to the constructor (for a longer introduction to ``*args`` and ``**kwargs``, check out the [official docs **fix link**](blah)). After calling the parent constructor, you can see that we iterate through all of the key-value pairs, making an attribute of the former, and assigning it a value of the latter. Examples are always good.
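Here's a minimal sketch of that ``**kwargs``-to-attributes pattern. The class and attribute names are invented for illustration; they are not necessarily what ``human.py`` uses:

    # sketch of: required arg + default arg + **kwargs turned into attributes
    class Base(object):
        def __init__(self, name, word="hi"):
            self.name = name
            self.word = word

    class Fancy(Base):
        def __init__(self, name, word="holla", **kwargs):
            # hand the required/default args to the parent constructor first
            super(Fancy, self).__init__(name, word=word)
            # then turn every extra key=value pair into an attribute
            for key, value in kwargs.items():
                setattr(self, key, value)

    f = Fancy("jane", bear="teddy", cow=3)
    # vars(f) -> {'name': 'jane', 'word': 'holla', 'bear': 'teddy', 'cow': 3}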
Here's a run-through of the different things that came from the ``life`` package: 163 | 164 | 165 | In [1]: from life import * # since we know there's not much in life, * is ok 166 | # and gives access to beast and human modules 167 | In [2]: d = beast.Dog("fluffy") 168 | 169 | In [5]: w = human.Woman("sarah") # requires a name 170 | 171 | In [6]: vars(w) 172 | Out[6]: {'eyes': 'blue', 'gender': 'female', 'name': 'sarah', 'word': 'yo'} 173 | 174 | In [7]: w.talk() 175 | Out[7]: 'yo, my name is sarah. my gender is female, and i have blue eyes.' 176 | 177 | In [8]: aw = human.AmericanWoman("jane") # requires a name because Woman requires a name 178 | 179 | In [9]: vars(aw) 180 | Out[9]: {'eyes': 'brown', 'gender': 'female', 'name': 'jane', 'word': 'holla'} 181 | 182 | In [10]: aw.talk() # AmericanWoman has overwritten the talk() method 183 | 184 | Don't come hangin' 'round my door 185 | I don't wanna see your shadow no more. 186 | 187 | In [11]: aw.lenny_kravitz() 188 | Out[11]: 'guitar solo! ( http://www.youtube.com/watch?v=UzWHE32IxUc ) ' 189 | 190 | In [12]: aw = human.AmericanWoman("jane", bear="teddy", cow=3) # arb key-value pairs get assigned as attributes 191 | 192 | In [13]: vars(aw) 193 | Out[13]: 194 | {'bear': 'teddy', 195 | 'cow': 3, 196 | 'eyes': 'brown', 197 | 'gender': 'female', 198 | 'name': 'jane', 199 | 'word': 'holla'} 200 | 201 | 202 | Phew! There is, of course, much more to say and know about all of these topics (e.g. how to do some of these things even better), so do check out all of the official docs and/or stackoverflow for more examples and info. 203 | 204 | 205 | 206 | --------------------------------------------------------------- 207 | ### \*Appendix A: ``import`` statements 208 | 209 | There are three common ways you'll see modules and packages imported; the differences are mainly in how much new stuff you add to the interpreter's namespace. That is, do you have to specifically reach through `sys` to get to `sys.stdin`, or do you have direct access to `stdin`? Examples of the three types of imports are as follows: 210 | 211 | In [1]: import simple_module 212 | In [2]: simple_module.my_s 213 | Out[2]: 'hello world!' 214 | #### 215 | 216 | In [1]: import simple_module as sm # same as above, but with alias 217 | In [2]: sm.my_int 218 | Out[2]: 4 219 | #### 220 | 221 | In [1]: from simple_module import square 222 | In [2]: square(6) 223 | Out[2]: 36 224 | 225 | In [3]: my_int 226 | --------------------------------------------------------------------------- 227 | NameError Traceback (most recent call last) 228 | in () 229 | ----> 1 my_int 230 | NameError: name 'my_int' is not defined 231 | #### 232 | 233 | In [1]: from simple_module import * 234 | In [2]: square(4) 235 | Out[2]: 16 236 | 237 | In [3]: my_s 238 | Out[3]: 'hello world!' 239 | 240 | 241 | ``import`` take-aways: 242 | 243 | - Python allows you to import all willy-nilly 244 | 245 | - order of operations for "places to look for imports" (i.e. how ``sys.path`` gets built) is ~ ``./`` (the directory of the script), the ``PYTHONPATH`` environment variable, then the installation-dependent defaults (e.g. ``site-packages``); the shell's ``PATH`` is not consulted 246 | 247 | - ``import *`` is often considered "poor form" because it clutters (and can clobber) the session namespace. it still happens, but be aware of that fact for e.g.
debugging when your function named ``random()`` isn't working as you'd like 248 | 249 | - many package/module aliases have convention that are good to follow (``mpl, plt, pd, np, sp, ...``), particularly when sharing code with others or looking for help online 250 | 251 | - all that said, code however you want but always assume the next person reading/using your code knows where you sleep 252 | 253 | -------------------------------------------------------------------------------- /classical-stats-and-social-data-101/classical-stats-and-social-data-101.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#Outline\n", 15 | "* Motivate the use of the Binomial and Poisson distributions in social media analysis\n", 16 | "* Explore an important hypothesis test: the p-value\n", 17 | "* Address shortcomings" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# The Binomial and Poisson distributions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "**Goal:** demonstrate why many events in social media should be Poisson distributed.\n", 32 | "\n", 33 | "Consider a set of trials where, for each trial, something happens (a \"success\") or does not happen (a \"failure\").\n", 34 | "\n", 35 | "$p = $ probability of a success \n", 36 | "$1-p = $ the probability of a failure \n", 37 | "$N = $ number of trials\n", 38 | "\n", 39 | "The Binomial distribution is a function of the discrete variable $k$. Its value $B(k;p,N)$ give the probabilty of observing $k$ successes in $N$ trials, given success probability $p$. \n", 40 | "\n", 41 | "$B(k;p,N) = \\frac{N!}{k!(N-k)!}p^k(1-p)^{N-k}$\n", 42 | "\n", 43 | "A series of coin flips is a process whose distribution of outcomes is described by the Binomial distribution with $p=0.5$.\n", 44 | "\n", 45 | "Let's start by looking at the exact distribution. Remember that this is a function of a _discrete_ variable." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "# matplotlib plots are placed inline\n", 53 | "%matplotlib inline \n", 54 | "\n", 55 | "# standard matplotlib and numpy imports\n", 56 | "import matplotlib.pyplot as plt\n", 57 | "import numpy as np\n", 58 | "\n", 59 | "# use scipy.stats to define the distribution\n", 60 | "import scipy\n", 61 | "from scipy import stats\n", 62 | "\n", 63 | "N = 10 # number of coin flips in a set\n", 64 | "p = 0.5 # probability of head\n", 65 | "\n", 66 | "x = scipy.linspace(0,N,N+1) # create bins\n", 67 | "pmf = scipy.stats.binom.pmf(x,N,p) \n", 68 | "# \"pmf\" => probability mass function, which (in a snowstorm of ill-used vocabulary) is in this case what we would usually call\n", 69 | "# the probability density function\n", 70 | "\n", 71 | "plt.bar(x,pmf)" 72 | ], 73 | "language": "python", 74 | "metadata": {}, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Note the unit normalization in the plot above.\n", 82 | "\n", 83 | "Now let's look at a histogram made from values randomly draw from the exact distribution. This histograms represents, in this example, a finite-sized set of coin flips." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "collapsed": false, 89 | "input": [ 90 | "N = 10 # number of trials (\"coin-flips\") in a set\n", 91 | "p = 0.5 # probability of success (\"heads\")\n", 92 | "size = 50 # number of sets of coin flips\n", 93 | "\n", 94 | "np.random.binomial(N,p,size) # number of heads in a set of coin flips" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "collapsed": false, 103 | "input": [ 104 | "# how many sets of trials are required to make the approximation appear visually identical to the exact distribution?\n", 105 | "size = 10\n", 106 | "\n", 107 | "data = np.random.binomial(N,p,size)\n", 108 | "n_bins = N\n", 109 | "binned_data, bins, patches = plt.hist(data, n_bins, range = (0,10), normed=True)\n" 110 | ], 111 | "language": "python", 112 | "metadata": {}, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Let's go back to the exact distribution and mess with the parameters. Be sure to adjust both the success probability and the number of trials." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "collapsed": false, 125 | "input": [ 126 | "N = 100 # number of trials in a set\n", 127 | "p = 0.5 # probability of success \n", 128 | "\n", 129 | "x = scipy.linspace(0,N,N+1) # create bins\n", 130 | "pmf = scipy.stats.binom.pmf(x,N,p)\n", 131 | "plt.bar(x,pmf)" 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "\n", 142 | "**Question:** Do you expect this distribution represent the behavior of social media variables?\n", 143 | "\n", 144 | "
\n", 145 | "\n", 146 | "Now consider the limit where $p\\to 0$ and $N\\to\\infty$, but where $pN = \\nu$ stays constant. \n", 147 | "\n", 148 | "In this case, $\\nu$ is the *expected* number of \"successes\", or the number of any \"thing\" in a _large_ data set that appears with low relative frequency. This value $\\nu$ is the single parameter, called the _mean_, of the Poisson distribution. This distribution describes many phenomena, including many variables in social data.\n", 149 | "\n", 150 | "* 1 hour or tweets from the firehose $\\longrightarrow$ large $N$ \n", 151 | "* the probability of ProfileLocation = \"Boulder, Colorado, United States\" $\\longrightarrow$ small $p$\n", 152 | "\n", 153 | "So, the number of tweets with ProfileLocation = \"Boulder, Colorado, United States\" in a large sample of tweets will be described by the Poisson distribution with mean $\\nu$. \n", 154 | "\n", 155 | "The Poisson distribution is given by:\n", 156 | "\n", 157 | "$P(k;\\nu) = \\frac{\\nu^k e^{-\\nu}}{k!}$\n", 158 | "\n", 159 | "Let's look at the shape of the distribution:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "collapsed": false, 165 | "input": [ 166 | "nu = 3 # Poisson mean...the most likely value\n", 167 | "n = nu*3 # this simply sets a range; it is not a parameter of the distribution\n", 168 | "\n", 169 | "k = scipy.linspace(0,n,n+1) # create bins\n", 170 | "pmf = scipy.stats.poisson.pmf(k,nu) \n", 171 | "\n", 172 | "plt.bar(k,pmf)" 173 | ], 174 | "language": "python", 175 | "metadata": {}, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Things to note: \n", 183 | "\n", 184 | "* the distribution is a function of a discrete variable\n", 185 | "* the distribution has one continuous parameter\n", 186 | "* the distribution is asymmetric for small values of $\\nu$\n", 187 | "* the distribution becomes symmetric for large values of $\\nu$\n", 188 | "\n", 189 | "Conclusion: the Poisson distribution can be expected to model the number of observations of a rare event in a large set of trials. " 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "# Hypothesis Tests and p-values" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "##Digresion on the Scientific method \n", 204 | "\n", 205 | "The scientific method does not prove that a hypothesis is true/correct. Rather, hypotheses are disproved by experimental results that are inconsistent with the hypothesis.\n", 206 | "\n", 207 | "In many fields of science, consistency or inconsistency is a discrete state. For systems where the statistics come into play, things are more complicated. We must define what _level of inconsistency_ constitutes the disproval of a hypothesis.\n", 208 | "\n", 209 | "## Hypothesis forms\n", 210 | "\n", 211 | "To quantitatively test a hypothesis, we need a quantitative hypothesis, and some data.\n", 212 | "\n", 213 | "Example hypotheses: \n", 214 | "\n", 215 | "* the rate of tweets with ProfileLocation = \"Boulder, Colorado, United States\" is Poisson distributed with a mean of 4.5 / hour. \n", 216 | "* the rate of tweets with ProfileLocation = \"Boulder, Colorado, United States\" is Poisson distributed with a mean of 4.5 / 10k tweets. 
\n", 217 | "* the number of Twitter users with more than 1000 @-mentions in the past hour is normally distributed with mean 301 and variance 200.\n", 218 | "\n", 219 | "**Complication:** note that the hypotheses specify the form of the data distribution, as well as numerical parameters of those distributions. So this is in some sense a composite, conditional hypothesis: we assume both the shape of the distribution and particular values for the parameters. \n", 220 | "\n", 221 | "**Simplification:** For now, we'll assume that our _form_ is a well-founded hypothesis, and will not be tested. We will test hypotheses that are defined by the values of the parameters.\n", 222 | "\n", 223 | "Now look at some data..." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "collapsed": false, 229 | "input": [ 230 | "# define a function that counts the number of \"Boulder, Colorado, United States\" entries \n", 231 | "# in a file containing ProfileLocation strings \n", 232 | "def count_boulders(file_name):\n", 233 | " f = open(file_name)\n", 234 | " boulder_counter = 0\n", 235 | " for line in f:\n", 236 | " if 'Boulder, Colorado, United States' in line:\n", 237 | " boulder_counter += 1\n", 238 | " return boulder_counter" 239 | ], 240 | "language": "python", 241 | "metadata": {}, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "The files 'locations.X.txt', where 1<=X<=12, each contain the ProfileLocation strings from 10k tweets. " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "collapsed": false, 254 | "input": [ 255 | "print('{} tweets from Boulder'.format(count_boulders('locations.1.txt')))" 256 | ], 257 | "language": "python", 258 | "metadata": {}, 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "This value seems like a good first hypothesis. When put into a Poisson distribution, our hypothesis for the distribution of the number of Boulder tweets in 10k tweets is:" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "collapsed": false, 271 | "input": [ 272 | "nu = count_boulders('locations.1.txt') # Poisson mean...the most likely value\n", 273 | "n = nu*3 # this simply sets a range; it is not a parameter of the distribution\n", 274 | "\n", 275 | "k = scipy.linspace(0,n,n+1) # create bins\n", 276 | "pmf = scipy.stats.poisson.pmf(k,nu) #pmf = probability mass function\n", 277 | "plt.bar(k,pmf)" 278 | ], 279 | "language": "python", 280 | "metadata": {}, 281 | "outputs": [] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "The distribution is normalized to unit area, so that probabilities for individual bins can be read off the plot. The probability of observing 0 Boulder tweets in 10k is ~5%, while observing 5 is about 10% probable.\n", 288 | "\n", 289 | "## Hypothesis tests and the p-value\n", 290 | "\n", 291 | "The p-value is defined for a hypothesis and a single experimental result. It is the total probablility, under the given hypothesis, of observing a result as likely or less likely than the actual observation. In its simplest form, this corresponds to integrating the tail of the distribution at and beyond the observed value." 
292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "collapsed": false, 297 | "input": [ 298 | "from IPython.display import Image\n", 299 | "Image(filename='p_val.jpeg')" 300 | ], 301 | "language": "python", 302 | "metadata": {}, 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "In many fields of science, the observation of a result with a p-value of less than 0.05 is considered a disproval of the null hypothesis.\n", 310 | "\n", 311 | "Next: define a function that returns the p-value for a Poisson distribution with mean $\\nu$ and observed value $x$." 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "collapsed": false, 317 | "input": [ 318 | "def p_value_poisson(nu,x_obs):\n", 319 | " dist = scipy.stats.poisson(nu)\n", 320 | " return 1-dist.cdf(x_obs-1) # cdf = cumulative distribution function...integrates from 0 up to x" 321 | ], 322 | "language": "python", 323 | "metadata": {}, 324 | "outputs": [] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Now test some values:" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "collapsed": false, 336 | "input": [ 337 | "print(p_value_poisson(5,10)) # probability of observing 10,11,12... when the true mean is 5\n", 338 | "print(p_value_poisson(4,5)) # probability of observing 5,6,7,8... when the true mean is 4" 339 | ], 340 | "language": "python", 341 | "metadata": {}, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Is this a good hypothesis test? Let's use the mean estimated from 'locations.1.txt', and test the p-value of 'locations.2.txt'." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "collapsed": false, 354 | "input": [ 355 | "print('{} boulder tweets in {}'.format(count_boulders('locations.1.txt'),'locations.1.txt'))\n", 356 | "print('{} boulder tweets in {}'.format(count_boulders('locations.2.txt'),'locations.2.txt'))\n", 357 | "\n", 358 | "p_value_poisson(count_boulders('locations.1.txt'),count_boulders('locations.2.txt'))" 359 | ], 360 | "language": "python", 361 | "metadata": {}, 362 | "outputs": [] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "This says that, under our hypothesis of $\\nu=4$, there is a 1% probability of observing a result like or less likely than that observed in 'locations.2.txt'. Is our hypothesis disproved? \n", 369 | "\n", 370 | "
\n", 371 | "\n", 372 | "Let's look at the of p-values for an ensemble of 10k sets of tweets. " 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "collapsed": false, 378 | "input": [ 379 | "for file_number in range(1,12):\n", 380 | " file_name = 'locations.' + str(file_number) + '.txt'\n", 381 | " print('{0} boulder tweets -> p_value: {1:0.3f}'.format(\n", 382 | " count_boulders(file_name),\n", 383 | " p_value_poisson(\n", 384 | " count_boulders('locations.1.txt'),\n", 385 | " count_boulders(file_name)\n", 386 | " )\n", 387 | " )\n", 388 | " )\n", 389 | " " 390 | ], 391 | "language": "python", 392 | "metadata": {}, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "It turns out that we based our hypothesis on a relatively uncommon frequency of Boulder tweets. And we tested this hypothesis with a set of data from 'locations.2.txt' which turn out to be even more uncommon.\n", 400 | "\n", 401 | "Let's make a few improvements: \n", 402 | "\n", 403 | "* get slightly larger numbers by counting tweets from Boulder in 50k tweet buckets\n", 404 | "* calculate the $\\nu$ value for our hypothesis by averaging the $\\nu$'s from the first 10 buckets.\n", 405 | "* calculate the $p$-values for the remaining buckets\n", 406 | "\n", 407 | "\n", 408 | "These data can be found in 'num_boulder_50k.txt', where each line gives the number of tweets from Boulder in a 50k tweet bucket." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "collapsed": false, 414 | "input": [ 415 | "line_counter = 0\n", 416 | "count_values = [] # array of number of Boulder tweets\n", 417 | "p_values = [] # array of p-values\n", 418 | "sum_of_boulder_tweets = 0 # used to get average number of Boulder tweets from first 10 values\n", 419 | "for line in open('num_boulder_50k.txt'):\n", 420 | " if line_counter < 10:\n", 421 | " sum_of_boulder_tweets += float(line)\n", 422 | " else:\n", 423 | " mean = sum_of_boulder_tweets/10\n", 424 | " count_values.append(int(line))\n", 425 | " p_values.append( p_value_poisson(mean,int(line)) )\n", 426 | " line_counter += 1\n", 427 | " \n", 428 | "print('\\nestimate for nu is {}\\n'.format(mean))\n" 429 | ], 430 | "language": "python", 431 | "metadata": {}, 432 | "outputs": [] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "collapsed": false, 437 | "input": [ 438 | "# construct Poisson distribution and overlay it with the observed data\n", 439 | "n = 10\n", 440 | "k = scipy.linspace(0,n,n+1) # create bins\n", 441 | "pmf = scipy.stats.poisson.pmf(k,mean) # use 'mean' from above\n", 442 | "plt.bar(k,pmf,color='yellow')\n", 443 | "tmp = plt.hist(count_values, n, range = (0,n), normed=True, histtype='step')" 444 | ], 445 | "language": "python", 446 | "metadata": {}, 447 | "outputs": [] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "collapsed": false, 452 | "input": [ 453 | "# now show the p-values\n", 454 | "print('printing {} p-values:\\n'.format(len(p_values)))\n", 455 | "for val in sorted(p_values):\n", 456 | " print('{0:0.3f}'.format(val))\n" 457 | ], 458 | "language": "python", 459 | "metadata": {}, 460 | "outputs": [] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "Even with a hypothesis that we know represents the data relatively well, we still occasionally get small p-values. 
I hope this makes you uncomfortable.\n" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Some thoughts to think\n", 474 | "\n", 475 | "We've made some assumptions:\n", 476 | "\n", 477 | "The value of $\\nu$ is not changing over the time period during which our samples were taken. \n", 478 | "\n", 479 | "* we control this effect by ensuring that all tweets are actually from hour 00 on Mondays in May 2014\n", 480 | "* if you can't or don't control for this potential bias, your probability for observing very small p-values will increase\n", 481 | "\n", 482 | "We could also use a more sophisticated estimation method for $\\nu$, but this won't significantly change the hypothesis \n", 483 | "\n", 484 | "* the problem is not the hypothesis, it's the hypothesis test\n", 485 | "\n", 486 | "For a sufficiently large number of results, there will ALWAYS be some with arbitrarily small p-values. So how do we disprove a hypothesis?\n", 487 | "\n", 488 | "* some fields of science define disproval with very small p-values (10^-7)\n", 489 | "\n", 490 | "We can account for multiple trials with the look-elsewhere effect: \n", 491 | "\n", 492 | "* this is a correction factor to the p-values which account the fact that looking in lots of places leads to improbable results\n", 493 | "* but it assumes the trials are completely independant\n", 494 | "\n", 495 | "**Question:** Instead of performing a series of experiments and making an independant analysis of each result, wouldn't it make more sense to use the output of each trial to influence subsequent analysis results? \n", 496 | "\n", 497 | "**Answer:** That sounds awfully Bayesian..." 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "collapsed": false, 503 | "input": [], 504 | "language": "python", 505 | "metadata": {}, 506 | "outputs": [] 507 | } 508 | ], 509 | "metadata": {} 510 | } 511 | ] 512 | } -------------------------------------------------------------------------------- /count-min/CountMinSketch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:469285e7d235d84f6d806ae277b686f47e67e1b6e717692769251a10e89e4b24" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Basic Idea of Count Min sketch" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "We map the input value to _multiple_ points in a _relatively small_ output space. Therefore, the count associated with a given input will be applied to multiple counts in the output space. Even though collisions will occur, the _minimum_ count associated with a given input will have some desirable properties, including the ability to be used to estimate the largest N counts.\n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "http://debasishg.blogspot.com/2014/01/count-min-sketch-data-structure-for.html\n", 28 | "\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Parameters of the sketch:\n", 36 | "\n", 37 | "* epsilon\n", 38 | "* delta\n", 39 | "\n", 40 | "These parameters are inversely and exponentially (respectively) related to the sketch size parameters, d and w. 
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "heading", 45 | "level": 1, 46 | "metadata": {}, 47 | "source": [ 48 | "Implementation of the CM sketch" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "import sys\n", 56 | "import random\n", 57 | "import numpy as np\n", 58 | "import heapq\n", 59 | "import json\n", 60 | "import time\n", 61 | "\n", 62 | "BIG_PRIME = 9223372036854775783\n", 63 | "\n", 64 | "def random_parameter():\n", 65 | " return random.randrange(0, BIG_PRIME - 1)\n", 66 | "\n", 67 | "\n", 68 | "class Sketch:\n", 69 | " def __init__(self, delta, epsilon, k):\n", 70 | " \"\"\"\n", 71 | " Setup a new count-min sketch with parameters delta, epsilon and k\n", 72 | "\n", 73 | " The parameters delta and epsilon control the accuracy of the\n", 74 | " estimates of the sketch\n", 75 | "\n", 76 | " Cormode and Muthukrishnan prove that for an item i with count a_i, the\n", 77 | " estimate from the sketch a_i_hat will satisfy the relation\n", 78 | "\n", 79 | " a_hat_i <= a_i + epsilon * ||a||_1\n", 80 | "\n", 81 | " with probability at least 1 - delta, where a is the the vector of all\n", 82 | " all counts and ||x||_1 is the L1 norm of a vector x\n", 83 | "\n", 84 | " Parameters\n", 85 | " ----------\n", 86 | " delta : float\n", 87 | " A value in the unit interval that sets the precision of the sketch\n", 88 | " epsilon : float\n", 89 | " A value in the unit interval that sets the precision of the sketch\n", 90 | " k : int\n", 91 | " A positive integer that sets the number of top items counted\n", 92 | "\n", 93 | " Examples\n", 94 | " --------\n", 95 | " >>> s = Sketch(10**-7, 0.005, 40)\n", 96 | "\n", 97 | " Raises\n", 98 | " ------\n", 99 | " ValueError\n", 100 | " If delta or epsilon are not in the unit interval, or if k is\n", 101 | " not a positive integer\n", 102 | "\n", 103 | " \"\"\"\n", 104 | " if delta <= 0 or delta >= 1:\n", 105 | " raise ValueError(\"delta must be between 0 and 1, exclusive\")\n", 106 | " if epsilon <= 0 or epsilon >= 1:\n", 107 | " raise ValueError(\"epsilon must be between 0 and 1, exclusive\")\n", 108 | " if k < 1:\n", 109 | " raise ValueError(\"k must be a positive integer\")\n", 110 | "\n", 111 | " self.w = int(np.ceil(np.exp(1) / epsilon))\n", 112 | " self.d = int(np.ceil(np.log(1 / delta)))\n", 113 | " self.k = k\n", 114 | " self.hash_functions = [self.__generate_hash_function() for i in range(self.d)]\n", 115 | " self.count = np.zeros((self.d, self.w), dtype='int32')\n", 116 | " self.heap, self.top_k = [], {} # top_k => [estimate, key] pairs\n", 117 | "\n", 118 | " def update(self, key, increment):\n", 119 | " \"\"\"\n", 120 | " Updates the sketch for the item with name of key by the amount\n", 121 | " specified in increment\n", 122 | "\n", 123 | " Parameters\n", 124 | " ----------\n", 125 | " key : string\n", 126 | " The item to update the value of in the sketch\n", 127 | " increment : integer\n", 128 | " The amount to update the sketch by for the given key\n", 129 | "\n", 130 | " Examples\n", 131 | " --------\n", 132 | " >>> s = Sketch(10**-7, 0.005, 40)\n", 133 | " >>> s.update('http://www.cnn.com/', 1)\n", 134 | "\n", 135 | " \"\"\"\n", 136 | " for row, hash_function in enumerate(self.hash_functions):\n", 137 | " column = hash_function(abs(hash(key)))\n", 138 | " self.count[row, column] += increment\n", 139 | "\n", 140 | " self.update_heap(key)\n", 141 | "\n", 142 | " def update_heap(self, key):\n", 143 | " \"\"\"\n", 144 | " Updates the class's heap that keeps track of the top k items for a\n", 145 | " 
given key\n", 146 | "\n", 147 | " For the given key, it checks whether the key is present in the heap,\n", 148 | " updating accordingly if so, and adding it to the heap if it is\n", 149 | " absent\n", 150 | "\n", 151 | " Parameters\n", 152 | " ----------\n", 153 | " key : string\n", 154 | " The item to check against the heap\n", 155 | "\n", 156 | " \"\"\"\n", 157 | " estimate = self.get(key)\n", 158 | "\n", 159 | " if not self.heap or estimate >= self.heap[0][0]:\n", 160 | " if key in self.top_k:\n", 161 | " old_pair = self.top_k.get(key)\n", 162 | " old_pair[0] = estimate\n", 163 | " heapq.heapify(self.heap)\n", 164 | " else:\n", 165 | " if len(self.top_k) < self.k:\n", 166 | " heapq.heappush(self.heap, [estimate, key])\n", 167 | " self.top_k[key] = [estimate, key]\n", 168 | " else:\n", 169 | " new_pair = [estimate, key]\n", 170 | " old_pair = heapq.heappushpop(self.heap, new_pair)\n", 171 | " del self.top_k[old_pair[1]]\n", 172 | " self.top_k[key] = new_pair\n", 173 | "\n", 174 | " def get(self, key):\n", 175 | " \"\"\"\n", 176 | " Fetches the sketch estimate for the given key\n", 177 | "\n", 178 | " Parameters\n", 179 | " ----------\n", 180 | " key : string\n", 181 | " The item to produce an estimate for\n", 182 | "\n", 183 | " Returns\n", 184 | " -------\n", 185 | " estimate : int\n", 186 | " The best estimate of the count for the given key based on the\n", 187 | " sketch\n", 188 | "\n", 189 | " Examples\n", 190 | " --------\n", 191 | " >>> s = Sketch(10**-7, 0.005, 40)\n", 192 | " >>> s.update('http://www.cnn.com/', 1)\n", 193 | " >>> s.get('http://www.cnn.com/')\n", 194 | " 1\n", 195 | "\n", 196 | " \"\"\"\n", 197 | " value = sys.maxint\n", 198 | " for row, hash_function in enumerate(self.hash_functions):\n", 199 | " column = hash_function(abs(hash(key)))\n", 200 | " value = min(self.count[row, column], value)\n", 201 | "\n", 202 | " return value\n", 203 | "\n", 204 | " def __generate_hash_function(self):\n", 205 | " \"\"\"\n", 206 | " Returns a hash function from a family of pairwise-independent hash\n", 207 | " functions\n", 208 | "\n", 209 | " \"\"\"\n", 210 | " a, b = random_parameter(), random_parameter()\n", 211 | " return lambda x: (a * x + b) % BIG_PRIME % self.w\n", 212 | " " 213 | ], 214 | "language": "python", 215 | "metadata": {}, 216 | "outputs": [], 217 | "prompt_number": 1 218 | }, 219 | { 220 | "cell_type": "code", 221 | "collapsed": false, 222 | "input": [ 223 | "# define a function to return a list of the exact top users, sorted by count\n", 224 | "def exact_top_users(f, top_n = 10):\n", 225 | " import operator\n", 226 | " counts = {}\n", 227 | " for line in f:\n", 228 | " try:\n", 229 | " user = json.loads(line)['actor']['preferredUsername']\n", 230 | " if user not in counts:\n", 231 | " counts[user] = 1\n", 232 | " else:\n", 233 | " counts[user] += 1\n", 234 | " except ValueError:\n", 235 | " pass\n", 236 | " except KeyError:\n", 237 | " pass\n", 238 | " counter = 0\n", 239 | " results = []\n", 240 | " for user,count in reversed(sorted(counts.iteritems(), key=operator.itemgetter(1))):\n", 241 | " if counter >= top_n:\n", 242 | " break\n", 243 | " results.append('{} {}'.format(user,str(count)))\n", 244 | " counter += 1\n", 245 | " return results" 246 | ], 247 | "language": "python", 248 | "metadata": {}, 249 | "outputs": [], 250 | "prompt_number": 2 251 | }, 252 | { 253 | "cell_type": "code", 254 | "collapsed": false, 255 | "input": [ 256 | "f = open('CM_small.json')\n", 257 | "results_exact = sorted(exact_top_users(f))\n", 258 | "print(results_exact)" 259 | ], 260 
| "language": "python", 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "stream": "stdout", 266 | "text": [ 267 | "['HPSupport 3', 'user1 3', 'user2 3', 'user4 1']\n" 268 | ] 269 | } 270 | ], 271 | "prompt_number": 8 272 | }, 273 | { 274 | "cell_type": "code", 275 | "collapsed": false, 276 | "input": [ 277 | "# define a function to return a list of the estimated top users, sorted by count\n", 278 | "def CM_top_users(f, s):\n", 279 | " for line in f:\n", 280 | " try:\n", 281 | " user_name = json.loads(line)['actor']['preferredUsername']\n", 282 | " s.update(user_name,1)\n", 283 | " except ValueError:\n", 284 | " pass\n", 285 | " except KeyError:\n", 286 | " pass\n", 287 | " \n", 288 | " results = []\n", 289 | " for value in reversed(sorted(s.top_k.values())):\n", 290 | " results.append('{1} {0}'.format(str(value[0]),str(value[1])))\n", 291 | " return results" 292 | ], 293 | "language": "python", 294 | "metadata": {}, 295 | "outputs": [], 296 | "prompt_number": 9 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "# instantiate a Sketch object\n", 303 | "s = Sketch(10**-3, 0.1, 10)" 304 | ], 305 | "language": "python", 306 | "metadata": {}, 307 | "outputs": [], 308 | "prompt_number": 10 309 | }, 310 | { 311 | "cell_type": "code", 312 | "collapsed": false, 313 | "input": [ 314 | "f = open('CM_small.json')\n", 315 | "results_CM = sorted(CM_top_users(f,s))\n", 316 | "print(results_CM)" 317 | ], 318 | "language": "python", 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "output_type": "stream", 323 | "stream": "stdout", 324 | "text": [ 325 | "['HPSupport 6', 'user1 6', 'user2 6', 'user4 2']\n" 326 | ] 327 | } 328 | ], 329 | "prompt_number": 13 330 | }, 331 | { 332 | "cell_type": "code", 333 | "collapsed": false, 334 | "input": [ 335 | "for item in zip(results_exact,results_CM):\n", 336 | " print(item)" 337 | ], 338 | "language": "python", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "output_type": "stream", 343 | "stream": "stdout", 344 | "text": [ 345 | "('HPSupport 3', 'HPSupport 6')\n", 346 | "('user1 3', 'user1 6')\n", 347 | "('user2 3', 'user2 6')\n", 348 | "('user4 1', 'user4 2')\n" 349 | ] 350 | } 351 | ], 352 | "prompt_number": 14 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "Is it possible to make the sketchs so coarse that its estimates are wrong even for this data set?\n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "collapsed": false, 364 | "input": [ 365 | "s = Sketch(0.9, 0.9, 10)\n", 366 | "f = open('CM_small.json')\n", 367 | "results_coarse_CM = CM_top_users(f,s)\n", 368 | "print(results_coarse_CM)" 369 | ], 370 | "language": "python", 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "output_type": "stream", 375 | "stream": "stdout", 376 | "text": [ 377 | "['user2 6', 'HPSupport 5', 'user1 3', 'user4 1']\n" 378 | ] 379 | } 380 | ], 381 | "prompt_number": 182 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Yes! (if you try enough) Why? \n", 388 | "\n", 389 | "* The 'w' parameter goes like ceiling(exp(1)/epsilon), which is always >=~ 3.\n", 390 | "* The 'd' parameter goes like ceiling(log(1/delta), which is always >= 1.\n", 391 | "\n", 392 | "So, you're dealing with a space with minimum size 3 x 1. With 10 records, it's possible that all 4 users map their counts to the point. So it's possible to see an estimate as high as 10, in this case." 
393 | ] 394 | }, 395 | { 396 | "cell_type": "heading", 397 | "level": 2, 398 | "metadata": {}, 399 | "source": [ 400 | "Now for a larger data set." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "collapsed": false, 406 | "input": [ 407 | "f = open('CM_large.json')\n", 408 | "%time results_exact = exact_top_users(f)\n", 409 | "print(results_exact)" 410 | ], 411 | "language": "python", 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "output_type": "stream", 416 | "stream": "stdout", 417 | "text": [ 418 | "CPU times: user 18.6 s, sys: 255 ms, total: 18.9 s\n", 419 | "Wall time: 19 s\n", 420 | "['ryuutuu19 39', 'jidousya_ 29', 'food_nourin 27', 'life_style_s 26', 'punpun4 25', 'SportsAB 25', 'fudousankensetu 22', 'TaylorMadeGolf 17', '333_shy 15', 'FuckMica_ 14']\n" 421 | ] 422 | } 423 | ], 424 | "prompt_number": 184 425 | }, 426 | { 427 | "cell_type": "code", 428 | "collapsed": false, 429 | "input": [ 430 | "f = open('CM_large.json')\n", 431 | "s = Sketch(10**-4, 0.001, 10)\n", 432 | "%time results_CM = CM_top_users(f,s)\n", 433 | "print(results_CM)" 434 | ], 435 | "language": "python", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "output_type": "stream", 440 | "stream": "stdout", 441 | "text": [ 442 | "CPU times: user 40.1 s, sys: 222 ms, total: 40.4 s\n", 443 | "Wall time: 40.3 s\n", 444 | "['ryuutuu19 82', 'food_nourin 72', 'life_style_s 70', 'jidousya_ 69', 'SportsAB 67', '333_shy 62', 'fudousankensetu 61', 'punpun4 59', 'TaylorMadeGolf 59', 'FuckMica_ 58']\n" 445 | ] 446 | } 447 | ], 448 | "prompt_number": 185 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "For this precision and dataset size, the CM algo takes _longer_ than the exact solution. How's the accuracy?" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "collapsed": false, 460 | "input": [ 461 | "for item in zip(results_exact,results_CM):\n", 462 | " print(item)\n", 463 | " print item[1] in results_exact and item[0] in results_CM" 464 | ], 465 | "language": "python", 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "output_type": "stream", 470 | "stream": "stdout", 471 | "text": [ 472 | "('ryuutuu19 39', 'yuki_kkth 538')\n", 473 | "False\n", 474 | "('jidousya_ 29', 'georgiaa_wbuu 531')\n", 475 | "False\n", 476 | "('food_nourin 27', '_MahlonDallee 529')\n", 477 | "False\n", 478 | "('life_style_s 26', 'daniel_leparulo 527')\n", 479 | "False\n", 480 | "('punpun4 25', 'Carla_Carpediem 525')\n", 481 | "False\n", 482 | "('SportsAB 25', 'viishthay 523')\n", 483 | "False\n", 484 | "('fudousankensetu 22', 'YOLOfonseca 521')\n", 485 | "False\n", 486 | "('TaylorMadeGolf 17', 'cautiouX 520')\n", 487 | "False\n", 488 | "('333_shy 15', 'Ana_Bia147 520')\n", 489 | "False\n", 490 | "('FuckMica_ 14', 'AdrianaCelis7 520')\n", 491 | "False\n" 492 | ] 493 | } 494 | ], 495 | "prompt_number": 190 496 | }, 497 | { 498 | "cell_type": "code", 499 | "collapsed": false, 500 | "input": [ 501 | "f = open('CM_large.json')\n", 502 | "s = Sketch(10**-3, 0.01, 10)\n", 503 | "%time results_CM = CM_top_users(f,s)\n", 504 | "print(results_CM)" 505 | ], 506 | "language": "python", 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "output_type": "stream", 511 | "stream": "stdout", 512 | "text": [ 513 | "CPU times: user 35.9 s, sys: 254 ms, total: 36.1 s\n", 514 | "Wall time: 36.1 s\n", 515 | "['yuki_kkth 538', 'georgiaa_wbuu 531', '_MahlonDallee 529', 'daniel_leparulo 527', 'Carla_Carpediem 525', 'viishthay 523', 'YOLOfonseca 521', 'cautiouX 520', 'Ana_Bia147 520', 
'AdrianaCelis7 520']\n" 516 | ] 517 | } 518 | ], 519 | "prompt_number": 188 520 | }, 521 | { 522 | "cell_type": "code", 523 | "collapsed": false, 524 | "input": [ 525 | "for item in zip(results_exact,results_CM):\n", 526 | " print(item)\n", 527 | " print item[1] in results_exact and item[0] in results_CM" 528 | ], 529 | "language": "python", 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "output_type": "stream", 534 | "stream": "stdout", 535 | "text": [ 536 | "('ryuutuu19 39', 'yuki_kkth 538')\n", 537 | "False\n", 538 | "('jidousya_ 29', 'georgiaa_wbuu 531')\n", 539 | "False\n", 540 | "('food_nourin 27', '_MahlonDallee 529')\n", 541 | "False\n", 542 | "('life_style_s 26', 'daniel_leparulo 527')\n", 543 | "False\n", 544 | "('punpun4 25', 'Carla_Carpediem 525')\n", 545 | "False\n", 546 | "('SportsAB 25', 'viishthay 523')\n", 547 | "False\n", 548 | "('fudousankensetu 22', 'YOLOfonseca 521')\n", 549 | "False\n", 550 | "('TaylorMadeGolf 17', 'cautiouX 520')\n", 551 | "False\n", 552 | "('333_shy 15', 'Ana_Bia147 520')\n", 553 | "False\n", 554 | "('FuckMica_ 14', 'AdrianaCelis7 520')\n", 555 | "False\n" 556 | ] 557 | } 558 | ], 559 | "prompt_number": 191 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "The most common use of the CM sketch is analysis of streaming data. Why?\n", 566 | "\n", 567 | "* Becasue the data are arriving in real time, the hashing of the inputs is not a bottleneck as it is when the data are already collected.\n", 568 | "* The sketches are associative, meaning that the operation can be parallelized trivially, and the results easily combined in the end.\n", 569 | "\n", 570 | "This will be a primary function of the Enumerator, which will be demo-ed soon." 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "collapsed": false, 576 | "input": [], 577 | "language": "python", 578 | "metadata": {}, 579 | "outputs": [] 580 | } 581 | ], 582 | "metadata": {} 583 | } 584 | ] 585 | } -------------------------------------------------------------------------------- /topic-modeling-101/topic_modeling_part1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:887cbfce4d5ff4f963164c74b13c9cd0f731910a4cb99e031146ed0cb8461d6c" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "\n", 16 | "#Definitions:\n", 17 | "* 1.) document - a body of text (eg. tweet)\n", 18 | "* 2.) text corpus - the set of documents that contains the text for the analysis (eg. many tweets)\n", 19 | "* 3.) dictionary - a mapping between tokens and their integer ids. In other words, the key:value pairs are token:unique_id for each unique token in the text corpus (eg. {'mathematics':1,'engineering':1,'physics':3})\n", 20 | "* 4.) vector corpus - the set of documents transformed such that each token is a tuple (token_id , doc_freq).\n", 21 | "\n", 22 | "\n", 23 | "#Stop words:\n", 24 | "**Topics are limited to tokens contained within the text corpus**. We can remove specific tokens from consideration using a set of stopwords, which can be edited per project requirement. 
\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "collapsed": false, 30 | "input": [ 31 | "from nltk.corpus import stopwords\n", 32 | "import pprint as pp\n", 33 | "\n", 34 | "stopset = set(stopwords.words('english'))\n", 35 | "print type(stopset)\n", 36 | "stopset.update([\"ruby tuesday\"]) # add token\n", 37 | "stopset.remove(\"own\") # remove token\n", 38 | "\n", 39 | "# single lang\n", 40 | "print \"--English stopset\"\n", 41 | "print stopset\n", 42 | "\n", 43 | "# multi lang\n", 44 | "print\n", 45 | "print \"--Multi language stopset\"\n", 46 | "langs=['danish', 'dutch', 'english', 'french', 'german', 'italian','norwegian', 'portuguese', 'russian', 'spanish', 'swedish']\n", 47 | "stop_list = []\n", 48 | "for lang in langs:\n", 49 | " stop_list.extend(stopwords.words(lang))\n", 50 | "\n", 51 | "stop_words_set=set(stop_list) # -- could save to disk --\n", 52 | "print stop_words_set\n" 53 | ], 54 | "language": "python", 55 | "metadata": {}, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "#Text Corpus:\n", 63 | "The text corpus used for the demo is contained in a file with lines of text separated by carriage returns. Each line of text is it's own document. We will use the entire text corpus as our training set to build the dictionary and then remove stopwords; however, the dictionary need not be built from the entire text corpus if a smaller set of documents is sufficient. " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "with open('text_corpus.txt', 'r') as f:\n", 71 | " documents=[]\n", 72 | " for line in f.readlines():\n", 73 | " documents.append(line.strip())\n", 74 | "pp.pprint(documents) " 75 | ], 76 | "language": "python", 77 | "metadata": {}, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#Dictionary:\n", 85 | "Next, we'll create a dictionary from the tokens in the entire text corpus. We're splitting the documents on white space for this demo; however, we'll use regex in later. We'll then remove stopwords and tokens that only appear once in the entire text corpus." 
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "collapsed": false, 91 | "input": [ 92 | "from gensim import corpora, models, similarities\n", 93 | "import logging\n", 94 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename=\"./log/topic-log\")\n", 95 | "logr = logging.getLogger(\"topic_model\")\n", 96 | "logr.info(\"#\"*15 + \" started \" + \"#\"*15)\n", 97 | "\n", 98 | "print \"Dictionary (full text corpus):\"\n", 99 | "dictionary = corpora.Dictionary(line.lower().split() for line in open('text_corpus.txt'))\n", 100 | "print dictionary\n", 101 | "print (dictionary.token2id)\n", 102 | "\n", 103 | "print\n", 104 | "\n", 105 | "print \"Dictionary (removed stopwords and once-ids):\"\n", 106 | "stop_ids = [dictionary.token2id[stopword] for stopword in stop_words_set if stopword in dictionary.token2id]\n", 107 | "\n", 108 | "once_ids = [tokenid for tokenid, corpus_freq in dictionary.dfs.iteritems() if corpus_freq == 1]\n", 109 | "#remove stop_ids,\"+\",once_ids\n", 110 | "dictionary.filter_tokens(bad_ids=stop_ids + once_ids,good_ids=None)\n", 111 | "## consider: dictionary.filter_extremes(no_below=2) \n", 112 | "dictionary.compactify()\n", 113 | "print dictionary\n", 114 | "print (dictionary.token2id)\n" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "We can also **add documents dynamically**, which is a huge advantage for real time data! Notice how the dictionary starts with 12 unique tokens (above) and ends with 25 tokens (below). Also note that we must add a list of lists such that docs=[[doc1],[doc2]...] where doc1 and doc2 are tokenized strings. " 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "import copy\n", 132 | "print \"Add documents to dictionary dynamically:\"\n", 133 | "print \"doc to add = \\\"Pooh bear says, 'People say nothing is impossible, but I do nothing every day.'\\\"\"\n", 134 | "print\n", 135 | "print \"doc tokenized =\",[item for item in \"Pooh bear says 'People say nothing is impossible, but I do nothing every day.'\".lower().split() if item not in stop_ids]\n", 136 | "print\n", 137 | "\n", 138 | "docs=[[item for item in \"Pooh bear says, 'People say nothing is impossible, but I do nothing every day.'\".lower().split() if item not in stop_ids]]\n", 139 | "d=copy.deepcopy(dictionary)\n", 140 | "d.add_documents(docs)\n", 141 | "\n", 142 | "d.compactify()\n", 143 | "\n", 144 | "print \"#NOTE: since we were only splitting on space, the punctuation is included.\"\n", 145 | "print\n", 146 | "print d\n", 147 | "print d.token2id\n", 148 | "\n", 149 | "\n" 150 | ], 151 | "language": "python", 152 | "metadata": {}, 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "#Vectorize:\n", 160 | "Essentially a word frequency for each document is created in this step. Each document in the text corpus will be transformed into list of tuples \n", 161 | "[[(token_id , doc_freq),(token_id , doc_freq),(token_id , doc_freq)] , [(token_id , doc_freq),(token_id , doc_freq)]...]. 
We must iterate through the text corpus to create this set.\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "collapsed": false, 167 | "input": [ 168 | "\n", 169 | "\n", 170 | "vector_corpus=[]\n", 171 | "with open('text_corpus.txt', 'r') as f:\n", 172 | " for line in f.readlines():\n", 173 | " vector_corpus.append(dictionary.doc2bow(line.lower().split()))\n", 174 | "\n", 175 | "print \"Vector corpus:\"\n", 176 | "pp.pprint(vector_corpus)\n", 177 | "counter=0\n", 178 | "print dictionary\n", 179 | "\n", 180 | "# save to disk\n", 181 | "corpora.MmCorpus.serialize('vector_corpus.mm', vector_corpus)\n", 182 | "serialized_corpus = corpora.MmCorpus('vector_corpus.mm')\n", 183 | "pp.pprint(list(serialized_corpus))" 184 | ], 185 | "language": "python", 186 | "metadata": {}, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "# [TfIdf](http://radimrehurek.com/gensim/models/tfidfmodel.html):\n", 194 | "The TfIdf, term frequency inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. The TfIdf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to control for the fact that some words are generally more common than others. \n", 195 | "\n", 196 | "To train a model using TfIdf, you first need to go through the corpus once and copute doc frequencies, which we already did above. \n", 197 | "\n", 198 | "Typically, the TfIdf weight is composed by two terms: the first computes the normalized term frequency, which is the number of times a word appears in a document, divided by the total number of words in that document; the second term is the inverse document frequency (idf), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.\n", 199 | "\n", 200 | "$$TfIdf = {token\\ frequency\\ in\\ doc} * \\ln(\\frac{total\\ docs\\ in\\ corpus}{total\\ docs\\ w/\\ token})$$\n", 201 | "\n", 202 | "Tf: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization.\n", 203 | "\n", 204 | "\n", 205 | "Idf: Inverse Document Frequency, which measures how important a term is. While computing Tf, all terms are considered equally important. However it is known that certain terms, such as \"is\", \"of\", and \"that\", may appear a lot of times but have little importance. 
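
(A brief numeric instance of the TfIdf formula stated above, with made-up counts, before the Idf discussion continues: a token that appears 3 times in a document and occurs in 2 documents of a 4-document corpus. Note that gensim's TfidfModel may use a different log base and apply normalization, so its values will not match this toy calculation exactly.)

    import math

    # tfidf = (token frequency in doc) * ln(total docs in corpus / docs containing the token)
    total_docs = 4        # hypothetical corpus size
    docs_with_token = 2   # hypothetical number of documents containing the token
    tf_in_doc = 3         # hypothetical count of the token within one document

    tfidf = tf_in_doc * math.log(float(total_docs) / docs_with_token)
    print(tfidf)          # 3 * ln(2) = 2.079...
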
Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following: \n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "collapsed": false, 211 | "input": [ 212 | "from gensim import corpora, models, similarities\n", 213 | "tfidf = models.TfidfModel(vector_corpus, normalize=False) # trains the model \n", 214 | "print tfidf\n", 215 | "corpus_tfidf=tfidf[vector_corpus]\n", 216 | "print (dictionary.token2id)\n", 217 | "for doc in corpus_tfidf:\n", 218 | " print doc\n", 219 | "# tfidf = ( term frequency) * (inverse document frequency) \n", 220 | "\n", 221 | "# tfidf = (# of instances of word in single doc / # of words in single doc) * ln(# of total documents / # of docs in which word appears) = tfidf\n", 222 | "\n", 223 | "# the tfidf matrix can be used to convert any vector ( uniq id, count per doc ) to ( uniq id, tfidf score )\n" 224 | ], 225 | "language": "python", 226 | "metadata": {}, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "#Latent Dirichlet Allocation vs Latent Semantic Indexing ---improve explanation---\n", 234 | "\n", 235 | "Inerteresting perspective: [link](http://stats.stackexchange.com/questions/32310/topic-models-and-word-co-occurrence-methods)\n", 236 | "\n", 237 | "\"I will just describe four milestones/popular models and their advantages/disadvantages and thus highlight (some of) the main differences (or at least what I think are the main/most important differences).\n", 238 | "\n", 239 | "The \"easiest\" approach, which would be to cluster the documents by matching them against a predefined query of terms (as in PMI). These lexical matching methods however might be inaccurate due to polysemy (multiple meanings) and synonymy (multiple words that have similar meanings) of single terms.\n", 240 | "\n", 241 | "As a remedy, latent semantic indexing (LSI) tries to overcome this by mapping terms and documents into a latent semantic space via a singular value decomposition. The LSI results are more robust indicators of meaning than individual terms would be. However, one drawback of LSI is that it lacks in terms of solid probabilistic foundation.\n", 242 | "\n", 243 | "This was partly solved by the invention of probabilistic LSI (pLSI). In pLSI models each word in a document is drawn from a mixture model specified via multinomial random variables (which also allows higher-order co-occurences as @sviatoslav hong mentioned). This was an important step forward in probabilistic text modeling, but was incomplete in the sense that it offers no probabilistic structure at the level of documents.\n", 244 | "\n", 245 | "Latent Dirichlet Allocation (LDA) alleviates this and was the first fully probabilistic model for text clustering. Blei et al. (2003) show that pLSI is a maximum a-posteriori estimated LDA model under a uniform Dirichlet prior.\n", 246 | "\n", 247 | "Note that the models mentioned above (LSI, pLSI, LDA) have in common that they are based on the \u201cbag-of-words\u201d assumption - i.e. that within a document, words are exchangeable, i.e. the order of words in a document can be neglected. This assumption of exchangeability offers a further justification for LDA over the other approaches: Assuming that not only words within documents are exchangeable, but also documents, i.e., the order of documents within a corpus can be neglected, De Finetti's theorem states that any set of exchangeable random variables has a representation as a mixture distribution. 
Thus if exchangeability for documents and words within documents is assumed, a mixture model for both is needed. Exactly this is what LDA generally achieves but PMI or LSI do not (and even pLSI not as beautiful as LDA).\"\n", 248 | "\n", 249 | "\\----\n", 250 | "\n", 251 | "The LSI process transforms documents from TfIdf-weighted space into a latent space of a lower dimensionality.\n", 252 | "\n", 253 | "LDA can be explained using plate notation. The boxes are \u201cplates\u201d representing replicates. The outer plate represents documents, while the inner plate represents the repeated choice of topics and words within a document. \n", 254 | "* M denotes the number of documents\n", 255 | "* N the number of words in a document\n", 256 | "* \u03b1 is the parameter of the Dirichlet prior on the per-document topic distributions\n", 257 | "* \u03b2 is the parameter of the Dirichlet prior on the per-topic word distribution\n", 258 | "* $\\theta_i$ is the topic distribution for document i\n", 259 | "* $\\phi_k$is the word distribution for topic k\n", 260 | "* z_{ij} is the topic for the jth word in document i\n", 261 | "* w_{ij} is the specific word.\n", 262 | "\n", 263 | "\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "collapsed": false, 269 | "input": [ 270 | "from itertools import *\n", 271 | "number_of_clusters=3\n", 272 | "lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_of_clusters) # initialize an LSI transformation\n", 273 | "lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=number_of_clusters,\\\n", 274 | " update_every=1, chunksize=10000, passes=1)\n", 275 | "corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi\n", 276 | "corpus_lda = lda[corpus_tfidf] \n", 277 | "# for item in corpus_lsi:\n", 278 | "# print (item)\n", 279 | "print \"-\"*10+\"LDA\"+\"-\"*10\n", 280 | "t=0\n", 281 | "for t, item in enumerate(lda.print_topics(number_of_clusters)):\n", 282 | " print \"topic#{0}: {1}\".format(t,item)\n", 283 | "print\n", 284 | "for item in corpus_lda:\n", 285 | " print item\n", 286 | " #print lsi.show_topics()\n", 287 | "#print lsi.print_topic(0,topn=1)\n", 288 | "# save to disk\n", 289 | "#print lsi.projection.s\n", 290 | "#lsi.save('corpus_lsi.lsi')\n", 291 | "#lsi=models.LsiModel.load\n", 292 | "print \n", 293 | "print \n", 294 | "#models.lsimodel.clip_spectrum(0.1,4,discard=0.001)\n", 295 | "\n", 296 | "# Find the threshold, let's set the threshold to be 1/#clusters,\n", 297 | "# To prove that the threshold is sane, we average the sum of all probabilities:\n", 298 | "scores = list(chain(*[[score for topic,score in topic] \\\n", 299 | " for topic in [doc for doc in corpus_lda]]))\n", 300 | "threshold = sum(scores)/len(scores)\n", 301 | "print \"threshold:\",threshold\n", 302 | "print\n", 303 | "cluster1 = [j for i,j in zip(corpus_lda,documents) if i[0][1] > threshold]\n", 304 | "cluster2 = [j for i,j in zip(corpus_lda,documents) if i[1][1] > threshold]\n", 305 | "cluster3 = [j for i,j in zip(corpus_lda,documents) if i[2][1] > threshold]\n", 306 | "\n", 307 | "print \"topic#0: {0}\".format(cluster1)\n", 308 | "print \"topic#1: {0}\".format(cluster2)\n", 309 | "print \"topic#2: {0}\".format(cluster3)\n", 310 | "\n", 311 | "print \n", 312 | "print\n", 313 | "print \"-\"*10+\"LSI\"+\"-\"*10\n", 314 | "t=0\n", 315 | "for t, item in enumerate(lsi.print_topics(number_of_clusters)):\n", 316 | " print \"topic#{0}: {1}\".format(t,item)\n", 317 | "print\n", 318 | "\n", 319 | "for 
item in corpus_lsi:\n", 320 | " print item\n", 321 | " #print lsi.show_topics()\n", 322 | "#print lsi.print_topic(0,topn=1)\n", 323 | "# save to disk\n", 324 | "#print lsi.projection.s\n", 325 | "#lsi.save('corpus_lsi.lsi')\n", 326 | "#lsi=models.LsiModel.load\n", 327 | "print \n", 328 | "print \n", 329 | "#models.lsimodel.clip_spectrum(0.1,4,discard=0.001)\n", 330 | "\n", 331 | "# Find the threshold, let's set the threshold to be 1/#clusters,\n", 332 | "# To prove that the threshold is sane, we average the sum of all probabilities:\n", 333 | "scores = list(chain(*[[score for topic,score in topic] \\\n", 334 | " for topic in [doc for doc in corpus_lsi]]))\n", 335 | "threshold = sum(scores)/len(scores)\n", 336 | "print \"threshold:\",threshold\n", 337 | "print\n", 338 | "\n", 339 | "cluster1 = [j for i,j in zip(corpus_lsi,documents) if i[0][1] > threshold]\n", 340 | "cluster2 = [j for i,j in zip(corpus_lsi,documents) if i[1][1] > threshold]\n", 341 | "cluster3 = [j for i,j in zip(corpus_lsi,documents) if i[2][1] > threshold]\n", 342 | "\n", 343 | "print \"topic#1: {0}\".format(cluster1)\n", 344 | "print \"topic#2: {0}\".format(cluster2)\n", 345 | "print \"topic#3: {0}\".format(cluster3)\n", 346 | "\n", 347 | "\n" 348 | ], 349 | "language": "python", 350 | "metadata": {}, 351 | "outputs": [] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "collapsed": false, 356 | "input": [ 357 | "\n", 358 | "\n", 359 | "#play space\n", 360 | "\n" 361 | ], 362 | "language": "python", 363 | "metadata": {}, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "#LDA\n", 371 | "" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "Inerteresting perspective: [link](http://stackoverflow.com/questions/10628262/inserting-image-into-ipython-notebook-markdown)\n", 379 | "\n", 380 | "Recently, a huge body of literature discussing how to extract information from written text has grown. Hence I will just describe four milestones/popular models and their advantages/disadvantages and thus highlight (some of) the main differences (or at least what I think are the main/most important differences).\n", 381 | "\n", 382 | "You mention the \"easiest\" approach, which would be to cluster the documents by matching them against a predefined query of terms (as in PMI). These lexical matching methods however might be inaccurate due to polysemy (multiple meanings) and synonymy (multiple words that have similar meanings) of single terms.\n", 383 | "\n", 384 | "As a remedy, latent semantic indexing (LSI) tries to overcome this by mapping terms and documents into a latent semantic space via a singular value decomposition. The LSI results are more robust indicators of meaning than individual terms would be. However, one drawback of LSI is that it lacks in terms of solid probabilistic foundation.\n", 385 | "\n", 386 | "This was partly solved by the invention of probabilistic LSI (pLSI). In pLSI models each word in a document is drawn from a mixture model specified via multinomial random variables (which also allows higher-order co-occurences as @sviatoslav hong mentioned). This was an important step forward in probabilistic text modeling, but was incomplete in the sense that it offers no probabilistic structure at the level of documents.\n", 387 | "\n", 388 | "Latent Dirichlet Allocation (LDA) alleviates this and was the first fully probabilistic model for text clustering. Blei et al. 
(2003) show that pLSI is a maximum a-posteriori estimated LDA model under a uniform Dirichlet prior.\n", 389 | "\n", 390 | "Note that the models mentioned above (LSI, pLSI, LDA) have in common that they are based on the \u201cbag-of-words\u201d assumption - i.e. that within a document, words are exchangeable, i.e. the order of words in a document can be neglected. This assumption of exchangeability offers a further justification for LDA over the other approaches: Assuming that not only words within documents are exchangeable, but also documents, i.e., the order of documents within a corpus can be neglected, De Finetti's theorem states that any set of exchangeable random variables has a representation as a mixture distribution. Thus if exchangeability for documents and words within documents is assumed, a mixture model for both is needed. Exactly this is what LDA generally achieves but PMI or LSI do not (and even pLSI not as beautiful as LDA)." 391 | ] 392 | } 393 | ], 394 | "metadata": {} 395 | } 396 | ] 397 | } -------------------------------------------------------------------------------- /pandas-101/data/twitter_sample.csv: -------------------------------------------------------------------------------- 1 | tag:search.twitter.com,2005:351835317671690241|2013-07-01T22:50:51.000Z|kavga edelim ama konuşalım|None|None|None|['tr']|en|tr|None|None|['[25.663883, 35.817497]', '[25.663883, 42.109993]', '[44.822762, 42.109993]', '[44.822762, 35.817497]']|Polygon|Türkiye|TR|7200|None|None|None|None|None|None|None|None|None|Rümeysa Özdemir|uykugibisiyok|248312738|35|178|129|0|2028|Tweet|None|None|None 2 | tag:search.twitter.com,2005:351835317604593666|2013-07-01T22:50:51.000Z|@shane_joersz wooooow|None|None|None|['en']|en|es|[47.29088246, -101.0379045]|Point|['[-101.043785, 47.275933]', '[-101.043785, 47.306601]', '[-101.01285, 47.306601]', '[-101.01285, 47.275933]']|Polygon|Washburn, ND|US|-21600|sevenohone|None|None|None|None|None|None|None|None|cori▲alex.|CoBerg_|48025164|32|144|215|0|4071|Reply|http://twitter.com/shane_joersz/statuses/351828999086940160|None|None 3 | tag:search.twitter.com,2005:351835317747191808|2013-07-01T22:50:51.000Z|お前との肌のふれあいなんぞ求めてない。自重しろ。|None|None|None|['ja']|en|ja|[35.70675048, 139.84273005]|Point|['[139.8332175, 35.6345694444444]', '[139.8332175, 35.7507544444444]', '[139.919876666667, 35.7507544444444]', '[139.919876666667, 35.6345694444444]']|Polygon|江戸川区, 東京都|JP|-36000|ちば|None|None|None|None|None|None|None|None|黒い恋人|yamasyoyamasyo|217987801|18|37|54|0|3505|Tweet|None|None|None 4 | tag:search.twitter.com,2005:351835317608792064|2013-07-01T22:50:51.000Z|@Gabo_navoficial yo tambien creo en ti mi charro bello:))|None|None|None|['en']|en|es|None|None|['[-80.248663, 25.986366]', '[-80.248663, 26.093192]', '[-80.102066, 26.093192]', '[-80.102066, 25.986366]']|Polygon|Hollywood, FL|US|-14400|hollywood florida|None|None|None|None|None|None|None|None|MARIA|maria_e_pena|461188787|50|438|174|1|17636|Reply|http://twitter.com/Gabo_navoficial/statuses/351835075786186752|None|None 5 | tag:search.twitter.com,2005:351835317755592705|2013-07-01T22:50:51.000Z|только ты об этом не знаешь... 
http://t.co/MOH8pcKyJY|['http://twitter.com/ElkaAlb/status/351835317755592705/photo/1']|None|None|['ru']|en|ru|None|None|['[23.179216999999998, 51.2626423]', '[23.179216999999998, 56.1717339]', '[32.794200000000004, 56.1717339]', '[32.794200000000004, 51.2626423]']|Polygon|Belarus|BY|None|None|None|None|None|None|None|None|None|None|Элька Алб|ElkaAlb|1433828712|21|12|6|0|145|Tweet|None|None|None 6 | tag:search.twitter.com,2005:351835317801730048|2013-07-01T22:50:51.000Z|I'm at Büyükçekmece Sahil w/ @emineetrk http://t.co/30BZ8dBzxL|['http://test.gnip.com/mock']|['http://t.co/30BZ8dBzxL']|['http://4sq.com/1cJD0J4']|['tr']|en|tr|[41.01520298, 28.59359264]|Point|['[25.663883, 35.817497]', '[25.663883, 42.109993]', '[44.822762, 42.109993]', '[44.822762, 35.817497]']|Polygon|Türkiye|TR|-18000|FENERBAHÇE TARAFTARINDIR!|None|None|None|None|None|None|None|None|Özgür Ayı|curva1907|361352064|41|226|346|0|7759|Tweet|None|None|None 7 | tag:search.twitter.com,2005:351835317554257920|2013-07-01T22:50:51.000Z|Dile Al Amor >>>|None|None|None|['en']|en|pt|[47.4355124, -120.3279417]|Point|['[-120.364921, 47.396859]', '[-120.364921, 47.458797]', '[-120.29258, 47.458797]', '[-120.29258, 47.396859]']|Polygon|Wenatchee, WA|US|-28800|Wenatchee, Washington|None|None|None|None|None|None|None|None|Mailari ♥|MailariMunoz|30740676|42|247|64|1|2438|Tweet|None|None|None 8 | tag:search.twitter.com,2005:351835318552506369|2013-07-01T22:50:51.000Z|Bağıra bağıra şarkı söylemek istiyoruum|None|None|None|['tr']|en|tr|[40.8025861, 29.4308738]|Point|None|None|None|None|-18000|None|None|None|None|None|None|None|None|None|Yağmur|ygmreroglu|349059450|40|304|233|0|3558|Tweet|None|None|None 9 | tag:search.twitter.com,2005:351835318028222465|2013-07-01T22:50:51.000Z|@pafcdan Aww good! X|None|None|None|['en']|en|en|[50.3582949, -4.0930973]|Point|['[-4.209496, 50.3320883]', '[-4.209496, 50.444179]', '[-4.019642999999999, 50.444179]', '[-4.019642999999999, 50.3320883]']|Polygon|Plymouth, Plymouth|GB|0|Plymouth|None|None|None|None|None|None|None|None|Becky ❤|BeckyLou90x|26568635|38|380|860|0|3116|Reply|http://twitter.com/pafcdan/statuses/351834457453502466|None|None 10 | tag:search.twitter.com,2005:351835318346981377|2013-07-01T22:50:51.000Z|Newest hobby: sending videos back and forth of us listening to screamo @kakessinger|None|None|None|['en']|en|en|[29.54588264, -95.10669141]|Point|['[-95.145152, 29.506973]', '[-95.145152, 29.555093]', '[-95.093693, 29.555093]', '[-95.093693, 29.506973]']|Polygon|Webster, TX|US|None|Houston, Texas|None|None|None|None|None|None|None|None|lil beyoncé|beezlebrat|541953037|41|160|135|0|4170|Tweet|None|None|None 11 | tag:search.twitter.com,2005:351835318044983298|2013-07-01T22:50:51.000Z|@DiegoSanRoman no y no me gusta.|None|None|None|['en']|en|es|[19.34964, -99.19341]|Point|['[-99.32437499999999, 19.232228]', '[-99.32437499999999, 19.403855999999998]', '[-99.1717625, 19.403855999999998]', '[-99.1717625, 19.232228]']|Polygon|Álvaro Obregón, Distrito Federal|MX|-28800|None|None|None|None|None|None|None|None|None|Carlos Hermosillo|CHermosillo_FOX|1160945754|62|11873|69|56|1991|Reply|http://twitter.com/DiegoSanRoman/statuses/351530105224445953|None|None 12 | tag:search.twitter.com,2005:351835318024028161|2013-07-01T22:50:51.000Z|~FINALLY OFF OF WORK~|None|None|None|['en']|en|en|[32.86107, -83.72062]|Point|['[-83.739741, 32.765651]', '[-83.739741, 32.899785]', '[-83.548632, 32.899785]', '[-83.548632, 32.765651]']|Polygon|Macon, GA|US|None|None|None|None|None|None|None|None|None|None|Tomeisha 
Cross|MISSMEISHA223|242505369|18|290|683|0|540|Tweet|None|None|None 13 | tag:search.twitter.com,2005:351835318497980416|2013-07-01T22:50:51.000Z|Rubbing/massaging my feet is like giving me head... Either one'l do|None|None|None|['en']|en|en|[33.68890281, -84.27369741]|Point|['[-85.605165, 30.355756999999997]', '[-85.605165, 35.000659]', '[-80.751429, 35.000659]', '[-80.751429, 30.355756999999997]']|Polygon|Georgia, US|US|-18000|Atl...|None|None|None|None|None|None|None|None|Poison †|shanbonita|29619102|64|40543|116|486|60465|Tweet|None|None|None 14 | tag:search.twitter.com,2005:351835318376337408|2013-07-01T22:50:51.000Z|Felicidadees @MAngele34977437! Pasatelo muy bienn en tu cumpleaños, un beso :)|None|None|None|['es']|en|es|[36.33657055, -5.31086285]|Point|['[-5.3559695, 36.325703]', '[-5.3559695, 36.5201365]', '[-5.2058175, 36.5201365]', '[-5.2058175, 36.325703]']|Polygon|Casares, Málaga|ES|None|None|None|None|None|None|None|None|None|None|Marcos Berenguer|Marcos7BB|716170291|41|162|155|0|303|Tweet|None|None|None 15 | tag:search.twitter.com,2005:351835318837710849|2013-07-01T22:50:51.000Z|Que ascooooo, tengo paladar:(|None|None|None|['es']|en|es|[21.0214059, -89.6096588]|Point|['[-89.798197, 20.695104]', '[-89.798197, 21.186965999999998]', '[-89.44902499999999, 21.186965999999998]', '[-89.44902499999999, 20.695104]']|Polygon|Mérida, Yucatán|MX|None|None|None|None|None|None|None|None|None|None|Cinthia azcorra|CinthiaAzcorra|389125134|41|172|115|0|7528|Tweet|None|None|None 16 | tag:search.twitter.com,2005:351835318724468737|2013-07-01T22:50:51.000Z|@Hayley_Brownn Same!!!! Very sad times😞😞😞😞|None|None|None|['en']|en|en|[54.6884467, -3.51776393]|Point|['[-3.580063, 54.454108]', '[-3.580063, 54.964178999999994]', '[-2.9829719999999993, 54.964178999999994]', '[-2.9829719999999993, 54.454108]']|Polygon|Allerdale, Cumbria|GB|-36000|None|None|None|None|None|None|None|None|None|Karl Hine|KarlHine|402334657|42|417|259|1|5242|Reply|http://twitter.com/Hayley_Brownn/statuses/351824983447711746|None|None 17 | tag:search.twitter.com,2005:351835318896439296|2013-07-01T22:50:51.000Z|Jgc|None|None|None|['fr']|en|und|[45.7111863, 4.8557398]|Point|['[4.836346, 45.6805901]', '[4.836346, 45.7188012]', '[4.8666217, 45.7188012]', '[4.8666217, 45.6805901]']|Polygon|Saint-Fons, Rhône|FR|3600|Lyon|None|None|None|None|None|None|None|None|Lisa jtm(Alex Karev)|thatsniall__|564585123|53|1179|628|8|20141|Tweet|None|None|None 18 | tag:search.twitter.com,2005:351835318745436163|2013-07-01T22:50:51.000Z|@MiriamRC6 vooy pisando tan fuerte que voy dejando grietas :333|None|None|None|['es']|en|es|[41.9016077, -8.8703524]|Point|['[-8.8881084, 41.8674229]', '[-8.8881084, 41.9419586]', '[-8.8292131, 41.9419586]', '[-8.8292131, 41.8674229]']|Polygon|A Guarda, Pontevedra|ES|7200|A GUARDA.|None|None|None|None|None|None|None|None|• LAURIS VERGARA ∞|LauurisVergara|526681547|33|231|216|1|11271|Reply|http://twitter.com/MiriamRC6/statuses/351835111869784064|None|None 19 | tag:search.twitter.com,2005:351835318711889920|2013-07-01T22:50:51.000Z|התמימה היא הרעה מפגרים|None|None|None|['he']|en|he|[32.01775251, 34.75457897]|Point|None|None|None|None|None|None|None|None|None|None|None|None|None|None|✩KⓄℜÅℒ OℏÅⓨoℕ✩|korall_ohayon|1318589041|37|151|284|0|641|Tweet|None|None|None 20 | tag:search.twitter.com,2005:351835319085187072|2013-07-01T22:50:51.000Z|@Michael5SOS MIKEY I LOVE U|None|None|None|['it']|en|en|[43.82758701, 12.99396517]|Point|['[12.9010202, 43.7535429]', '[12.9010202, 43.8816564]', '[13.1336738, 43.8816564]', '[13.1336738, 
43.7535429]']|Polygon|Fano, Pesaro e Urbino|IT|3600|♡|None|None|None|None|None|None|None|None|meconio.|liamsfaith|382824235|44|5589|2200|17|51769|Tweet|None|None|None 21 | tag:search.twitter.com,2005:351835319315857408|2013-07-01T22:50:51.000Z|BONITO UNFOLLOW,ZORRA.|None|None|None|['es']|en|it|None|None|['[-18.1606948, 27.6377504]', '[-18.1606948, 28.8578067]', '[-16.1193629, 28.8578067]', '[-16.1193629, 27.6377504]']|Polygon|Santa Cruz de Tenerife, Islas Canarias|ES|3600|None|None|None|None|None|None|None|None|None|Robyn Rihanna Fenty|ConSdeSammy|554205628|46|1999|293|27|60236|Tweet|None|None|None 22 | tag:search.twitter.com,2005:351835317956902912|2013-07-01T22:50:51.000Z|és uma merda|None|None|None|['pt']|en|pt|None|None|['[-8.7583395, 40.5284833]', '[-8.7583395, 40.7275539]', '[-8.520973, 40.7275539]', '[-8.520973, 40.5284833]']|Polygon|Aveiro, Aveiro|PT|None|Aveiro|None|None|None|None|None|None|None|None|Mafalda Azevedo|MafaldaAzevedo9|1316723420|43|137|89|0|4047|Tweet|None|None|None 23 | tag:search.twitter.com,2005:351835318024011776|2013-07-01T22:50:51.000Z|Laku yam, aku mau beli cawan. Simpen dulu ya.. Cawanku pecah lagi :'( RT @rizkaNHS: @alprak_farmasi (cont) http://t.co/viW5n188Ep|['http://test.gnip.com/mock']|['http://t.co/viW5n188Ep']|['http://tl.gd/m5rb8k']|['en']|en|id|[-7.02039592, 110.48798927]|Point|['[110.450289, -7.04064]', '[110.450289, -6.978231999999999]', '[110.507375, -6.978231999999999]', '[110.507375, -7.04064]']|Polygon|Padurungan, Kota Semarang|ID|None|ÜT: -7.01624,110.48594|None|None|None|None|None|None|None|None|Novita Anggraini|phieanggra|616193946|43|258|302|0|2732|Tweet|None|None|None 24 | tag:search.twitter.com,2005:351835318162427904|2013-07-01T22:50:51.000Z|@nazkoklu1 tamam mutlaka haber ver ben tum temmuz bodrumdayim :D|None|None|None|['en']|en|tr|None|None|['[25.663883, 35.817497]', '[25.663883, 42.109993]', '[44.822762, 42.109993]', '[44.822762, 35.817497]']|Polygon|Turkey|TR|7200|GALATASARAY|None|None|None|None|None|None|None|None|Helin|helindundar|45421764|None|349|244|1|12236|Reply|http://twitter.com/nazkoklu1/statuses/351835169587605504|None|None 25 | tag:search.twitter.com,2005:351835318376345600|2013-07-01T22:50:51.000Z|@secutedame nie ignoruj mnie noooo|None|None|None|['en']|en|pl|[43.36999893, -80.98223114]|Point|['[-81.44303699999999, 43.20348]', '[-81.44303699999999, 43.844643999999995]', '[-80.735773, 43.844643999999995]', '[-80.735773, 43.20348]']|Polygon|Perth, Ontario|CA|-21600|secutedame maturegomez|None|None|None|None|None|None|None|None|kinia|vansonselena|1041817357|48|2037|1984|1|23240|Tweet|None|None|None 26 | tag:search.twitter.com,2005:351835318682525697|2013-07-01T22:50:51.000Z|A Very Happy Great Gramma. 
#picoftheday #family #instafam #aww #313 @ The Broadcast Booth http://t.co/DZk6HN22r1|['http://test.gnip.com/mock']|['http://t.co/DZk6HN22r1']|['http://instagram.com/p/bPaB-FtSpQ/']|['en']|en|en|[42.25865115, -83.20823908]|Point|['[-83.231911, 42.222783]', '[-83.231911, 42.301161]', '[-83.18364, 42.301161]', '[-83.18364, 42.222783]']|Polygon|Allen Park, MI|US|-18000|PCB/313|None|None|None|None|None|None|None|None|Kristopher Alan|KristopherAlan|27737035|32|342|2001|0|468|Tweet|None|None|None 27 | tag:search.twitter.com,2005:351835318602842113|2013-07-01T22:50:51.000Z|I always mess things up fuck.|None|None|None|['en']|en|en|[40.17004533, -87.64867445]|Point|['[-87.665219, 40.103392]', '[-87.665219, 40.20446]', '[-87.531699, 40.20446]', '[-87.531699, 40.103392]']|Polygon|Danville, IL|US|-18000|d-ville ✌|None|None|None|None|None|None|None|None|geneva ✨|seper4tion|482102969|40|177|77|0|5581|Tweet|None|None|None 28 | tag:search.twitter.com,2005:351835318619607043|2013-07-01T22:50:51.000Z|David Luiz, me come|None|None|None|['pt']|en|pt|None|None|['[-43.795449, -23.083019999999998]', '[-43.795449, -22.749043999999998]', '[-43.099381, -22.749043999999998]', '[-43.099381, -23.083019999999998]']|Polygon|Rio de Janeiro, Rio de Janeiro|BR|-10800|Rio de Janeiro|None|None|None|None|None|None|None|None|Buba Aguiar|Buba_polvilho|88753650|38|231|352|0|15233|Tweet|None|None|None 29 | tag:search.twitter.com,2005:351835317864628224|2013-07-01T22:50:51.000Z|O meu ninfa girava e eu nem Creditava q aquilo era tao bom , minha juventude brotava e eu me apaixonava pela mina e por meu som 🎶|None|None|None|['pt']|en|pt|[-6.07340332, -49.89367576]|Point|['[-51.198932, -6.623769999999999]', '[-51.198932, -5.8535479]', '[-49.732423, -5.8535479]', '[-49.732423, -6.623769999999999]']|Polygon|Parauapebas, Pará|BR|-10800|:)|None|None|None|None|None|None|None|None|romullo araujo |romulloar7|220404906|47|421|345|0|15986|Tweet|None|None|None 30 | tag:search.twitter.com,2005:351835318577676288|2013-07-01T22:50:51.000Z|Tanrow ►► "@Artjie_Wisnoe: Di liat dari tejongnya :p RT "@d_wisnu: « Mafhum insan sikoh.. 
"@Artjie_Wisnoe: Maf'ul > "@d_wisnu: Bondett ►►|None|None|None|['en']|en|id|[-6.26235, 107.00444]|Point|['[106.974561, -6.301652]', '[106.974561, -6.20717]', '[107.048951, -6.20717]', '[107.048951, -6.301652]']|Polygon|Bekasi Timur, Bekasi|ID|-28800|ÜT: -6.26033,107.01062|None|None|None|None|None|None|None|None|Wisnu Sumantri|d_wisnu|243113979|33|404|378|0|9181|Tweet|None|None|None 31 | tag:search.twitter.com,2005:351835318443446273|2013-07-01T22:50:51.000Z|"If it's in my cup, it's in MY cup, so don't fucking worry about it"|None|None|None|['en']|en|en|[41.16183828, -81.42191721]|Point|['[-81.489745, 41.135556]', '[-81.489745, 41.202973]', '[-81.391931, 41.202973]', '[-81.391931, 41.135556]']|Polygon|Stow, OH|US|-21600|The Clouds|None|None|None|None|None|None|None|None|Jacob King|JacobviaKing|379819371|33|243|290|0|6296|Tweet|None|None|None 32 | tag:search.twitter.com,2005:351835317898199040|2013-07-01T22:50:51.000Z|@Verity97 just kiddin u bootiful dis u > 👰|None|None|None|['en']|en|en|[53.54338422, -1.4962159]|Point|['[-1.822589, 53.438297]', '[-1.822589, 53.612815]', '[-1.275750999999999, 53.612815]', '[-1.275750999999999, 53.438297]']|Polygon|Barnsley, Barnsley|GB|None|None|None|None|None|None|None|None|None|None|beth field|_bethfieldx|601030480|31|112|239|0|526|Reply|http://twitter.com/Verity97/statuses/351834475501588480|None|None 33 | tag:search.twitter.com,2005:351835318619607042|2013-07-01T22:50:51.000Z|Чªª"̮ allah skit ßપnğεε†‎|None|None|None|['id']|en|id|[-6.22595, 106.95018]|Point|['[106.898751, -6.2634]', '[106.898751, -6.214894999999999]', '[106.9658993, -6.214894999999999]', '[106.9658993, -6.2634]']|Polygon|Duren Sawit, Jakarta Timur|ID|None|None|None|None|None|None|None|None|None|None|Ocha|Febbymimie|1089309390|15|18|120|0|84|Tweet|None|None|None 34 | tag:search.twitter.com,2005:351835318225346560|2013-07-01T22:50:51.000Z|ㅤ ㅤ ㅤ ㅤㅤ ㅤ ㅤ ㅤㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤㅤ ㅤ ㅤ ㅤㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ ㅤ|None|None|None|['en']|en|ko|[43.08017289, -82.4778248]|Point|['[-82.518821, 42.999512]', '[-82.518821, 43.083635]', '[-82.435712, 43.083635]', '[-82.435712, 42.999512]']|Polygon|Fort Gratiot, MI|US|-18000|Michigan|None|None|None|None|None|None|None|None|Kyle Landry|kylelandry11|1016681408|39|170|120|0|1094|Tweet|None|None|None 35 | tag:search.twitter.com,2005:351835318254706688|2013-07-01T22:50:51.000Z|Girls date bad boys because they think they can fix them, but at the end of the day...you can't get rid of a face tattoo. ha #girlcode #mtv|None|None|None|['en']|en|en|[30.41249811, -86.49854729]|Point|['[-86.515789, 30.378665]', '[-86.515789, 30.417791]', '[-86.397207, 30.417791]', '[-86.397207, 30.378665]']|Polygon|Destin, FL|US|-25200|Ft. 
Benning|None|None|None|None|None|None|None|None|Kelly Haight|kellhaight|125565884|57|192|235|3|4073|Tweet|None|None|None 36 | tag:search.twitter.com,2005:351835318112096256|2013-07-01T22:50:51.000Z|@jono1006 not this week dude il go out the week after, after the party of your interested lad?|None|None|None|['en']|en|en|[52.55744511, -2.08298706]|Point|['[-2.206884, 52.543946999999996]', '[-2.206884, 52.637907999999996]', '[-2.048029, 52.637907999999996]', '[-2.048029, 52.543946999999996]']|Polygon|Wolverhampton, Wolverhampton|GB|0|None|None|None|None|None|None|None|None|None|Stefano|Stefan4LFC|231244313|36|153|300|0|3923|Reply|http://twitter.com/jono1006/statuses/351834266478452736|None|None 37 | tag:search.twitter.com,2005:351835318602825729|2013-07-01T22:50:51.000Z|@power1063 PLEASE Pick me to Meet @justinbieber if Justin didn't give up on his dreams why should I give up on my dream #BIEBERPOWER (43)|None|None|None|['en']|en|en|[41.86511389, -87.69583348]|Point|['[-87.940101, 41.643919]', '[-87.940101, 42.023135]', '[-87.523661, 42.023135]', '[-87.523661, 41.643919]']|Polygon|Chicago, IL|US|-28800|Justin's wife #BelieveTour ♔|None|None|None|None|None|None|None|None|#HeartBreaker|BiebaholicFever|235436513|39|1752|2000|2|21618|Tweet|None|None|None 38 | tag:search.twitter.com,2005:351835318674132992|2013-07-01T22:50:51.000Z|Lls you're so childish|None|None|None|['en']|en|en|[38.7619111, -77.60871738]|Point|['[-77.621907, 38.731253]', '[-77.621907, 38.784919]', '[-77.534545, 38.784919]', '[-77.534545, 38.731253]']|Polygon|Linton Hall, VA|US|-28800|DM[V] ¯\_(ツ)_/¯|None|None|None|None|None|None|None|None|Kenzie|iKTAKenzie|513529537|38|407|280|0|38556|Tweet|None|None|None 39 | tag:search.twitter.com,2005:351835318728671232|2013-07-01T22:50:51.000Z|@SigaSOJA sim|None|None|None|['en']|en|und|[-23.33169536, -51.17693177]|Point|['[-51.324349, -23.853841]', '[-51.324349, -23.1722019]', '[-50.875167999999995, -23.1722019]', '[-50.875167999999995, -23.853841]']|Polygon|Londrina, Paraná|BR|-10800|Londrina|None|None|None|None|None|None|None|None|Queen G.|GabrielliMach|308761245|35|228|185|0|24571|Tweet|None|None|None 40 | tag:search.twitter.com,2005:351835318405705728|2013-07-01T22:50:51.000Z|El aburrimiento que llevo encima no es normal.|None|None|None|['es']|en|es|[39.9356653, -0.0956062]|Point|['[-0.1749828, 39.8916978]', '[-0.1749828, 39.9805156]', '[-0.060863, 39.9805156]', '[-0.060863, 39.8916978]']|Polygon|Villarreal, Castellón|ES|3600|Tomorrowland.|None|None|None|None|None|None|None|None|Juli.|JuliaLlorens|162779089|40|3285|2106|30|31002|Tweet|None|None|None 41 | tag:search.twitter.com,2005:351835318552494080|2013-07-01T22:50:51.000Z|love driving and listening to music #relax|None|None|None|['en']|en|en|[50.59005388, -2.471168]|Point|['[-2.504146, 50.512847]', '[-2.504146, 50.678695]', '[-2.405389, 50.678695]', '[-2.405389, 50.512847]']|Polygon|Weymouth and Portland, Dorset|GB|None|Weymouth|None|None|None|None|None|None|None|None|lkm__|laurieeemaay|269446336|35|115|228|0|8138|Tweet|None|None|None 42 | tag:search.twitter.com,2005:351835318829318144|2013-07-01T22:50:51.000Z|Pues esque mi hijo voto por AMLO. 
-Una madre, ratando de justificar el hecho de que su hijo sea un donadie, desempleado y drogadicto.|None|None|None|['es']|en|es|None|None|['[-100.913535, 20.3568691]', '[-100.913535, 20.6885141]', '[-100.630887, 20.6885141]', '[-100.630887, 20.3568691]']|Polygon|Celaya, Guanajuato|MX|-18000|Celaya|None|None|None|None|None|None|None|None|Ryan Gaynolds|15Labra|413664383|34|1121|1013|0|7590|Tweet|None|None|None 43 | tag:search.twitter.com,2005:351835318594453507|2013-07-01T22:50:51.000Z|ماتت كل الاكاذيب وبطلوها !! إلا 5 دقايق وأكون عندك تمشي علينا للحين ..   ههههههههههههههههههههههههههہَ هُ ˛☺™))|None|None|None|['ar']|en|ar|[26.28046, 50.18287]|Point|None|None|None|None|None|SA - Dhahran|None|None|None|None|None|None|None|None|Aj ♚#my birthday ↓22|optimistic014|1438925934|40|458|413|0|1061|Tweet|None|None|None 44 | tag:search.twitter.com,2005:351835318732857344|2013-07-01T22:50:51.000Z|@MaevaRmz Dakkor Moi jvai pas oublier j'espère ke toi aussi !! #Menace|None|None|None|['fr']|en|fr|[48.9572585, 2.2946213]|Point|['[2.2883004, 48.942443999999995]', '[2.2883004, 48.9662874]', '[2.3460398, 48.9662874]', '[2.3460398, 48.942443999999995]']|Polygon|Epinay-sur-Seine, Seine-Saint-Denis|FR|None|None|None|None|None|None|None|None|None|None|SMILEY Négroo|miniBengalo|1270875637|40|116|122|0|3174|Reply|http://twitter.com/MaevaRmz/statuses/351834911688241153|None|None 45 | tag:search.twitter.com,2005:351835319403950080|2013-07-01T22:50:51.000Z|Toquei nela sem querer ela gritou, quem você pensa que eu sou? Ela falou. Bem que que eu tentei acalmar mais não adiantou oh não não não não|None|None|None|['pt']|en|pt|None|None|['[-46.826038999999994, -24.008813999999997]', '[-46.826038999999994, -23.356792]', '[-46.365052, -23.356792]', '[-46.365052, -24.008813999999997]']|Polygon|São Paulo, São Paulo|BR|-10800|None|None|None|None|None|None|None|None|None|Doctor Muringa [ODS]|Alanpreto_|225935697|44|789|150|0|10134|Tweet|None|None|None 46 | tag:search.twitter.com,2005:351835319244562433|2013-07-01T22:50:51.000Z|@IvanaChacon uy falsa|None|None|None|['es']|None|es|[7.7771127, -72.2051679]|Point|None|None|None|None|None|None|None|None|None|None|None|None|None|None|Maria Antonieta|MariaCoiran|380560670|41|264|198|0|12569|Reply|http://twitter.com/IvanaChacon/statuses/351835156451041282|None|None 47 | tag:search.twitter.com,2005:351835318925795329|2013-07-01T22:50:51.000Z|inche dolor de estómago....|None|None|None|['es']|en|es|[19.01868, -98.20305]|Point|['[-98.289206, 18.83765]', '[-98.289206, 19.226809]', '[-98.01932699999999, 19.226809]', '[-98.01932699999999, 18.83765]']|Polygon|Puebla, Puebla|MX|-21600|Orizaba|None|None|None|None|None|None|None|None|C. Alam Solar|alamchli|199893733|33|247|256|0|5527|Tweet|None|None|None 48 | tag:search.twitter.com,2005:351835319450079233|2013-07-01T22:50:51.000Z|#np Diddy Dirty Money - Coming Home|None|None|None|['en']|en|en|[3.0880018, 101.54858202]|Point|['[101.4898224, 2.9745362]', '[101.4898224, 3.1279044]', '[101.6291504, 3.1279044]', '[101.6291504, 2.9745362]']|Polygon|Damansara, Petaling|MY|28800|Shah Alam, Selangor|None|None|None|None|None|None|None|None|Faiz Arshad|Mohdfaiz13|186427422|44|1573|572|3|9865|Tweet|None|None|None 49 | tag:search.twitter.com,2005:351835319584305153|2013-07-01T22:50:51.000Z|You opened my snapchat. 
I'm expecting one back.|None|None|None|['en']|en|en|[47.53635183, -122.61271693]|Point|['[-122.674427, 47.495882]', '[-122.674427, 47.553867]', '[-122.603032, 47.553867]', '[-122.603032, 47.495882]']|Polygon|Port Orchard, WA|US|-28800|None|None|None|None|None|None|None|None|None|✝ʟɑuʀєɴ тɑʏʟoʀ✝|Laytayjay|422214279|37|558|1463|1|823|Tweet|None|None|None 50 | tag:search.twitter.com,2005:351835319538167809|2013-07-01T22:50:51.000Z|Good Morning Worlddd|None|None|None|['en']|en|en|[1.4424622, 103.7756926]|Point|['[103.5363541, 1.3416253]', '[103.5363541, 1.673503]', '[104.0161667, 1.673503]', '[104.0161667, 1.3416253]']|Polygon|Johor Bahru, Johore|MY|-32400|None|None|None|None|None|None|None|None|None|Fandreyy♚|FandyClariBoe|404196083|45|325|292|0|11955|Tweet|None|None|None 51 | tag:search.twitter.com,2005:351835319794020353|2013-07-01T22:50:51.000Z|Omfg....BOMBBBBBBB|None|None|None|['en']|da|pt|[34.0017761, -118.084164]|Point|['[-118.123238, 33.950362]', '[-118.123238, 34.028847]', '[-118.040001, 34.028847]', '[-118.040001, 33.950362]']|Polygon|Pico Rivera, CA|US|-25200|None|None|None|None|None|None|None|None|None|taay bieber ~|taayboo_|559866671|34|274|229|0|30716|Tweet|None|None|None 52 | tag:search.twitter.com,2005:351835319722713089|2013-07-01T22:50:51.000Z|Alle vennan mine me god humor kommer overends me han pappa.|None|None|None|['en']|en|da|[70.9901604, 26.0325797]|Point|['[25.0407776, 70.6562712]', '[25.0407776, 71.1969438]', '[26.6914636, 71.1969438]', '[26.6914636, 70.6562712]']|Polygon|Nordkapp, Finnmark|NO|None|None|None|None|None|None|None|None|None|None|Lisa Pleym|lisa_pleym|1243693502|33|58|92|0|1117|Tweet|None|None|None 53 | tag:search.twitter.com,2005:351835319869505537|2013-07-01T22:50:51.000Z|Ah nah morge 1/6 school dan 3/5 werken en dan naar musical van @kevindebruijn1 :(|None|None|None|['nl']|en|nl|[52.2577317, 4.571016]|Point|['[4.5505577, 52.2144875]', '[4.5505577, 52.3863713]', '[4.8191001, 52.3863713]', '[4.8191001, 52.2144875]']|Polygon|Haarlemmermeer, Noord-Holland|NL|7200|None|None|None|None|None|None|None|None|None|tim de bruijn|timeey14|590431458|30|41|73|0|662|Tweet|None|None|None 54 | tag:search.twitter.com,2005:351835319794016257|2013-07-01T22:50:51.000Z|I'm at San Diego County Fair (Del Mar, CA) w/ 14 others [pic]: http://t.co/guOlPYw8wl|['http://test.gnip.com/mock']|['http://t.co/guOlPYw8wl']|['http://4sq.com/12AbCeZ']|['en']|en|en|[32.97336991, -117.2620368]|Point|['[-117.261680603027, 32.9728813171387]', '[-117.261680603027, 32.9728813171387]', '[-117.261680603027, 32.9728813171387]', '[-117.261680603027, 32.9728813171387]']|Polygon|san diego county fair, Del Mar|US|None|None|None|None|None|None|None|None|None|None|Dawn Muehl|evilchick77|38299534|33|215|433|13|2631|Tweet|None|None|None 55 | tag:search.twitter.com,2005:351835319957602305|2013-07-01T22:50:51.000Z|or is it just me|None|None|None|['en']|en|en|[42.18224077, -73.99886789]|Point|['[-79.76259, 40.477399]', '[-79.76259, 45.015865]', '[-71.777491, 45.015865]', '[-71.777491, 40.477399]']|Polygon|New York, US|US|-21600|new york|None|None|None|None|None|None|None|None|✨Maryana✨|maryanax13|891890964|29|150|157|0|6252|Tweet|None|None|None 56 | tag:search.twitter.com,2005:351835318275682304|2013-07-01T22:50:51.000Z|Olay varsa Atçe orda!1!!!!! 
@HatiHasanoglu|None|None|None|['tr']|tr|tr|[21.5800651, 39.1755465]|Point|None|None|None|None|10800|Arabistan/Cidde-Türkiye/Hatay|None|None|None|None|None|None|None|None|Nezihe Güneş (M.J)|Nezishe|636302362|44|392|152|0|14064|Reply|http://twitter.com/HatiHasanoglu/statuses/351834507160203264|None|None 57 | tag:search.twitter.com,2005:351835319945019393|2013-07-01T22:50:51.000Z|J'vais pas faire comme Yass un montage de dép|None|None|None|['fr']|fr|fr|[48.97273529, 2.27965733]|Point|['[2.2711327, 48.9584561]', '[2.2711327, 48.9775068]', '[2.298873, 48.9775068]', '[2.298873, 48.9584561]']|Polygon|Saint-Gratien, Val-d'Oise|FR|None|None|None|None|None|None|None|None|None|None|NeyNey. ✌|MarviiinC|1357848218|42|79|70|0|4488|Tweet|None|None|None 58 | tag:search.twitter.com,2005:351835318741254147|2013-07-01T22:50:51.000Z|Dirumah "@Genna_Nugroho @Iam_MrArrogant dimna ko ini?|None|None|None|['en']|id|id|None|None|['[117.042537, -7.499231]', '[117.042537, -1.5230100000000002]', '[135.8369, -1.5230100000000002]', '[135.8369, -7.499231]']|Polygon|South Sulawesi, Indonesia|ID|25200|Rajuni, Makassar, Indonesia|None|None|None|None|None|None|None|None|IDGHAM HKS_3112|Iam_MrArrogant|527060033|48|727|157|0|13176|Reply|http://twitter.com/Genna_Nugroho/statuses/351822267874938882|None|None 59 | tag:search.twitter.com,2005:351835319219400706|2013-07-01T22:50:51.000Z|#HolaJulio nada solo trae BUENAS fests... Chau|None|None|None|['es']|es|es|[-34.5474842, -58.7062953]|Point|None|None|None|None|-10800|None|None|None|None|None|None|None|None|None|Ťąsmąňią|LauuCacciato|182900398|36|76|140|0|723|Tweet|None|None|None 60 | tag:search.twitter.com,2005:351835319856939008|2013-07-01T22:50:51.000Z|Just so fucking bored of it|None|None|None|['en']|en|en|[52.28487032, -1.40530251]|Point|['[-1.962203999999999, 51.955399]', '[-1.962203999999999, 52.368094]', '[-1.231712999999999, 52.368094]', '[-1.231712999999999, 51.955399]']|Polygon|Stratford-on-Avon, Warwickshire|GB|3600|Long Itchington|None|None|None|None|None|None|None|None|Emily Crane|_emilycrane|127834421|44|648|601|0|18184|Tweet|None|None|None 61 | tag:search.twitter.com,2005:351835319970168833|2013-07-01T22:50:51.000Z|Dreams <3|None|None|None|['en']|en|en|[39.83853327, -84.88283731]|Point|['[-84.958257, 39.74859]', '[-84.958257, 39.882589]', '[-84.813804, 39.882589]', '[-84.813804, 39.74859]']|Polygon|Richmond, IN|US|-14400|Tampa|None|None|None|None|None|None|None|None|Jaylyn Plumley|JaylynPlumley|521322581|37|86|97|0|1270|Tweet|None|None|None 62 | tag:search.twitter.com,2005:351835318850289665|2013-07-01T22:50:51.000Z|Buena noche|None|None|None|['es']|it|es|[37.1793352, -5.7949027]|Point|['[-6.0497685, 36.9160919]', '[-6.0497685, 37.2529197]', '[-5.5893256000000004, 37.2529197]', '[-5.5893256000000004, 36.9160919]']|Polygon|Utrera, Sevilla|ES|None|None|None|None|None|None|None|None|None|None|manolo|malooje94|936658410|36|159|160|0|2350|Tweet|None|None|None 63 | tag:search.twitter.com,2005:351835320003727360|2013-07-01T22:50:51.000Z|@KayleeJohnston1 you can't avoid that?!?!|None|None|None|['en']|en|en|[39.98486816, -83.16312234]|Point|['[-83.183439, 39.808631]', '[-83.183439, 40.157317]', '[-82.771378, 40.157317]', '[-82.771378, 39.808631]']|Polygon|Columbus, OH|US|-21600|Hilliard, Ohio|None|None|None|None|None|None|None|None|Brady Simpson|BSimp614|396640875|43|402|196|0|5767|Reply|http://twitter.com/KayleeJohnston1/statuses/351835238609072128|None|None 64 | tag:search.twitter.com,2005:351835320121176064|2013-07-01T22:50:51.000Z|"@VLADDO: Al ganador del concurso aquel 
deberían darle el título de #ElGranColomVillano."|None|None|None|['es']|en|es|[9.3635352, -73.6014936]|Point|None|None|None|None|-18000|VALLEDUPAR, COLOMBIA...|None|None|None|None|None|None|None|None|Enrique...|kikekamacho|190517290|31|182|706|1|2417|Tweet|None|None|None 65 | tag:search.twitter.com,2005:351835319777247233|2013-07-01T22:50:51.000Z|朝コーヒーなう。 (@ ドトールコーヒーショップ 溝の口KSP店) [pic]: http://t.co/RYNvPIsZGk|['http://test.gnip.com/mock']|['http://t.co/RYNvPIsZGk']|['http://4sq.com/1cJD0J6']|['ja']|en|ja|[35.5948511, 139.62046981]|Point|['[139.586410833333, 35.5602822222222]', '[139.586410833333, 35.6185508333333]', '[139.643206111111, 35.6185508333333]', '[139.643206111111, 35.5602822222222]']|Polygon|川崎市高津区, 神奈川県|JP|32400|Kanagawa, JAPAN|None|None|None|None|None|None|None|None|Hitoshi Kuwano|jm1omh|81327848|28|152|335|3|2151|Tweet|None|None|None 66 | tag:search.twitter.com,2005:351835320226029568|2013-07-01T22:50:51.000Z|на аске осталась и в аск буке :)|None|None|None|['ru']|en|ru|None|None|['[47.7674, 56.588]', '[47.7674, 56.668600000000005]', '[47.999900000000004, 56.668600000000005]', '[47.999900000000004, 56.588]']|Polygon|Йошкар-Ола, Марий Эл республика|RU|14400|None|None|None|None|None|None|None|None|None|SokoVikova|NataliaBoston97|929621287|23|34|30|0|696|Tweet|None|None|None 67 | tag:search.twitter.com,2005:351835318351179776|2013-07-01T22:50:51.000Z|You & whoever you're coming with can catch it.|None|None|None|['en']|en|en|[28.43406419, -81.31527867]|Point|['[-81.50773, 28.347984]', '[-81.50773, 28.614251]', '[-81.229749, 28.614251]', '[-81.229749, 28.347984]']|Polygon|Orlando, FL|US|-14400|Behind You.|None|None|None|None|None|None|None|None|Beyond Life.|ChrisHoww|229101817|45|1140|943|3|105416|Tweet|None|None|None 68 | tag:search.twitter.com,2005:351835319659790337|2013-07-01T22:50:51.000Z|Knock knock... http://t.co/PIoJfShl75|['http://twitter.com/DomKent/status/351835319659790337/photo/1']|None|None|['en']|en|en|[51.63157951, -0.77995453]|Point|['[-0.950745, 51.544757]', '[-0.950745, 51.782450999999995]', '[-0.669482, 51.782450999999995]', '[-0.669482, 51.544757]']|Polygon|Wycombe, Buckinghamshire|GB|3600|High Wycombe|None|None|None|None|None|None|None|None|Dominic Kent|DomKent|50283380|40|207|165|0|3275|Tweet|None|None|None 69 | tag:search.twitter.com,2005:351835320284753920|2013-07-01T22:50:51.000Z|Encontrei o Edevan em Torres, aquele chato.. 
hahah|None|None|None|['pt']|pt|pt|None|None|['[-51.3061478, -30.2688069]', '[-51.3061478, -29.9306357]', '[-51.012471, -29.9306357]', '[-51.012471, -30.2688069]']|Polygon|Porto Alegre, Rio Grande do Sul|BR|-10800|Torres Rio Grande do Sul|None|None|None|None|None|None|None|None|Pâmmy Couto|PammyCouto|592933910|30|187|231|0|5251|Tweet|None|None|None 70 | tag:search.twitter.com,2005:351835320217636864|2013-07-01T22:50:51.000Z|@scb00by_doo I can't help it, oka x|None|None|None|['en']|en|en|[51.36794487, 0.59170763]|Point|['[0.397325999999999, 51.327898999999995]', '[0.397325999999999, 51.504019]', '[0.738956, 51.504019]', '[0.738956, 51.327898999999995]']|Polygon|Medway, Medway|GB|None|In wonderland|None|None|None|None|None|None|None|None|aliceinwonderland|WetAliceHeather|924224682|32|104|76|0|1068|Reply|http://twitter.com/scb00by_doo/statuses/351835227448020992|None|None 71 | tag:search.twitter.com,2005:351835320301527040|2013-07-01T22:50:51.000Z|can you be my Nightingale, sing to me, I know you're there.|None|None|None|['en']|en|en|[41.6650056, -71.5314796]|Point|['[-71.534426, 41.662266]', '[-71.534426, 41.731779]', '[-71.483858, 41.731779]', '[-71.483858, 41.662266]']|Polygon|West Warwick, RI|US|-18000|⚓West Warwick, Rhode Island⚓|None|None|None|None|None|None|None|None|∞ kelsie ∞|Fittaaa___|356528186|37|700|984|1|49190|Tweet|None|None|None 72 | tag:search.twitter.com,2005:351835320339283968|2013-07-01T22:50:51.000Z|Noooooooooooooooooooooooooooooooooooooooooooooooooooooooossa|None|None|None|['en']|en|es|None|None|['[-46.826038999999994, -24.008813999999997]', '[-46.826038999999994, -23.356792]', '[-46.365052, -23.356792]', '[-46.365052, -24.008813999999997]']|Polygon|São Paulo, São Paulo|BR|-10800|London - Mullingar|None|None|None|None|None|None|None|None|Pequena do Will :3|NathaliaCruzs|586933574|40|472|535|0|18460|Tweet|None|None|None 73 | tag:search.twitter.com,2005:351835320184086528|2013-07-01T22:50:51.000Z|@stevieness i've never heard it before! 
Got so scared!!|None|None|None|['en']|en|en|[51.12509537, -0.18076173]|Point|['[-0.255629, 51.084806]', '[-0.255629, 51.167404999999995]', '[-0.132974999999999, 51.167404999999995]', '[-0.132974999999999, 51.084806]']|Polygon|Crawley, West Sussex|GB|0|England (West Sussex)|None|None|None|None|None|None|None|None|Nicolle Hope|NicolleHope|24449877|40|950|937|35|32327|Reply|http://twitter.com/stevieness/statuses/351835147873685506|None|None 74 | tag:search.twitter.com,2005:351835320418967552|2013-07-01T22:50:51.000Z|A nigga can love you from the bottom of his heart and still have room at the top for another bitch|None|None|None|['en']|en|en|[33.86342611, -88.7186663]|Point|['[-91.65500899999999, 30.146096]', '[-91.65500899999999, 34.996099]', '[-88.097888, 34.996099]', '[-88.097888, 30.146096]']|Polygon|Mississippi, US|US|-18000|None|None|None|None|None|None|None|None|None|iStandd_ALONE !|Like_ahDiamondd|538683059|35|826|826|1|16771|Tweet|None|None|None 75 | tag:search.twitter.com,2005:351835320368635905|2013-07-01T22:50:51.000Z|Wakwak"@wigunasaid: Elu lagi -_- "@alfiftrn: No!"@wigunasaid: Welcome july be nice ok (:"""|None|None|None|['id']|en|id|[-6.9432218, 107.5827196]|Point|['[107.577083, -6.965574999999999]', '[107.577083, -6.916034]', '[107.60021, -6.916034]', '[107.60021, -6.965574999999999]']|Polygon|Babakan Ciparay, Kota Bandung|ID|25200|di hati kenji|None|None|None|None|None|None|None|None|MANGGA ANTOSAN WEH.|alfiftrn|732586412|49|159|145|0|8925|Tweet|None|None|None 76 | tag:search.twitter.com,2005:351835319366197248|2013-07-01T22:50:51.000Z|@IkmalAhmadIA k tq|None|None|None|['en']|None|und|[2.51849, 101.81334]|Point|['[101.7050095, 2.391072]', '[101.7050095, 2.7224898]', '[102.0324478, 2.7224898]', '[102.0324478, 2.391072]']|Polygon|Port Dickson, Negri Sembilan|MY|None|Always in Ryan's heart ♥|None|None|None|None|None|None|None|None|ayien'Lee|ain_salim92|1478753790|35|43|96|0|987|Reply|http://twitter.com/IkmalAhmadIA/statuses/351774712143163392|None|None 77 | tag:search.twitter.com,2005:351835319848538113|2013-07-01T22:50:51.000Z|@viniciuaraujo97 kkkkkk ou amanha c sai q hrs da aula?|None|None|None|['en']|pt|pt|[-19.94496505, -43.94804174]|Point|['[-44.062788999999995, -20.059815999999998]', '[-44.062788999999995, -19.777568]', '[-43.856856, -19.777568]', '[-43.856856, -20.059815999999998]']|Polygon|Belo Horizonte, Minas Gerais|BR|-14400|Panem|None|None|None|None|None|None|None|None|melo|Juliamfr|290796386|37|759|322|0|30559|Reply|http://twitter.com/viniciuaraujo97/statuses/351835033054613506|None|None 78 | tag:search.twitter.com,2005:351835319672381440|2013-07-01T22:50:51.000Z|Not looking forward to going back to Montrose tomorrow, even if it is only one night-.-|None|None|None|['en']|en|en|[56.85457195, -2.5758810199999997]|Point|['[-3.801637, 56.747133]', '[-3.801637, 57.705541999999994]', '[-1.752896, 57.705541999999994]', '[-1.752896, 56.747133]']|Polygon|Aberdeenshire, Aberdeenshire|GB|3600|scotlanddd|None|None|None|None|None|None|None|None|✨ alana richardson✨|un1c0rnh03|588648129|38|189|209|1|2244|Tweet|None|None|None 79 | tag:search.twitter.com,2005:351835320284749824|2013-07-01T22:50:51.000Z|"I am Lieutenant Phison. 
And I apologize for my haste, but I have to get some information and comm it to the bridge #befo|None|None|None|['en']|en|en|[38.376, -69.039]|Point|None|None|None|None|None|None|None|None|None|None|None|None|None|None|Dacal Mccomas|dacal_dcal|1561615404|None|0|2|0|4|Tweet|None|None|None 80 | tag:search.twitter.com,2005:351835319919857665|2013-07-01T22:50:51.000Z|oh my lawd it is fucking hot outside http://t.co/4hvXX9zc38|['http://twitter.com/frdrkt/status/351835319919857665/photo/1']|None|None|['en']|en|en|[49.26367739, -123.18586009]|Point|['[-123.73837499999999, 49.001920999999996]', '[-123.73837499999999, 49.574551]', '[-122.406655, 49.574551]', '[-122.406655, 49.001920999999996]']|Polygon|Greater Vancouver, British Columbia|CA|-18000|None|None|None|None|None|None|None|None|None|fred|frdrkt|136941660|24|80|126|1|1791|Tweet|None|None|None 81 | tag:search.twitter.com,2005:351835320590934016|2013-07-01T22:50:51.000Z|Faut jmette mn reveil, demain je jeûne.|None|None|None|['fr']|fr|fr|[49.231941, 2.1302169]|Point|['[2.0872422999999998, 49.2117603]', '[2.0872422999999998, 49.2796766]', '[2.1720206, 49.2796766]', '[2.1720206, 49.2117603]']|Polygon|Méru, Oise|FR|7200|Paris|None|None|None|None|None|None|None|None|Anonymous|DiogouGbn|199013969|43|305|241|1|12230|Tweet|None|None|None 82 | tag:search.twitter.com,2005:351835320435748864|2013-07-01T22:50:51.000Z|@LaritaGuardiola @IcaFerrer @10Klaus @Olguita_29 no no, perdona? Me quereis a mi y mucho :') jajaja|None|None|None|['es']|es|es|[38.61692362, -0.12441094]|Point|['[-0.1732318, 38.566939999999995]', '[-0.1732318, 38.6296775]', '[-0.078323999999999, 38.6296775]', '[-0.078323999999999, 38.566939999999995]']|Polygon|La Nucia, Alicante|ES|7200|None|None|None|None|None|None|None|None|None|Sonríe, enamoras^^|nereaaegea|489518771|42|283|216|0|4414|Reply|http://twitter.com/LaritaGuardiola/statuses/351834810060255232|None|None 83 | tag:search.twitter.com,2005:351835320607719424|2013-07-01T22:50:51.000Z|Am i texting anybody? Nope. Would anyone like to text me? Probably not. On that note; it's nap time for me.|None|None|None|['en']|en|en|[29.8251009, -96.4450126]|Point|['[-106.645646, 25.837163999999998]', '[-106.645646, 36.500704]', '[-93.508039, 36.500704]', '[-93.508039, 25.837163999999998]']|Polygon|Texas, US|US|None|don't worry about it|None|None|None|None|None|None|None|None|Daija Lorenz|DaijaLorenz|370483722|40|1034|391|0|47863|Tweet|None|None|None 84 | tag:search.twitter.com,2005:351835320788062208|2013-07-01T22:50:51.000Z|"Oh, and they need me." He let out a snort of contempt. "The pact works both ways. Tebut was careless. 
It's not an exerc|None|None|None|['en']|en|en|[60.919, -6.605]|Point|['[-9.0831329, 57.9562068]', '[-9.0831329, 80.8245621]', '[33.6320899, 80.8245621]', '[33.6320899, 57.9562068]']|Polygon|Norway|NO|None|None|None|None|None|None|None|None|None|None|Llacuna Bartoli|BartoliLlauna|1561647116|None|0|1|0|3|Tweet|None|None|None 85 | tag:search.twitter.com,2005:351835320737730561|2013-07-01T22:50:51.000Z|cite nomes gata..|None|None|None|['pt']|it|lv|None|None|['[-48.285982, -16.0524045]', '[-48.285982, -15.500191999999998]', '[-47.307263999999996, -15.500191999999998]', '[-47.307263999999996, -16.0524045]']|Polygon|Distrito Federal, Brasil|BR|-10800|Gamaika|None|None|None|None|None|None|None|None|♣|GABRIEL34L3ST|323440706|41|297|237|0|5221|Tweet|None|None|None 86 | tag:search.twitter.com,2005:351835320695787522|2013-07-01T22:50:51.000Z|Y 6 camisetas G-Star|None|None|None|['es']|es|en|[41.408397, 2.1856093]|Point|['[2.0524766, 41.3199988]', '[2.0524766, 41.4682658]', '[2.2261223, 41.4682658]', '[2.2261223, 41.3199988]']|Polygon|Barcelona, Barcelona|ES|None|Barcelona, Camp Nou|None|None|None|None|None|None|None|None|Antimadridista|CarlesPuig7|237867934|27|45|219|0|896|Tweet|None|None|None 87 | tag:search.twitter.com,2005:351835320385413124|2013-07-01T22:50:51.000Z|Shouuuu I Need To Slide To Dray & Ash House|None|None|None|['en']|en|en|[29.64622295, -82.27052519]|Point|['[-87.634896, 24.396307999999998]', '[-87.634896, 31.000968]', '[-79.974306, 31.000968]', '[-79.974306, 24.396307999999998]']|Polygon|Florida, US|US|-18000|352 →→→ 863|None|None|None|None|None|None|None|None|PutInWurkkkk|RichHomie_Shouu|297041807|42|1031|718|1|50632|Tweet|None|None|None 88 | tag:search.twitter.com,2005:351835320125366272|2013-07-01T22:50:51.000Z|hbd @AL_kadier @ADEkadier :D better in every way yaaaa. keep rawks ;D|None|None|None|['en']|en|en|[1.5338994000000001, 124.9212371]|Point|['[124.8446177, 1.4724483]', '[124.8446177, 1.576611]', '[124.9334809, 1.576611]', '[124.9334809, 1.4724483]']|Polygon|Mapanget, Kota Manado|ID|28800|AllahSWT's|None|None|None|None|None|None|None|None|LDSari•|diajengsarii|888154496|50|149|200|1|4377|Reply|http://twitter.com/AL_kadier/statuses/351824209544097792|None|None 89 | tag:search.twitter.com,2005:351835320792252417|2013-07-01T22:50:51.000Z|sometimes i wonder if i will ever be smart #keepdreaming|None|None|None|['en']|en|en|[28.87395867, -13.82116208]|Point|['[-13.8821654, 28.8383932]', '[-13.8821654, 29.0366774]', '[-13.6940322, 29.0366774]', '[-13.6940322, 28.8383932]']|Polygon|Yaiza, Palmas|ES|None|None|None|None|None|None|None|None|None|None|iamabitch☯|poppylewisxo|1030915357|37|275|510|0|799|Tweet|None|None|None 90 | tag:search.twitter.com,2005:351835320645459968|2013-07-01T22:50:51.000Z|@JADEMEIMOUN1 mdrrrr ohh nan sa serai dommageee|None|None|None|['fr']|en|fr|[48.88389837, 2.38612415]|Point|['[2.2241006, 48.8155414]', '[2.2241006, 48.9021461]', '[2.4699099, 48.9021461]', '[2.4699099, 48.8155414]']|Polygon|Paris, Paris|FR|None|None|None|None|None|None|None|None|None|None|BIRTHHHHDAYYYYYYY|DvorahMergui|919844498|44|94|66|1|5267|Reply|http://twitter.com/JADEMEIMOUN1/statuses/351835011810463744|None|None 91 | tag:search.twitter.com,2005:351835320817426433|2013-07-01T22:50:51.000Z|Como cuando grito ¡¡*José*!! 
y la mayoria de mis primos voltea ...|None|None|None|['es']|en|es|[14.9673297, -91.7889038]|Point|['[-94.230569, 14.5319181]', '[-94.230569, 17.985291]', '[-90.37140699999999, 17.985291]', '[-90.37140699999999, 14.5319181]']|Polygon|Chiapas, México|MX|-10800|Guatemala|None|None|None|None|None|None|None|None|Pilar Ochoa|PiLigrOsa|107307381|39|280|258|0|2632|Tweet|None|None|None 92 | tag:search.twitter.com,2005:351835321006170112|2013-07-01T22:50:51.000Z|Tweeting for no reason|None|None|None|['en']|en|en|[47.0353488, -122.7995873]|Point|['[-122.839691, 46.980616]', '[-122.839691, 47.120329]', '[-122.738243, 47.120329]', '[-122.738243, 46.980616]']|Polygon|Lacey, WA|US|None|None|None|None|None|None|None|None|None|None|aG€Nt ¥£ ¤w|godsgentlegiant|1518643968|41|71|67|0|1474|Tweet|None|None|None 93 | tag:search.twitter.com,2005:351835321056509952|2013-07-01T22:50:51.000Z|And I'm the one that always gets hit on. OKAAAAY.|None|None|None|['en']|en|en|[34.09956425, -117.40411515]|Point|['[-117.436711, 34.023746]', '[-117.436711, 34.184163]', '[-117.341401, 34.184163]', '[-117.341401, 34.023746]']|Polygon|Rialto, CA|US|-28800|Arizona-Cali|None|None|None|None|None|None|None|None|Cilli♡|Cilli_021112|382278854|30|557|440|1|18434|Tweet|None|None|None 94 | tag:search.twitter.com,2005:351835320297328640|2013-07-01T22:50:51.000Z|@CheniseFowlisX yeah 😂|None|None|None|['en']|en|vi|[53.28722009, -3.21569687]|Point|['[-3.400604, 53.072143999999994]', '[-3.400604, 53.361909]', '[-2.920279, 53.361909]', '[-2.920279, 53.072143999999994]']|Polygon|Flintshire, Flintshire|GB|0|Leeds- UK / Greenfield- Wales|None|None|None|None|None|None|None|None|marshaa|marshaleighm|269323425|44|767|486|0|17549|Reply|http://twitter.com/CheniseFowlisX/statuses/351834114531409920|None|None 95 | tag:search.twitter.com,2005:351835321220075524|2013-07-01T22:50:52.000Z|@justinbagdr no, non mi far star meglio.|None|None|None|['it']|it|it|None|None|['[14.1332005, 40.7920697]', '[14.1332005, 40.9159313]', '[14.3537026, 40.9159313]', '[14.3537026, 40.7920697]']|Polygon|Napoli, Napoli|IT|None|#bagdr @limitdemibagdr|None|None|None|None|None|None|None|None|demi.|demibagdr|1542203100|44|64|58|0|1872|Reply|http://twitter.com/justinbagdr/statuses/351833684762038273|None|None 96 | tag:search.twitter.com,2005:351835321081659392|2013-07-01T22:50:51.000Z|@xhazzasdimples Probabile AHAHHAHAHAHAHAHAHAHAH ODDIO MARIA AHAHAHHAHAHAHA QUELLA SERA QUANTA CE NE SIAMO FUMATA AHAHAHHAHAHAHAHAHA|None|None|None|['it']|it|it|[41.73604, 12.2890263]|Point|['[12.2344266, 41.6558738]', '[12.2344266, 42.140958999999995]', '[12.8558641, 42.140958999999995]', '[12.8558641, 41.6558738]']|Polygon|Roma, Roma|IT|7200|Big Ben's tip|None|None|None|None|None|None|None|None|She's my Louis ❤|xliamsdreams|562092674|53|5084|3802|11|30163|Reply|http://twitter.com/xhazzasdimples/statuses/351835128290480129|None|None 97 | tag:search.twitter.com,2005:351835321442385921|2013-07-01T22:50:52.000Z|Mandei a foto do Piqué e o fc n respondeu até agr , ela infarto na frente do pc kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk|None|None|None|['pt']|pt|pt|None|None|['[-50.899727999999996, -21.454719]', '[-50.899727999999996, -20.830645999999998]', '[-50.342965, -20.830645999999998]', '[-50.342965, -21.454719]']|Polygon|Araçatuba, São Paulo|BR|-10800|na cama do mari :p|None|None|None|None|None|None|None|None|lorena uai ∞|Fc_GritaAmor|397609113|44|1248|1962|0|11161|Tweet|None|None|None 98 | tag:search.twitter.com,2005:351835321425608704|2013-07-01T22:50:52.000Z|O matheus André ta me falando aqui , tem quase 
1ano q não vejo meu amigo , q saudade dele|None|None|None|['pt']|pt|pt|[-22.72046574, -43.57919689]|Point|['[-43.645123999999996, -22.783967999999998]', '[-43.645123999999996, -22.678859]', '[-43.512217, -22.678859]', '[-43.512217, -22.783967999999998]']|Polygon|Queimados, Rio de Janeiro|BR|-10800|Rio de Janeiro|None|None|None|None|None|None|None|None|Shelyda|ShelydaDays|70959019|40|128|76|0|1419|Tweet|None|None|None 99 | tag:search.twitter.com,2005:351835320859369475|2013-07-01T22:50:51.000Z|@Mulayhim hatha bs one exam. Other exams y6l3oon roo7na other than the project also -.-|None|None|None|['en']|en|en|[52.21599153, 6.88528201]|Point|['[6.7559955, 52.1611799]', '[6.7559955, 52.2855112]', '[6.981173999999999, 52.2855112]', '[6.981173999999999, 52.1611799]']|Polygon|Enschede|NL|3600|Netherlands\Enschede|None|None|None|None|None|None|None|None|L|Karkooba|1252736376|30|48|88|0|957|Reply|http://twitter.com/Mulayhim/statuses/351817617452965889|None|None 100 | tag:search.twitter.com,2005:351835321471746048|2013-07-01T22:50:52.000Z|😕 Hmm...|None|None|None|['en']|None|en|[42.21746118, -78.03367083]|Point|['[-78.045887, 42.212985]', '[-78.045887, 42.232082]', '[-78.019534, 42.232082]', '[-78.019534, 42.212985]']|Polygon|Belmont, NY|US|None|None|None|None|None|None|None|None|None|None|Lindsey Chamberlain|Linnyy_Kayy|466042520|41|198|188|0|1897|Tweet|None|None|None 101 | --------------------------------------------------------------------------------