3 | RUN pip install scikit-learn
4 | ADD *.ipynb /notebooks/
5 | WORKDIR /notebooks
6 | CMD ["/run_jupyter.sh"]
7 |
--------------------------------------------------------------------------------
/deep-learning/tensor-flow-exercises/README.md:
--------------------------------------------------------------------------------
1 | Exercises
2 | ===========================================================
3 |
4 | Building the Docker container
5 | -----------------------------
6 |
7 | docker build -t $USER/exercises .
8 |
9 | Running the container
10 | ---------------------
11 |
12 | docker run -p 8888:8888 -it --rm $USER/exercises
13 |
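Once the container is running, the notebooks should be reachable at http://localhost:8888 in a browser (assuming Docker publishes the mapped port on localhost).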
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/Makefile:
--------------------------------------------------------------------------------
1 | intro_theano.pdf: slides_source/intro_theano.tex
2 | cd slides_source; pdflatex --shell-escape intro_theano.tex
3 | mv slides_source/intro_theano.pdf .
4 |
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/intro_theano/intro_theano.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/intro_theano/utils.py:
--------------------------------------------------------------------------------
1 | """ This file contains different utility functions that are not connected
2 | in anyway to the networks presented in the tutorials, but rather help in
3 | processing the outputs into a more understandable way.
4 |
5 | For example ``tile_raster_images`` helps in generating a easy to grasp
6 | image from a set of samples or weights.
7 | """
8 |
9 |
10 | import numpy
11 | from six.moves import xrange
12 |
13 |
14 | def scale_to_unit_interval(ndar, eps=1e-8):
15 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
16 | ndar = ndar.copy()
17 | ndar -= ndar.min()
18 | ndar *= 1.0 / (ndar.max() + eps)
19 | return ndar
20 |
21 |
22 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
23 | scale_rows_to_unit_interval=True,
24 | output_pixel_vals=True):
25 | """
26 | Transform an array with one flattened image per row into an array in
27 | which images are reshaped and laid out like tiles on a floor.
28 |
29 | This function is useful for visualizing datasets whose rows are images,
30 | and also columns of matrices for transforming those rows
31 | (such as the first layer of a neural net).
32 |
33 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
34 | be 2-D ndarrays or None;
35 | :param X: a 2-D array in which every row is a flattened image.
36 |
37 | :type img_shape: tuple; (height, width)
38 | :param img_shape: the original shape of each image
39 |
40 | :type tile_shape: tuple; (rows, cols)
41 | :param tile_shape: the number of images to tile (rows, cols)
42 |
43 | :param output_pixel_vals: if output should be pixel values (i.e. uint8
44 | values) or floats
45 |
46 | :param scale_rows_to_unit_interval: whether the values should be scaled
47 | to [0, 1] before being plotted
48 |
49 |
50 | :returns: array suitable for viewing as an image.
51 | (See: ``Image.fromarray``.)
52 | :rtype: a 2-D array with the same dtype as X.
53 |
54 | """
55 |
56 | assert len(img_shape) == 2
57 | assert len(tile_shape) == 2
58 | assert len(tile_spacing) == 2
59 |
60 | # The expression below can be re-written in a more C-like style
61 | # as follows:
62 | #
63 | # out_shape = [0,0]
64 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
65 | # tile_spacing[0]
66 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
67 | # tile_spacing[1]
68 | out_shape = [
69 | (ishp + tsp) * tshp - tsp
70 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
71 | ]
72 |
73 | if isinstance(X, tuple):
74 | assert len(X) == 4
75 | # Create an output numpy ndarray to store the image
76 | if output_pixel_vals:
77 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
78 | dtype='uint8')
79 | else:
80 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
81 | dtype=X.dtype)
82 |
83 | # colors default to 0, alpha defaults to 1 (opaque)
84 | if output_pixel_vals:
85 | channel_defaults = [0, 0, 0, 255]
86 | else:
87 | channel_defaults = [0., 0., 0., 1.]
88 |
89 | for i in xrange(4):
90 | if X[i] is None:
91 | # if channel is None, fill it with zeros of the correct
92 | # dtype
93 | dt = out_array.dtype
94 | if output_pixel_vals:
95 | dt = 'uint8'
96 | out_array[:, :, i] = numpy.zeros(
97 | out_shape,
98 | dtype=dt
99 | ) + channel_defaults[i]
100 | else:
101 | # use a recursive call to compute the channel and store it
102 | # in the output
103 | out_array[:, :, i] = tile_raster_images(
104 | X[i], img_shape, tile_shape, tile_spacing,
105 | scale_rows_to_unit_interval, output_pixel_vals)
106 | return out_array
107 |
108 | else:
109 | # if we are dealing with only one channel
110 | H, W = img_shape
111 | Hs, Ws = tile_spacing
112 |
113 | # generate a matrix to store the output
114 | dt = X.dtype
115 | if output_pixel_vals:
116 | dt = 'uint8'
117 | out_array = numpy.zeros(out_shape, dtype=dt)
118 |
119 | for tile_row in xrange(tile_shape[0]):
120 | for tile_col in xrange(tile_shape[1]):
121 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
122 | this_x = X[tile_row * tile_shape[1] + tile_col]
123 | if scale_rows_to_unit_interval:
124 | # if we should scale values to be between 0 and 1
125 | # do this by calling the `scale_to_unit_interval`
126 | # function
127 | this_img = scale_to_unit_interval(
128 | this_x.reshape(img_shape))
129 | else:
130 | this_img = this_x.reshape(img_shape)
131 | # add the slice to the corresponding position in the
132 | # output array
133 | c = 1
134 | if output_pixel_vals:
135 | c = 255
136 | out_array[
137 | tile_row * (H + Hs): tile_row * (H + Hs) + H,
138 | tile_col * (W + Ws): tile_col * (W + Ws) + W
139 | ] = this_img * c
140 | return out_array
141 |
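A minimal usage sketch for ``tile_raster_images`` (assuming Pillow is installed, and substituting random noise for real samples or weights):

    import numpy
    from PIL import Image
    from utils import tile_raster_images

    # 100 flattened 28x28 "images" of random noise, one per row
    samples = numpy.random.rand(100, 28 * 28)
    tiled = tile_raster_images(samples, img_shape=(28, 28),
                               tile_shape=(10, 10), tile_spacing=(1, 1))
    Image.fromarray(tiled).save('tiled_samples.png')

With the default output_pixel_vals=True, the returned array is uint8 and can be viewed or saved directly via ``Image.fromarray``.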
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/Makefile:
--------------------------------------------------------------------------------
1 | all: instruction.pdf rnn_lstm.pdf
2 |
3 | instruction.pdf: slides_source/instruction.tex
4 | cd slides_source; pdflatex --shell-escape instruction.tex
5 | cd slides_source; pdflatex --shell-escape instruction.tex
6 | cd slides_source; pdflatex --shell-escape instruction.tex
7 | mv slides_source/instruction.pdf .
8 |
9 | rnn_lstm.pdf: slides_source/rnn_lstm.tex
10 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
11 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
12 | cd slides_source; pdflatex --shell-escape rnn_lstm.tex
13 | mv slides_source/rnn_lstm.pdf .
14 |
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/instruction.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/deep-learning/theano-tutorial/rnn_tutorial/rnn_lstm.pdf
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/rnn_tutorial/synthetic.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import numpy as np
3 |
4 |
5 | def mackey_glass(sample_len=1000, tau=17, seed=None, n_samples=1):
6 | '''
7 | mackey_glass(sample_len=1000, tau=17, seed=None, n_samples=1) -> input
8 | Generate the Mackey-Glass time-series. Parameters are:
9 | - sample_len: length of the time-series in timesteps. Default is 1000.
10 | - tau: delay of the MG system. Commonly used values are tau=17 (mild
11 | chaos) and tau=30 (moderate chaos). Default is 17.
12 | - seed: seeds the random generator, allowing the same time-series to be
13 | generated at each invocation.
14 | - n_samples: number of samples to generate
15 | '''
16 | delta_t = 10
17 | history_len = tau * delta_t
18 | # Initial conditions for the history of the system
19 | timeseries = 1.2
20 |
21 | if seed is not None:
22 | np.random.seed(seed)
23 |
24 | samples = []
25 |
26 | for _ in range(n_samples):
27 | history = collections.deque(1.2 * np.ones(history_len) + 0.2 * \
28 | (np.random.rand(history_len) - 0.5))
29 | # Preallocate the array for the time-series
30 | inp = np.zeros((sample_len,1))
31 |
32 | for timestep in range(sample_len):
33 | for _ in range(delta_t):
34 | xtau = history.popleft()
35 | history.append(timeseries)
36 | timeseries = history[-1] + (0.2 * xtau / (1.0 + xtau ** 10) - \
37 | 0.1 * history[-1]) / delta_t
38 | inp[timestep] = timeseries
39 |
40 | # Squash timeseries through tanh
41 | inp = np.tanh(inp - 1)
42 | samples.append(inp)
43 | return samples
44 |
45 |
46 | def mso(sample_len=1000, n_samples=1):
47 | '''
48 | mso(sample_len=1000, n_samples=1) -> input
49 | Generate the Multiple Sinewave Oscillator time-series, a sum of two sines
50 | with incommensurable periods. Parameters are:
51 | - sample_len: length of the time-series in timesteps
52 | - n_samples: number of samples to generate
53 | '''
54 | signals = []
55 | for _ in range(n_samples):
56 | phase = np.random.rand()
57 | x = np.atleast_2d(np.arange(sample_len)).T
58 | signals.append(np.sin(0.2 * x + phase) + np.sin(0.311 * x + phase))
59 | return signals
60 |
61 |
62 | def lorentz(sample_len=1000, sigma=10, rho=28, beta=8 / 3, step=0.01):
63 | """This function generates a Lorenz time series of length sample_len,
64 | with standard parameters sigma, rho and beta.
65 | """
66 |
67 | x = np.zeros([sample_len])
68 | y = np.zeros([sample_len])
69 | z = np.zeros([sample_len])
70 |
71 | # Initial conditions taken from 'Chaos and Time Series Analysis', J. Sprott
72 | x[0] = 0
73 | y[0] = -0.01
74 | z[0] = 9
75 |
76 | for t in range(sample_len - 1):
77 | x[t + 1] = x[t] + sigma * (y[t] - x[t]) * step
78 | y[t + 1] = y[t] + (x[t] * (rho - z[t]) - y[t]) * step
79 | z[t + 1] = z[t] + (x[t] * y[t] - beta * z[t]) * step
80 |
81 | x.shape += (1,)
82 | y.shape += (1,)
83 | z.shape += (1,)
84 |
85 | return np.concatenate((x, y, z), axis=1)
86 |
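A minimal usage sketch for these generators (assuming this module is importable as ``synthetic`` and matplotlib is installed):

    import matplotlib.pyplot as plt
    from synthetic import mackey_glass, lorentz

    # One reproducible Mackey-Glass sample of 1000 timesteps
    sample = mackey_glass(sample_len=1000, tau=17, seed=42, n_samples=1)[0]
    plt.plot(sample)
    plt.title('Mackey-Glass time-series (tau=17)')
    plt.show()

    # The Lorenz-system generator returns an (N, 3) array of (x, y, z)
    xyz = lorentz(sample_len=5000)
    print(xyz.shape)  # (5000, 3)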
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/scan_tutorial/scan_ex1_solution.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 |
5 | coefficients = T.vector("coefficients")
6 | x = T.scalar("x")
7 | max_coefficients_supported = 10000
8 |
9 |
10 | def step(coeff, power, prior_value, free_var):
11 | return prior_value + (coeff * (free_var ** power))
12 |
13 | # Generate the components of the polynomial
14 | full_range = T.arange(max_coefficients_supported)
15 | outputs_info = np.zeros((), dtype=theano.config.floatX)
16 |
17 | components, updates = theano.scan(fn=step,
18 | sequences=[coefficients, full_range],
19 | outputs_info=outputs_info,
20 | non_sequences=x)
21 |
22 | polynomial = components[-1]
23 | calculate_polynomial = theano.function(inputs=[coefficients, x],
24 | outputs=polynomial,
25 | updates=updates)
26 |
27 | test_coeff = np.asarray([1, 0, 2], dtype=theano.config.floatX)
28 | print(calculate_polynomial(test_coeff, 3))
29 |
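Because scan returns the full sequence of step outputs, ``components`` holds the running partial sums of the polynomial, and ``components[-1]`` is the completed value. With the test coefficients [1, 0, 2] evaluated at x = 3, the partial sums are 1, 1, 19, so the script prints 19.0 (= 1*3**0 + 0*3**1 + 2*3**2).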
--------------------------------------------------------------------------------
/deep-learning/theano-tutorial/scan_tutorial/scan_ex2_solution.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 |
5 | probabilities = T.vector()
6 | nb_samples = T.iscalar()
7 |
8 | rng = T.shared_randomstreams.RandomStreams(1234)
9 |
10 |
11 | def sample_from_pvect(pvect):
12 | """ Provided utility function: given a symbolic vector of
13 | probabilities (which MUST sum to 1), sample one element
14 | and return its index.
15 | """
16 | onehot_sample = rng.multinomial(n=1, pvals=pvect)
17 | sample = onehot_sample.argmax()
18 | return sample
19 |
20 |
21 | def set_p_to_zero(pvect, i):
22 | """ Provided utility function: given a symbolic vector of
23 | probabilities and an index 'i', set the probability of the
24 | i-th element to 0 and renormalize the probabilities so they
25 | sum to 1.
26 | """
27 | new_pvect = T.set_subtensor(pvect[i], 0.)
28 | new_pvect = new_pvect / new_pvect.sum()
29 | return new_pvect
30 |
31 |
32 | def step(p):
33 | sample = sample_from_pvect(p)
34 | new_p = set_p_to_zero(p, sample)
35 | return new_p, sample
36 |
37 | output, updates = theano.scan(fn=step,
38 | outputs_info=[probabilities, None],
39 | n_steps=nb_samples)
40 |
41 | modified_probabilities, samples = output
42 |
43 | f = theano.function(inputs=[probabilities, nb_samples],
44 | outputs=[samples],
45 | updates=updates)
46 |
47 | # Testing the function
48 | test_probs = np.asarray([0.6, 0.3, 0.1], dtype=theano.config.floatX)
49 | for i in range(10):
50 | print(f(test_probs, 2))
51 |
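Each call to ``f`` draws ``nb_samples`` indices without replacement: ``step`` zeroes out the probability of the index it just sampled and renormalizes before the next draw, so ``f(test_probs, 2)`` prints two distinct indices from {0, 1, 2} (e.g. [array([0, 1])]). Because the RandomStreams updates are passed to ``theano.function``, the random state advances across calls and successive pairs can differ.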
--------------------------------------------------------------------------------
/images/README.sketch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README.sketch
--------------------------------------------------------------------------------
/images/README_1200x800.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/README_1200x800.gif
--------------------------------------------------------------------------------
/images/aws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/aws.png
--------------------------------------------------------------------------------
/images/commands.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/commands.png
--------------------------------------------------------------------------------
/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/cover.png
--------------------------------------------------------------------------------
/images/coversmall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall.png
--------------------------------------------------------------------------------
/images/coversmall_alt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/coversmall_alt.png
--------------------------------------------------------------------------------
/images/deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/deep.png
--------------------------------------------------------------------------------
/images/k-means.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/k-means.gif
--------------------------------------------------------------------------------
/images/kaggle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/kaggle.png
--------------------------------------------------------------------------------
/images/keras.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/keras.jpg
--------------------------------------------------------------------------------
/images/matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/matplotlib.png
--------------------------------------------------------------------------------
/images/mrjob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/mrjob.png
--------------------------------------------------------------------------------
/images/numpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/numpy.png
--------------------------------------------------------------------------------
/images/pandas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/pandas.png
--------------------------------------------------------------------------------
/images/python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/python.png
--------------------------------------------------------------------------------
/images/regex-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-1.png
--------------------------------------------------------------------------------
/images/regex-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/regex-2.png
--------------------------------------------------------------------------------
/images/scikitlearn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scikitlearn.png
--------------------------------------------------------------------------------
/images/scipy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/scipy.png
--------------------------------------------------------------------------------
/images/spark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/spark.png
--------------------------------------------------------------------------------
/images/svm.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/svm.gif
--------------------------------------------------------------------------------
/images/tensorflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/tensorflow.png
--------------------------------------------------------------------------------
/images/theano.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/images/theano.png
--------------------------------------------------------------------------------
/kaggle/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/kaggle/__init__.py
--------------------------------------------------------------------------------
/mapreduce/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/mapreduce/__init__.py
--------------------------------------------------------------------------------
/mapreduce/mr_s3_log_parser.py:
--------------------------------------------------------------------------------
1 |
2 | import time
3 | from mrjob.job import MRJob
4 | from mrjob.protocol import RawValueProtocol, ReprProtocol
5 | import re
6 |
7 |
8 | class MrS3LogParser(MRJob):
9 | """Parses the logs from S3 based on the S3 logging format:
10 | http://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html
11 |
12 | Aggregates a user's daily requests by user agent and operation
13 |
14 | Outputs date_time, requester, user_agent, operation, count
15 | """
16 |
17 | LOGPATS = r'(\S+) (\S+) \[(.*?)\] (\S+) (\S+) ' \
18 | r'(\S+) (\S+) (\S+) ("([^"]+)"|-) ' \
19 | r'(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ' \
20 | r'("([^"]+)"|-) ("([^"]+)"|-)'
21 | NUM_ENTRIES_PER_LINE = 17
22 | logpat = re.compile(LOGPATS)
23 |
24 | (S3_LOG_BUCKET_OWNER,
25 | S3_LOG_BUCKET,
26 | S3_LOG_DATE_TIME,
27 | S3_LOG_IP,
28 | S3_LOG_REQUESTER_ID,
29 | S3_LOG_REQUEST_ID,
30 | S3_LOG_OPERATION,
31 | S3_LOG_KEY,
32 | S3_LOG_HTTP_METHOD,
33 | S3_LOG_HTTP_STATUS,
34 | S3_LOG_S3_ERROR,
35 | S3_LOG_BYTES_SENT,
36 | S3_LOG_OBJECT_SIZE,
37 | S3_LOG_TOTAL_TIME,
38 | S3_LOG_TURN_AROUND_TIME,
39 | S3_LOG_REFERER,
40 | S3_LOG_USER_AGENT) = range(NUM_ENTRIES_PER_LINE)
41 |
42 | DELIMITER = '\t'
43 |
44 | # We use RawValueProtocol for input to be format agnostic
45 | # and avoid any type of parsing errors
46 | INPUT_PROTOCOL = RawValueProtocol
47 |
48 | # We use RawValueProtocol for output so we can output raw lines
49 | # instead of (k, v) pairs
50 | OUTPUT_PROTOCOL = RawValueProtocol
51 |
52 | # Encode the intermediate records using repr() instead of JSON, so the
53 | # record doesn't get Unicode-encoded
54 | INTERNAL_PROTOCOL = ReprProtocol
55 |
56 | def clean_date_time_zone(self, raw_date_time_zone):
57 | """Converts entry 22/Jul/2013:21:04:17 +0000 to the format
58 | 'YYYY-MM-DD HH:MM:SS' which is more suitable for loading into
59 | a database such as Redshift or RDS
60 |
61 | Note: requires the chars "[ ]" to be stripped prior to input
62 | Returns the converted datetime annd timezone
63 | or None for both values if failed
64 |
65 | TODO: Needs to combine timezone with date as one field
66 | """
67 | date_time = None
68 | time_zone_parsed = None
69 |
70 | # TODO: Probably cleaner to parse this with a regex
71 | date_parsed = raw_date_time_zone[:raw_date_time_zone.find(":")]
72 | time_parsed = raw_date_time_zone[raw_date_time_zone.find(":") + 1:
73 | raw_date_time_zone.find("+") - 1]
74 | time_zone_parsed = raw_date_time_zone[raw_date_time_zone.find("+"):]
75 |
76 | try:
77 | date_struct = time.strptime(date_parsed, "%d/%b/%Y")
78 | converted_date = time.strftime("%Y-%m-%d", date_struct)
79 | date_time = converted_date + " " + time_parsed
80 |
81 | # A ValueError raised here propagates to the calling function,
82 | # which catches and handles it appropriately
83 | except ValueError:
84 | raise
85 | else:
86 | return converted_date, date_time, time_zone_parsed
87 |
88 | def mapper(self, _, line):
89 | line = line.strip()
90 | match = self.logpat.search(line)
91 |
92 | date_time = None
93 | requester = None
94 | user_agent = None
95 | operation = None
96 |
97 | try:
98 | for n in range(self.NUM_ENTRIES_PER_LINE):
99 | group = match.group(1 + n)
100 |
101 | if n == self.S3_LOG_DATE_TIME:
102 | date, date_time, time_zone_parsed = \
103 | self.clean_date_time_zone(group)
104 | # Leave the following line of code if
105 | # you want to aggregate by date
106 | date_time = date + " 00:00:00"
107 | elif n == self.S3_LOG_REQUESTER_ID:
108 | requester = group
109 | elif n == self.S3_LOG_USER_AGENT:
110 | user_agent = group
111 | elif n == self.S3_LOG_OPERATION:
112 | operation = group
113 | else:
114 | pass
115 |
116 | except Exception:
117 | yield (("Error while parsing line: %s", line), 1)
118 | else:
119 | yield ((date_time, requester, user_agent, operation), 1)
120 |
121 | def reducer(self, key, values):
122 | output = list(key)
123 | output = self.DELIMITER.join(output) + \
124 | self.DELIMITER + \
125 | str(sum(values))
126 |
127 | yield None, output
128 |
129 | def steps(self):
130 | return [
131 | self.mr(mapper=self.mapper,
132 | reducer=self.reducer)
133 | ]
134 |
135 |
136 | if __name__ == '__main__':
137 | MrS3LogParser.run()
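A typical invocation, assuming an S3 access log saved locally as s3_log.txt (hypothetical file name); as the accompanying tests show, mrjob's inline runner and "-" for stdin also work:

    python mr_s3_log_parser.py s3_log.txt
    python mr_s3_log_parser.py -r inline --no-conf - < s3_log.txt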
--------------------------------------------------------------------------------
/mapreduce/test_mr_s3_log_parser.py:
--------------------------------------------------------------------------------
1 |
2 | from StringIO import StringIO
3 | import unittest2 as unittest
4 | from mr_s3_log_parser import MrS3LogParser
5 |
6 |
7 | class MrTestsUtil:
8 |
9 | def run_mr_sandbox(self, mr_job, stdin):
10 | # inline runs the job in the same process so small jobs tend to
11 | # run faster and stack traces are simpler
12 | # --no-conf prevents options from local mrjob.conf from polluting
13 | # the testing environment
14 | # "-" reads from standard in
15 | mr_job.sandbox(stdin=stdin)
16 |
17 | # make_runner ensures job cleanup is performed regardless of
18 | # success or failure
19 | with mr_job.make_runner() as runner:
20 | runner.run()
21 | for line in runner.stream_output():
22 | key, value = mr_job.parse_output_line(line)
23 | yield value
24 |
25 |
26 | class TestMrS3LogParser(unittest.TestCase):
27 |
28 | mr_job = None
29 | mr_tests_util = None
30 |
31 | RAW_LOG_LINE_INVALID = \
32 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
33 | '00000388225bcc00000 ' \
34 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \
35 | '00.111.222.33 '
36 |
37 | RAW_LOG_LINE_VALID = \
38 | '00000fe9688b6e57f75bd2b7f7c1610689e8f01000000' \
39 | '00000388225bcc00000 ' \
40 | 's3-storage [22/Jul/2013:21:03:27 +0000] ' \
41 | '00.111.222.33 ' \
42 | 'arn:aws:sts::000005646931:federated-user/user 00000AB825500000 ' \
43 | 'REST.HEAD.OBJECT user/file.pdf ' \
44 | '"HEAD /user/file.pdf?versionId=00000XMHZJp6DjM9x500000' \
45 | '00000SDZk ' \
46 | 'HTTP/1.1" 200 - - 4000272 18 - "-" ' \
47 | '"Boto/2.5.1 (darwin) USER-AGENT/1.0.14.0" ' \
48 | '00000XMHZJp6DjM9x5JVEAMo8MG00000'
49 |
50 | DATE_TIME_ZONE_INVALID = "AB/Jul/2013:21:04:17 +0000"
51 | DATE_TIME_ZONE_VALID = "22/Jul/2013:21:04:17 +0000"
52 | DATE_VALID = "2013-07-22"
53 | DATE_TIME_VALID = "2013-07-22 21:04:17"
54 | TIME_ZONE_VALID = "+0000"
55 |
56 | def __init__(self, *args, **kwargs):
57 | super(TestMrS3LogParser, self).__init__(*args, **kwargs)
58 | self.mr_job = MrS3LogParser(['-r', 'inline', '--no-conf', '-'])
59 | self.mr_tests_util = MrTestsUtil()
60 |
61 | def test_invalid_log_lines(self):
62 | stdin = StringIO(self.RAW_LOG_LINE_INVALID)
63 |
64 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
65 | self.assertEqual(result.find("Error"), 0)
66 |
67 | def test_valid_log_lines(self):
68 | stdin = StringIO(self.RAW_LOG_LINE_VALID)
69 |
70 | for result in self.mr_tests_util.run_mr_sandbox(self.mr_job, stdin):
71 | self.assertEqual(result.find("Error"), -1)
72 |
73 | def test_clean_date_time_zone(self):
74 | date, date_time, time_zone_parsed = \
75 | self.mr_job.clean_date_time_zone(self.DATE_TIME_ZONE_VALID)
76 | self.assertEqual(date, self.DATE_VALID)
77 | self.assertEqual(date_time, self.DATE_TIME_VALID)
78 | self.assertEqual(time_zone_parsed, self.TIME_ZONE_VALID)
79 |
80 | # Use a lambda to delay the calling of clean_date_time_zone so that
81 | # assertRaises has enough time to handle it properly
82 | self.assertRaises(ValueError,
83 | lambda: self.mr_job.clean_date_time_zone(
84 | self.DATE_TIME_ZONE_INVALID))
85 |
86 | if __name__ == '__main__':
87 | unittest.main()
88 |
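The tests target Python 2 (note the StringIO import) and can be run directly, assuming mrjob and unittest2 are installed:

    python test_mr_s3_log_parser.py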
--------------------------------------------------------------------------------
/matplotlib/04.15-Further-Resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Further Resources"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Matplotlib Resources\n",
36 | "\n",
37 | "A single chapter in a book can never hope to cover all the available features and plot types available in Matplotlib.\n",
38 | "As with other packages we've seen, liberal use of IPython's tab-completion and help functions (see [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)) can be very helpful when exploring Matplotlib's API.\n",
39 | "In addition, Matplotlib’s [online documentation](http://matplotlib.org/) can be a helpful reference.\n",
40 | "See in particular the [Matplotlib gallery](http://matplotlib.org/gallery.html) linked on that page: it shows thumbnails of hundreds of different plot types, each one linked to a page with the Python code snippet used to generate it.\n",
41 | "In this way, you can visually inspect and learn about a wide range of different plotting styles and visualization techniques.\n",
42 | "\n",
43 | "For a book-length treatment of Matplotlib, I would recommend [*Interactive Applications Using Matplotlib*](https://www.packtpub.com/application-development/interactive-applications-using-matplotlib), written by Matplotlib core developer Ben Root."
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Other Python Graphics Libraries\n",
51 | "\n",
52 | "Although Matplotlib is the most prominent Python visualization library, there are other more modern tools that are worth exploring as well.\n",
53 | "I'll mention a few of them briefly here:\n",
54 | "\n",
55 | "- [Bokeh](http://bokeh.pydata.org) is a JavaScript visualization library with a Python frontend that creates highly interactive visualizations capable of handling very large and/or streaming datasets. The Python front-end outputs a JSON data structure that can be interpreted by the Bokeh JS engine.\n",
56 | "- [Plotly](http://plot.ly) is the eponymous open source product of the Plotly company, and is similar in spirit to Bokeh. Because Plotly is the main product of a startup, it is receiving a high level of development effort. Use of the library is entirely free.\n",
57 | "- [Vispy](http://vispy.org/) is an actively developed project focused on dynamic visualizations of very large datasets. Because it is built to target OpenGL and make use of efficient graphics processors in your computer, it is able to render some quite large and stunning visualizations.\n",
58 | "- [Vega](https://vega.github.io/) and [Vega-Lite](https://vega.github.io/vega-lite) are declarative graphics representations, and are the product of years of research into the fundamental language of data visualization. The reference rendering implementation is JavaScript, but the API is language agnostic. There is a Python API under development in the [Altair](https://altair-viz.github.io/) package. Though as of summer 2016 it's not yet fully mature, I'm quite excited for the possibilities of this project to provide a common reference point for visualization in Python and other languages.\n",
59 | "\n",
60 | "The visualization space in the Python community is very dynamic, and I fully expect this list to be out of date as soon as it is published.\n",
61 | "Keep an eye out for what's coming in the future!"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "\n",
69 | "< [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb) | [Contents](Index.ipynb) | [Machine Learning](05.00-Machine-Learning.ipynb) >"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.4.3"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 0
94 | }
95 |
--------------------------------------------------------------------------------
/matplotlib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/__init__.py
--------------------------------------------------------------------------------
/matplotlib/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/matplotlib/tests/__init__.py
--------------------------------------------------------------------------------
/misc/regex.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Regex\n",
8 | "\n",
9 | "Credits: Material from [Regular Expressions Cheat Sheet](http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/) by Dave Child\n",
10 | "\n",
11 | "Use with http://www.regexr.com to generate regular expressions."
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n",
19 | "
\n",
20 | "
\n",
21 | "
\n",
22 | "
"
23 | ]
24 | }
25 | ],
26 | "metadata": {
27 | "kernelspec": {
28 | "display_name": "Python 2",
29 | "language": "python",
30 | "name": "python2"
31 | },
32 | "language_info": {
33 | "codemirror_mode": {
34 | "name": "ipython",
35 | "version": 2
36 | },
37 | "file_extension": ".py",
38 | "mimetype": "text/x-python",
39 | "name": "python",
40 | "nbconvert_exporter": "python",
41 | "pygments_lexer": "ipython2",
42 | "version": "2.7.10"
43 | }
44 | },
45 | "nbformat": 4,
46 | "nbformat_minor": 0
47 | }
48 |
--------------------------------------------------------------------------------
/numpy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/__init__.py
--------------------------------------------------------------------------------
/numpy/figures/02.05-broadcasting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/02.05-broadcasting.png
--------------------------------------------------------------------------------
/numpy/figures/PDSH-cover-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/PDSH-cover-small.png
--------------------------------------------------------------------------------
/numpy/figures/array_vs_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/array_vs_list.png
--------------------------------------------------------------------------------
/numpy/figures/cint_vs_pyint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/figures/cint_vs_pyint.png
--------------------------------------------------------------------------------
/numpy/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/numpy/tests/__init__.py
--------------------------------------------------------------------------------
/pandas/03.00-Introduction-to-Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Data Manipulation with Pandas"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "In the previous chapter, we dove into detail on NumPy and its ``ndarray`` object, which provides efficient storage and manipulation of dense typed arrays in Python.\n",
36 | "Here we'll build on this knowledge by looking in detail at the data structures provided by the Pandas library.\n",
37 | "Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a ``DataFrame``.\n",
38 | "``DataFrame``s are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.\n",
39 | "As well as offering a convenient storage interface for labeled data, Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.\n",
40 | "\n",
41 | "As we saw, NumPy's ``ndarray`` data structure provides essential features for the type of clean, well-organized data typically seen in numerical computing tasks.\n",
42 | "While it serves this purpose very well, its limitations become clear when we need more flexibility (e.g., attaching labels to data, working with missing data, etc.) and when attempting operations that do not map well to element-wise broadcasting (e.g., groupings, pivots, etc.), each of which is an important piece of analyzing the less structured data available in many forms in the world around us.\n",
43 | "Pandas, and in particular its ``Series`` and ``DataFrame`` objects, builds on the NumPy array structure and provides efficient access to these sorts of \"data munging\" tasks that occupy much of a data scientist's time.\n",
44 | "\n",
45 | "In this chapter, we will focus on the mechanics of using ``Series``, ``DataFrame``, and related structures effectively.\n",
46 | "We will use examples drawn from real datasets where appropriate, but these examples are not necessarily the focus."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Installing and Using Pandas\n",
54 | "\n",
55 | "Installation of Pandas on your system requires NumPy to be installed, and if building the library from source, requires the appropriate tools to compile the C and Cython sources on which Pandas is built.\n",
56 | "Details on this installation can be found in the [Pandas documentation](http://pandas.pydata.org/).\n",
57 | "If you followed the advice outlined in the [Preface](00.00-Preface.ipynb) and used the Anaconda stack, you already have Pandas installed.\n",
58 | "\n",
59 | "Once Pandas is installed, you can import it and check the version:"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 1,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/plain": [
72 | "'0.18.1'"
73 | ]
74 | },
75 | "execution_count": 1,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "import pandas\n",
82 | "pandas.__version__"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "Just as we generally import NumPy under the alias ``np``, we will import Pandas under the alias ``pd``:"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 2,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "import pandas as pd"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "This import convention will be used throughout the remainder of this book."
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Reminder about Built-In Documentation\n",
115 | "\n",
116 | "As you read through this chapter, don't forget that IPython gives you the ability to quickly explore the contents of a package (by using the tab-completion feature) as well as the documentation of various functions (using the ``?`` character). (Refer back to [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb) if you need a refresher on this.)\n",
117 | "\n",
118 | "For example, to display all the contents of the pandas namespace, you can type\n",
119 | "\n",
120 | "```ipython\n",
121 | "In [3]: pd.\n",
122 | "```\n",
123 | "\n",
124 | "And to display Pandas's built-in documentation, you can use this:\n",
125 | "\n",
126 | "```ipython\n",
127 | "In [4]: pd?\n",
128 | "```\n",
129 | "\n",
130 | "More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/."
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "\n",
138 | "< [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb) | [Contents](Index.ipynb) | [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb) >"
139 | ]
140 | }
141 | ],
142 | "metadata": {
143 | "anaconda-cloud": {},
144 | "kernelspec": {
145 | "display_name": "Python 3",
146 | "language": "python",
147 | "name": "python3"
148 | },
149 | "language_info": {
150 | "codemirror_mode": {
151 | "name": "ipython",
152 | "version": 3
153 | },
154 | "file_extension": ".py",
155 | "mimetype": "text/x-python",
156 | "name": "python",
157 | "nbconvert_exporter": "python",
158 | "pygments_lexer": "ipython3",
159 | "version": "3.4.3"
160 | }
161 | },
162 | "nbformat": 4,
163 | "nbformat_minor": 0
164 | }
165 |
--------------------------------------------------------------------------------
/pandas/03.13-Further-Resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "
\n",
9 | "*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n",
10 | "\n",
11 | "*The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!*\n",
12 | "\n",
13 | "*No changes were made to the contents of this notebook from the original.*"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "\n",
21 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Further Resources\n",
29 | "\n",
30 | "In this chapter, we've covered many of the basics of using Pandas effectively for data analysis.\n",
31 | "Still, much has been omitted from our discussion.\n",
32 | "To learn more about Pandas, I recommend the following resources:\n",
33 | "\n",
34 | "- [Pandas online documentation](http://pandas.pydata.org/): This is the go-to source for complete documentation of the package. While the examples in the documentation tend to be small generated datasets, the description of the options is complete and generally very useful for understanding the use of various functions.\n",
35 | "\n",
36 | "- [*Python for Data Analysis*](http://shop.oreilly.com/product/0636920023784.do) Written by Wes McKinney (the original creator of Pandas), this book contains much more detail on the Pandas package than we had room for in this chapter. In particular, he takes a deep dive into tools for time series, which were his bread and butter as a financial consultant. The book also has many entertaining examples of applying Pandas to gain insight from real-world datasets. Keep in mind, though, that the book is now several years old, and the Pandas package has quite a few new features that this book does not cover (but be on the lookout for a new edition in 2017).\n",
37 | "\n",
38 | "- [Stack Overflow](http://stackoverflow.com/questions/tagged/pandas): Pandas has so many users that any question you have has likely been asked and answered on Stack Overflow. Using Pandas is a case where some Google-Fu is your best friend. Simply go to your favorite search engine and type in the question, problem, or error you're coming across–more than likely you'll find your answer on a Stack Overflow page.\n",
39 | "\n",
40 | "- [Pandas on PyVideo](http://pyvideo.org/search?q=pandas): From PyCon to SciPy to PyData, many conferences have featured tutorials from Pandas developers and power users. The PyCon tutorials in particular tend to be given by very well-vetted presenters.\n",
41 | "\n",
42 | "Using these resources, combined with the walk-through given in this chapter, my hope is that you'll be poised to use Pandas to tackle any data analysis problem you come across!"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "\n",
50 | "< [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb) | [Contents](Index.ipynb) | [Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb) >"
51 | ]
52 | }
53 | ],
54 | "metadata": {
55 | "anaconda-cloud": {},
56 | "kernelspec": {
57 | "display_name": "Python 3",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 3
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython3",
71 | "version": "3.4.3"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 0
76 | }
77 |
--------------------------------------------------------------------------------
/pandas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/__init__.py
--------------------------------------------------------------------------------
/pandas/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/pandas/tests/__init__.py
--------------------------------------------------------------------------------
/python-data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/__init__.py
--------------------------------------------------------------------------------
/python-data/files.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Files\n",
15 | "\n",
16 | "* Read a File\n",
17 | "* Write a File\n",
18 | "* Read and Write UTF-8"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Read a File\n",
26 | "\n",
27 | "Open a file in read-only mode.
\n",
28 | "Iterate over the file lines. rstrip removes the EOL markers.
"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "class TypeUtil:\n",
43 | "\n",
44 | " @classmethod\n",
45 | " def is_iterable(cls, obj):\n",
46 | " \"\"\"Determines if obj is iterable.\n",
47 | "\n",
48 | " Useful when writing functions that can accept multiple types of\n",
49 | " input (list, tuple, ndarray, iterator). Pairs well with\n",
50 | " convert_to_list.\n",
51 | " \"\"\"\n",
52 | " try:\n",
53 | " iter(obj)\n",
54 | " return True\n",
55 | " except TypeError:\n",
56 | " return False\n",
57 | "\n",
58 | " @classmethod\n",
59 | " def convert_to_list(cls, obj):\n",
60 | " \"\"\"Converts obj to a list if it is not a list and it is iterable,\n",
61 | " else returns the original obj.\n",
62 | " \"\"\"\n",
63 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n",
64 | " obj = list(obj)\n",
65 | " return obj\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "old_file_path = 'type_util.py'\n",
71 | "with open(old_file_path, 'r') as old_file:\n",
72 | " for line in old_file:\n",
73 | " print(line.rstrip())"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## Write to a file\n",
81 | "\n",
82 | "Create a new file overwriting any previous file with the same name, write text, then close the file:"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 2,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "new_file_path = 'hello_world.txt'\n",
94 | "with open(new_file_path, 'w') as new_file:\n",
95 | " new_file.write('hello world!')"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Read and Write UTF-8"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "import codecs\n",
114 | "with codecs.open(\"hello_world_new.txt\", \"a\", \"utf-8\") as new_file:\n",
115 | " with codecs.open(\"hello_world.txt\", \"r\", \"utf-8\") as old_file: \n",
116 | " for line in old_file:\n",
117 | " new_file.write(line + '\\n')"
118 | ]
119 | }
120 | ],
121 | "metadata": {
122 | "kernelspec": {
123 | "display_name": "Python 2",
124 | "language": "python",
125 | "name": "python2"
126 | },
127 | "language_info": {
128 | "codemirror_mode": {
129 | "name": "ipython",
130 | "version": 2
131 | },
132 | "file_extension": ".py",
133 | "mimetype": "text/x-python",
134 | "name": "python",
135 | "nbconvert_exporter": "python",
136 | "pygments_lexer": "ipython2",
137 | "version": "2.7.10"
138 | }
139 | },
140 | "nbformat": 4,
141 | "nbformat_minor": 0
142 | }
143 |
--------------------------------------------------------------------------------
/python-data/hello_world.txt:
--------------------------------------------------------------------------------
1 | hello world!
--------------------------------------------------------------------------------
/python-data/logs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Logging in Python\n",
15 | "* Logging with RotatingFileHandler\n",
16 | "* Logging with TimedRotatingFileHandler "
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Logging with RotatingFileHandler\n",
24 | "\n",
25 | "The logging discussion is taken from the [Python Logging Cookbook](https://docs.python.org/2/howto/logging-cookbook.html#using-file-rotation):\n",
26 | "\n",
27 | "Sometimes you want to let a log file grow to a certain size, then open a new file and log to that. You may want to keep a certain number of these files, and when that many files have been created, rotate the files so that the number of files and the size of the files both remain bounded. For this usage pattern, the logging package provides a RotatingFileHandler.\n",
28 | "\n",
29 | "The most current file is always logging_rotatingfile_example.out, and each time it reaches the size limit it is renamed with the suffix .1. Each of the existing backup files is renamed to increment the suffix (.1 becomes .2, etc.) and the .6 file is erased.\n",
30 | "\n",
31 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import logging\n",
43 | "import time\n",
44 | " \n",
45 | "from logging.handlers import RotatingFileHandler\n",
46 | " \n",
47 | "#----------------------------------------------------------------------\n",
48 | "def create_rotating_log(path):\n",
49 | " \"\"\"\n",
50 | " Creates a rotating log\n",
51 | " \"\"\"\n",
52 | " logger = logging.getLogger(\"Rotating Log\")\n",
53 | " logger.setLevel(logging.INFO)\n",
54 | " \n",
55 | " # add a rotating handler\n",
56 | " handler = RotatingFileHandler(path, maxBytes=20,\n",
57 | " backupCount=5)\n",
58 | " logger.addHandler(handler)\n",
59 | " \n",
60 | " for i in range(10):\n",
61 | " logger.info(\"This is test log line %s\" % i)\n",
62 | " time.sleep(1.5)\n",
63 | " \n",
64 | "#----------------------------------------------------------------------\n",
65 | "if __name__ == \"__main__\":\n",
66 | " log_file = \"test.log\"\n",
67 | " create_rotating_log(log_file)"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "## Logging with TimedRotatingFileHandler\n",
75 | "\n",
76 | "The following code snippet is taken from [here](http://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/)."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [],
86 | "source": [
87 | "import logging\n",
88 | "import time\n",
89 | " \n",
90 | "from logging.handlers import TimedRotatingFileHandler\n",
91 | " \n",
92 | "#----------------------------------------------------------------------\n",
93 | "def create_timed_rotating_log(path):\n",
94 | " \"\"\"\"\"\"\n",
95 | " logger = logging.getLogger(\"Rotating Log\")\n",
96 | " logger.setLevel(logging.INFO)\n",
97 | " \n",
98 | " # Rotate log based on when parameter:\n",
99 | " # second (s)\n",
100 | " # minute (m)\n",
101 | " # hour (h)\n",
102 | " # day (d)\n",
103 | " # w0-w6 (weekday, 0=Monday)\n",
104 | " # midnight\n",
105 | " handler = TimedRotatingFileHandler(path,\n",
106 | " when=\"m\",\n",
107 | " interval=1,\n",
108 | " backupCount=5)\n",
109 | " logger.addHandler(handler)\n",
110 | " \n",
111 | " for i in range(20):\n",
112 | " logger.info(\"This is a test!\")\n",
113 | " time.sleep(1.5)\n",
114 | " \n",
115 | "#----------------------------------------------------------------------\n",
116 | "if __name__ == \"__main__\":\n",
117 | " log_file = \"timed_test.log\"\n",
118 | " create_timed_rotating_log(log_file)"
119 | ]
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 2",
125 | "language": "python",
126 | "name": "python2"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 2
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython2",
138 | "version": "2.7.10"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 0
143 | }
144 |
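Neither snippet above attaches a formatter, so the rotated files contain bare messages. A hedged sketch of adding timestamps and level names with `logging.Formatter` (the logger and file names here are illustrative):

```python
import logging
from logging.handlers import RotatingFileHandler

logger = logging.getLogger("Formatted Rotating Log")
logger.setLevel(logging.INFO)

handler = RotatingFileHandler("formatted_test.log",
                              maxBytes=1024, backupCount=5)
# Prefix each record with a timestamp and its level name
handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(handler)

logger.info("This record carries a timestamp and level")
```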
--------------------------------------------------------------------------------
/python-data/pdb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# PDB\n",
15 | "\n",
16 | "The pdb module defines an interactive source code debugger for Python programs. Below are frequently used commands:"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {
23 | "collapsed": false
24 | },
25 | "outputs": [],
26 | "source": [
27 | "# Run pdb when this line is hit\n",
28 | "import pdb; pdb.set_trace()\n",
29 | "\n",
30 | "# Run pdb when the script is run\n",
31 | "python -m pdb script.py\n",
32 | "\n",
33 | "# Help\n",
34 | "h[elp]\n",
35 | "\n",
36 | "# Show current content\n",
37 | "l[ist]\n",
38 | "\n",
39 | "# Examine variables\n",
40 | "p[rint]\n",
41 | "\n",
42 | "# Pretty print\n",
43 | "pp\n",
44 | "\n",
45 | "# Go to next line\n",
46 | "n[ext]\n",
47 | "\n",
48 | "# Step into\n",
49 | "s[tep]\n",
50 | "\n",
51 | "# Continue execution until the line with the line number greater \n",
52 | "# than the current one is reached or when returning from current frame.\n",
53 | "until\n",
54 | "\n",
55 | "# Return\n",
56 | "r[eturn]\n",
57 | "\n",
58 | "# See all breakpoints\n",
59 | "b to see all breakpoints\n",
60 | "\n",
61 | "# Set breakpoint at line 16\n",
62 | "b 16 \n",
63 | "\n",
64 | "# Clear breakpoint 1\n",
65 | "cl[ear] 1\n",
66 | "\n",
67 | "# Continue\n",
68 | "c[ontinue]\n",
69 | "\n",
70 | "# Conditional breakpoints, line 11\n",
71 | "b 11, this_year == 2015\n",
72 | "\n",
73 | "# Stack location\n",
74 | "w[here]\n",
75 | "\n",
76 | "# Go up in stack\n",
77 | "u[p]\n",
78 | "\n",
79 | "# Go down in stack\n",
80 | "d[own]\n",
81 | "\n",
82 | "# Longlist shows full method of where you're in (Python 3)\n",
83 | "ll\n",
84 | "\n",
85 | "# Quit\n",
86 | "q[uit]"
87 | ]
88 | }
89 | ],
90 | "metadata": {
91 | "kernelspec": {
92 | "display_name": "Python 2",
93 | "language": "python",
94 | "name": "python2"
95 | },
96 | "language_info": {
97 | "codemirror_mode": {
98 | "name": "ipython",
99 | "version": 2
100 | },
101 | "file_extension": ".py",
102 | "mimetype": "text/x-python",
103 | "name": "python",
104 | "nbconvert_exporter": "python",
105 | "pygments_lexer": "ipython2",
106 | "version": "2.7.10"
107 | }
108 | },
109 | "nbformat": 4,
110 | "nbformat_minor": 0
111 | }
112 |
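To tie the cheat sheet together, a minimal throwaway script (the file name `buggy.py` is hypothetical) that pauses in the debugger just before a failing line; at the `(Pdb)` prompt the commands above apply directly:

```python
# buggy.py -- run with: python buggy.py
import pdb


def average(values):
    total = sum(values)
    pdb.set_trace()  # execution pauses here; try: p total, l, n, c
    return total / len(values)  # ZeroDivisionError when values is empty


if __name__ == '__main__':
    print(average([1, 2, 3]))
    print(average([]))
```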
--------------------------------------------------------------------------------
/python-data/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/python-data/tests/__init__.py
--------------------------------------------------------------------------------
/python-data/tests/test_transform_util.py:
--------------------------------------------------------------------------------
1 | from nose.tools import assert_equal
2 | from ..transform_util import TransformUtil
3 |
4 |
5 | class TestTransformUtil():
6 |
7 |     states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia',
8 | 'FlOrIda', 'south carolina##', 'West virginia?']
9 |
10 | expected_output = ['Alabama',
11 | 'Georgia',
12 | 'Georgia',
13 | 'Georgia',
14 | 'Florida',
15 | 'South Carolina',
16 | 'West Virginia']
17 |
18 | def test_remove_punctuation(self):
19 | assert_equal(TransformUtil.remove_punctuation('!#?'), '')
20 |
21 | def test_map_remove_punctuation(self):
22 | # Map applies a function to a collection
23 | output = map(TransformUtil.remove_punctuation, self.states)
24 | assert_equal('!#?' not in output, True)
25 |
26 | def test_clean_strings(self):
27 | clean_ops = [str.strip, TransformUtil.remove_punctuation, str.title]
28 | output = TransformUtil.clean_strings(self.states, clean_ops)
29 | assert_equal(output, self.expected_output)
--------------------------------------------------------------------------------
/python-data/tests/test_type_util.py:
--------------------------------------------------------------------------------
1 | from nose.tools import assert_equal
2 | from ..type_util import TypeUtil
3 |
4 |
5 | class TestUtil():
6 |
7 | def test_is_iterable(self):
8 | assert_equal(TypeUtil.is_iterable('foo'), True)
9 | assert_equal(TypeUtil.is_iterable(7), False)
10 |
11 | def test_convert_to_list(self):
12 | assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)
13 | assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)
--------------------------------------------------------------------------------
/python-data/transform_util.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | class TransformUtil:
5 |
6 | @classmethod
7 | def remove_punctuation(cls, value):
8 | """Removes !, #, and ?.
9 | """
10 | return re.sub('[!#?]', '', value)
11 |
12 | @classmethod
13 | def clean_strings(cls, strings, ops):
14 | """General purpose method to clean strings.
15 |
16 | Pass in a sequence of strings and the operations to perform.
17 | """
18 | result = []
19 | for value in strings:
20 | for function in ops:
21 | value = function(value)
22 | result.append(value)
23 | return result
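For reference, a short usage sketch of `clean_strings` that mirrors the test suite above (assumes it is run from the same directory so the plain import resolves):

```python
from transform_util import TransformUtil

states = [' Alabama ', 'georgia!']
ops = [str.strip, TransformUtil.remove_punctuation, str.title]
print(TransformUtil.clean_strings(states, ops))  # ['Alabama', 'Georgia']
```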
--------------------------------------------------------------------------------
/python-data/type_util.py:
--------------------------------------------------------------------------------
1 | class TypeUtil:
2 |
3 | @classmethod
4 | def is_iterable(cls, obj):
5 | """Determines if obj is iterable.
6 |
7 | Useful when writing functions that can accept multiple types of
8 | input (list, tuple, ndarray, iterator). Pairs well with
9 | convert_to_list.
10 | """
11 | try:
12 | iter(obj)
13 | return True
14 | except TypeError:
15 | return False
16 |
17 | @classmethod
18 | def convert_to_list(cls, obj):
19 | """Converts obj to a list if it is not a list and it is iterable,
20 | else returns the original obj.
21 | """
22 | if not isinstance(obj, list) and cls.is_iterable(obj):
23 | obj = list(obj)
24 | return obj
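A quick usage sketch, again assuming the module is importable from the working directory:

```python
from type_util import TypeUtil

TypeUtil.is_iterable('foo')       # True: strings support iter()
TypeUtil.is_iterable(7)           # False: iter(7) raises TypeError
TypeUtil.convert_to_list((1, 2))  # [1, 2]: an iterable non-list becomes a list
TypeUtil.convert_to_list(7)       # 7: non-iterables pass through unchanged
```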
--------------------------------------------------------------------------------
/python-data/unit_tests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Nose Unit Tests with IPython Notebook"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Nose\n",
22 | "\n",
23 | "Testing is a vital part of software development. Nose extends unittest to make testing easier."
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Install Nose\n",
31 | "\n",
32 | "Run the following command line:"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "!pip install nose"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Create the Code\n",
51 | "\n",
52 | "Save your code to a file with the %%file magic:"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "Overwriting type_util.py\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "%%file type_util.py\n",
72 | "class TypeUtil:\n",
73 | "\n",
74 | " @classmethod\n",
75 | " def is_iterable(cls, obj):\n",
76 | " \"\"\"Determines if obj is iterable.\n",
77 | "\n",
78 | " Useful when writing functions that can accept multiple types of\n",
79 | " input (list, tuple, ndarray, iterator). Pairs well with\n",
80 | " convert_to_list.\n",
81 | " \"\"\"\n",
82 | " try:\n",
83 | " iter(obj)\n",
84 | " return True\n",
85 | " except TypeError:\n",
86 | " return False\n",
87 | "\n",
88 | " @classmethod\n",
89 | " def convert_to_list(cls, obj):\n",
90 | " \"\"\"Converts obj to a list if it is not a list and it is iterable, \n",
91 | " else returns the original obj.\n",
92 | " \"\"\"\n",
93 | " if not isinstance(obj, list) and cls.is_iterable(obj):\n",
94 | " obj = list(obj)\n",
95 | " return obj\n"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Create the Nose Tests\n",
103 | "\n",
104 | "Save your test to a file with the %%file magic:"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 2,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Overwriting tests/test_type_util.py\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "%%file tests/test_type_util.py\n",
124 | "from nose.tools import assert_equal\n",
125 | "from ..type_util import TypeUtil\n",
126 | "\n",
127 | "\n",
128 | "class TestUtil():\n",
129 | "\n",
130 | " def test_is_iterable(self):\n",
131 | " assert_equal(TypeUtil.is_iterable('foo'), True)\n",
132 | " assert_equal(TypeUtil.is_iterable(7), False)\n",
133 | "\n",
134 | " def test_convert_to_list(self):\n",
135 | " assert_equal(isinstance(TypeUtil.convert_to_list('foo'), list), True)\n",
136 | " assert_equal(isinstance(TypeUtil.convert_to_list(7), list), False)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## Run the Nose Tests\n",
144 | "\n",
145 | "Run the following command line:"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "core.tests.test_type_util.TestUtil.test_convert_to_list ... ok\r\n",
160 | "core.tests.test_type_util.TestUtil.test_is_iterable ... ok\r\n",
161 | "\r\n",
162 | "----------------------------------------------------------------------\r\n",
163 | "Ran 2 tests in 0.001s\r\n",
164 | "\r\n",
165 | "OK\r\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "!nosetests tests/test_type_util.py -v"
171 | ]
172 | }
173 | ],
174 | "metadata": {
175 | "kernelspec": {
176 | "display_name": "Python 2",
177 | "language": "python",
178 | "name": "python2"
179 | },
180 | "language_info": {
181 | "codemirror_mode": {
182 | "name": "ipython",
183 | "version": 2
184 | },
185 | "file_extension": ".py",
186 | "mimetype": "text/x-python",
187 | "name": "python",
188 | "nbconvert_exporter": "python",
189 | "pygments_lexer": "ipython2",
190 | "version": "2.7.10"
191 | }
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 0
195 | }
196 |
--------------------------------------------------------------------------------
/scikit-learn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/__init__.py
--------------------------------------------------------------------------------
/scikit-learn/fig_code/ML_flow_chart.py:
--------------------------------------------------------------------------------
1 | """
2 | Tutorial Diagrams
3 | -----------------
4 |
5 | This script plots the flow-charts used in the scikit-learn tutorials.
6 | """
7 |
8 | import numpy as np
9 | import pylab as pl
10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow
11 |
12 | def create_base(box_bg = '#CCCCCC',
13 | arrow1 = '#88CCFF',
14 | arrow2 = '#88FF88',
15 | supervised=True):
16 | fig = pl.figure(figsize=(9, 6), facecolor='w')
17 | ax = pl.axes((0, 0, 1, 1),
18 | xticks=[], yticks=[], frameon=False)
19 | ax.set_xlim(0, 9)
20 | ax.set_ylim(0, 6)
21 |
22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),
25 |
26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),
29 |
30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),
31 |
32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),
33 |
34 | Circle((5.5, 3.5), 1.0, fc=box_bg),
35 |
36 | Polygon([[5.5, 1.7],
37 | [6.1, 1.1],
38 | [5.5, 0.5],
39 | [4.9, 1.1]], fc=box_bg),
40 |
41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
42 | width=0.25, head_width=0.5, head_length=0.2),
43 |
44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
45 | width=0.25, head_width=0.5, head_length=0.2),
46 |
47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
48 | width=0.25, head_width=0.5, head_length=0.2),
49 |
50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
51 | width=0.25, head_width=0.5, head_length=0.2),
52 |
53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
54 | width=0.25, head_width=0.5, head_length=0.2),
55 |
56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
57 | width=0.25, head_width=0.5, head_length=0.2)]
58 |
59 | if supervised:
60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
64 | width=0.25, head_width=0.5, head_length=0.2),
65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
66 | else:
67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]
68 |
69 | for p in patches:
70 | ax.add_patch(p)
71 |
72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
73 | ha='center', va='center', fontsize=14)
74 |
75 | pl.text(3.6, 4.9, "Feature\nVectors",
76 | ha='left', va='center', fontsize=14)
77 |
78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
79 | ha='center', va='center', fontsize=14)
80 |
81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
82 | ha='center', va='center', fontsize=14)
83 |
84 | pl.text(3.3, 1.7, "Feature\nVector",
85 | ha='left', va='center', fontsize=14)
86 |
87 | pl.text(5.5, 1.1, "Predictive\nModel",
88 | ha='center', va='center', fontsize=12)
89 |
90 | if supervised:
91 | pl.text(1.45, 3.05, "Labels",
92 | ha='center', va='center', fontsize=14)
93 |
94 | pl.text(8.05, 1.1, "Expected\nLabel",
95 | ha='center', va='center', fontsize=14)
96 | pl.text(8.8, 5.8, "Supervised Learning Model",
97 | ha='right', va='top', fontsize=18)
98 |
99 | else:
100 | pl.text(8.05, 1.1,
101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation",
102 | ha='center', va='center', fontsize=12)
103 | pl.text(8.8, 5.8, "Unsupervised Learning Model",
104 | ha='right', va='top', fontsize=18)
105 |
106 |
107 |
108 | def plot_supervised_chart(annotate=False):
109 | create_base(supervised=True)
110 | if annotate:
111 | fontdict = dict(color='r', weight='bold', size=14)
112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)',
113 | fontdict=fontdict,
114 | rotation=20, ha='left', va='bottom')
115 | pl.text(3.7, 3.2, 'clf.fit(X, y)',
116 | fontdict=fontdict,
117 | rotation=20, ha='left', va='bottom')
118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)',
119 | fontdict=fontdict,
120 | rotation=20, ha='left', va='bottom')
121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
122 | fontdict=fontdict,
123 | rotation=20, ha='left', va='bottom')
124 |
125 | def plot_unsupervised_chart():
126 | create_base(supervised=False)
127 |
128 |
129 | if __name__ == '__main__':
130 | plot_supervised_chart(False)
131 | plot_supervised_chart(True)
132 | plot_unsupervised_chart()
133 | pl.show()
134 |
135 |
136 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import *
2 | from .figures import *
3 |
4 | from .sgd_separator import plot_sgd_separator
5 | from .linear_regression import plot_linear_regression
6 | from .helpers import plot_iris_knn
7 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def linear_data_sample(N=40, rseed=0, m=3, b=-2):
5 | rng = np.random.RandomState(rseed)
6 |
7 | x = 10 * rng.rand(N)
8 | dy = m / 2 * (1 + rng.rand(N))
9 | y = m * x + b + dy * rng.randn(N)
10 |
11 | return (x, y, dy)
12 |
13 |
14 | def linear_data_sample_big_errs(N=40, rseed=0, m=3, b=-2):
15 | rng = np.random.RandomState(rseed)
16 |
17 | x = 10 * rng.rand(N)
18 | dy = m / 2 * (1 + rng.rand(N))
19 | dy[20:25] *= 10
20 | y = m * x + b + dy * rng.randn(N)
21 |
22 | return (x, y, dy)
23 |
24 |
25 | def sample_light_curve(phased=True):
26 | from astroML.datasets import fetch_LINEAR_sample
27 | data = fetch_LINEAR_sample()
28 | t, y, dy = data[18525697].T
29 |
30 | if phased:
31 | P_best = 0.580313015651
32 | t /= P_best
33 |
34 | return (t, y, dy)
35 |
36 |
37 | def sample_light_curve_2(phased=True):
38 | from astroML.datasets import fetch_LINEAR_sample
39 | data = fetch_LINEAR_sample()
40 | t, y, dy = data[10022663].T
41 |
42 | if phased:
43 | P_best = 0.61596079804
44 | t /= P_best
45 |
46 | return (t, y, dy)
47 |
48 |
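Since `dy` is returned as a per-point error estimate, a hedged plotting sketch (assumes matplotlib is available and the script runs from the fig_code directory):

```python
import matplotlib.pyplot as plt
from data import linear_data_sample

x, y, dy = linear_data_sample()
plt.errorbar(x, y, dy, fmt='o')  # the third positional argument is yerr
plt.show()
```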
--------------------------------------------------------------------------------
/scikit-learn/fig_code/helpers.py:
--------------------------------------------------------------------------------
1 | """
2 | Small helpers for code that is not shown in the notebooks
3 | """
4 |
5 | from sklearn import neighbors, datasets, linear_model
6 | import pylab as pl
7 | import numpy as np
8 | from matplotlib.colors import ListedColormap
9 |
10 | # Create color maps for 3-class classification problem, as with iris
11 | cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
12 | cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
13 |
14 | def plot_iris_knn():
15 | iris = datasets.load_iris()
16 | X = iris.data[:, :2] # we only take the first two features. We could
17 | # avoid this ugly slicing by using a two-dim dataset
18 | y = iris.target
19 |
20 | knn = neighbors.KNeighborsClassifier(n_neighbors=5)
21 | knn.fit(X, y)
22 |
23 | x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
24 | y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
25 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
26 | np.linspace(y_min, y_max, 100))
27 | Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
28 |
29 | # Put the result into a color plot
30 | Z = Z.reshape(xx.shape)
31 | pl.figure()
32 | pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
33 |
34 | # Plot also the training points
35 | pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
36 | pl.xlabel('sepal length (cm)')
37 | pl.ylabel('sepal width (cm)')
38 | pl.axis('tight')
39 |
40 |
41 | def plot_polynomial_regression():
42 | rng = np.random.RandomState(0)
43 | x = 2*rng.rand(100) - 1
44 |
45 |     f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t**5 - .5 * t**9
46 | y = f(x) + .4 * rng.normal(size=100)
47 |
48 | x_test = np.linspace(-1, 1, 100)
49 |
50 | pl.figure()
51 | pl.scatter(x, y, s=4)
52 |
53 | X = np.array([x**i for i in range(5)]).T
54 | X_test = np.array([x_test**i for i in range(5)]).T
55 | regr = linear_model.LinearRegression()
56 | regr.fit(X, y)
57 | pl.plot(x_test, regr.predict(X_test), label='4th order')
58 |
59 | X = np.array([x**i for i in range(10)]).T
60 | X_test = np.array([x_test**i for i in range(10)]).T
61 | regr = linear_model.LinearRegression()
62 | regr.fit(X, y)
63 | pl.plot(x_test, regr.predict(X_test), label='9th order')
64 |
65 | pl.legend(loc='best')
66 | pl.axis('tight')
67 | pl.title('Fitting a 4th and a 9th order polynomial')
68 |
69 | pl.figure()
70 | pl.scatter(x, y, s=4)
71 | pl.plot(x_test, f(x_test), label="truth")
72 | pl.axis('tight')
73 | pl.title('Ground truth (9th order polynomial)')
74 |
75 |
76 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/linear_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.linear_model import LinearRegression
4 |
5 |
6 | def plot_linear_regression():
7 | a = 0.5
8 | b = 1.0
9 |
10 |     # x ranges from 0 to 30
11 | x = 30 * np.random.random(20)
12 |
13 | # y = a*x + b with noise
14 | y = a * x + b + np.random.normal(size=x.shape)
15 |
16 | # create a linear regression classifier
17 | clf = LinearRegression()
18 | clf.fit(x[:, None], y)
19 |
20 | # predict y from the data
21 | x_new = np.linspace(0, 30, 100)
22 | y_new = clf.predict(x_new[:, None])
23 |
24 | # plot the results
25 | ax = plt.axes()
26 | ax.scatter(x, y)
27 | ax.plot(x_new, y_new)
28 |
29 | ax.set_xlabel('x')
30 | ax.set_ylabel('y')
31 |
32 | ax.axis('tight')
33 |
34 |
35 | if __name__ == '__main__':
36 | plot_linear_regression()
37 | plt.show()
38 |
--------------------------------------------------------------------------------
/scikit-learn/fig_code/scikit-learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:29899a15bea89b9d8275879798b23011cecabc0eff03dd41bb606324221e0bc3"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "# scikit-learn"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "collapsed": false,
21 | "input": [
22 | "%matplotlib inline\n",
23 | "\n",
24 | "# set seaborn plot defaults.\n",
25 | "# This can be safely commented out\n",
26 | "import seaborn; seaborn.set()"
27 | ],
28 | "language": "python",
29 | "metadata": {},
30 | "outputs": [],
31 | "prompt_number": 3
32 | },
33 | {
34 | "cell_type": "code",
35 | "collapsed": false,
36 | "input": [
37 | "# Import the example plot from the figures directory\n",
38 | "from fig_code import plot_sgd_separator\n",
39 | "plot_sgd_separator()"
40 | ],
41 | "language": "python",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "ename": "ImportError",
46 | "evalue": "No module named fig_code",
47 | "output_type": "pyerr",
48 | "traceback": [
49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
50 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Import the example plot from the figures directory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfig_code\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mplot_sgd_separator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
51 | "\u001b[0;31mImportError\u001b[0m: No module named fig_code"
52 | ]
53 | }
54 | ],
55 | "prompt_number": 4
56 | },
57 | {
58 | "cell_type": "code",
59 | "collapsed": false,
60 | "input": [],
61 | "language": "python",
62 | "metadata": {},
63 | "outputs": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "collapsed": false,
68 | "input": [],
69 | "language": "python",
70 | "metadata": {},
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "collapsed": false,
76 | "input": [],
77 | "language": "python",
78 | "metadata": {},
79 | "outputs": []
80 | },
81 | {
82 | "cell_type": "code",
83 | "collapsed": false,
84 | "input": [],
85 | "language": "python",
86 | "metadata": {},
87 | "outputs": []
88 | },
89 | {
90 | "cell_type": "code",
91 | "collapsed": false,
92 | "input": [],
93 | "language": "python",
94 | "metadata": {},
95 | "outputs": []
96 | }
97 | ],
98 | "metadata": {}
99 | }
100 | ]
101 | }
--------------------------------------------------------------------------------
/scikit-learn/fig_code/sgd_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.linear_model import SGDClassifier
4 | from sklearn.datasets.samples_generator import make_blobs
5 |
6 | def plot_sgd_separator():
7 | # we create 50 separable points
8 | X, Y = make_blobs(n_samples=50, centers=2,
9 | random_state=0, cluster_std=0.60)
10 |
11 | # fit the model
12 | clf = SGDClassifier(loss="hinge", alpha=0.01,
13 | n_iter=200, fit_intercept=True)
14 | clf.fit(X, Y)
15 |
16 | # plot the line, the points, and the nearest vectors to the plane
17 | xx = np.linspace(-1, 5, 10)
18 | yy = np.linspace(-1, 5, 10)
19 |
20 | X1, X2 = np.meshgrid(xx, yy)
21 | Z = np.empty(X1.shape)
22 | for (i, j), val in np.ndenumerate(X1):
23 | x1 = val
24 | x2 = X2[i, j]
25 | p = clf.decision_function([x1, x2])
26 | Z[i, j] = p[0]
27 | levels = [-1.0, 0.0, 1.0]
28 | linestyles = ['dashed', 'solid', 'dashed']
29 | colors = 'k'
30 |
31 | ax = plt.axes()
32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
34 |
35 | ax.axis('tight')
36 |
37 |
38 | if __name__ == '__main__':
39 | plot_sgd_separator()
40 | plt.show()
41 |
--------------------------------------------------------------------------------
/scikit-learn/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scikit-learn/tests/__init__.py
--------------------------------------------------------------------------------
/scipy/2002FemPreg.dat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/2002FemPreg.dat.gz
--------------------------------------------------------------------------------
/scipy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/__init__.py
--------------------------------------------------------------------------------
/scipy/first.py:
--------------------------------------------------------------------------------
1 | """This file contains code used in "Think Stats",
2 | by Allen B. Downey, available from greenteapress.com
3 |
4 | Copyright 2014 Allen B. Downey
5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6 | """
7 |
8 | from __future__ import print_function
9 |
10 | import math
11 | import numpy as np
12 |
13 | import nsfg
14 | import thinkstats2
15 | import thinkplot
16 |
17 |
18 | def MakeFrames():
19 | """Reads pregnancy data and partitions first babies and others.
20 |
21 | returns: DataFrames (all live births, first babies, others)
22 | """
23 | preg = nsfg.ReadFemPreg()
24 |
25 | live = preg[preg.outcome == 1]
26 | firsts = live[live.birthord == 1]
27 | others = live[live.birthord != 1]
28 |
29 | assert len(live) == 9148
30 | assert len(firsts) == 4413
31 | assert len(others) == 4735
32 |
33 | return live, firsts, others
34 |
35 |
36 | def Summarize(live, firsts, others):
37 | """Print various summary statistics."""
38 |
39 | mean = live.prglngth.mean()
40 | var = live.prglngth.var()
41 | std = live.prglngth.std()
42 |
43 | print('Live mean', mean)
44 | print('Live variance', var)
45 | print('Live std', std)
46 |
47 | mean1 = firsts.prglngth.mean()
48 | mean2 = others.prglngth.mean()
49 |
50 | var1 = firsts.prglngth.var()
51 | var2 = others.prglngth.var()
52 |
53 | print('Mean')
54 | print('First babies', mean1)
55 | print('Others', mean2)
56 |
57 | print('Variance')
58 | print('First babies', var1)
59 | print('Others', var2)
60 |
61 | print('Difference in weeks', mean1 - mean2)
62 | print('Difference in hours', (mean1 - mean2) * 7 * 24)
63 |
64 | print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100)
65 |
66 | d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth)
67 | print('Cohen d', d)
68 |
69 |
70 | def PrintExtremes(live):
71 | """Plots the histogram of pregnancy lengths and prints the extremes.
72 |
73 | live: DataFrame of live births
74 | """
75 | hist = thinkstats2.Hist(live.prglngth)
76 | thinkplot.Hist(hist, label='live births')
77 |
78 | thinkplot.Save(root='first_nsfg_hist_live',
79 | title='Histogram',
80 | xlabel='weeks',
81 | ylabel='frequency')
82 |
83 | print('Shortest lengths:')
84 | for weeks, freq in hist.Smallest(10):
85 | print(weeks, freq)
86 |
87 | print('Longest lengths:')
88 | for weeks, freq in hist.Largest(10):
89 | print(weeks, freq)
90 |
91 |
92 | def MakeHists(live):
93 | """Plot Hists for live births
94 |
95 | live: DataFrame
96 | others: DataFrame
97 | """
98 | hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb')
99 | thinkplot.Hist(hist)
100 | thinkplot.Save(root='first_wgt_lb_hist',
101 | xlabel='pounds',
102 | ylabel='frequency',
103 | axis=[-1, 14, 0, 3200])
104 |
105 | hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz')
106 | thinkplot.Hist(hist)
107 | thinkplot.Save(root='first_wgt_oz_hist',
108 | xlabel='ounces',
109 | ylabel='frequency',
110 | axis=[-1, 16, 0, 1200])
111 |
112 | hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
113 | thinkplot.Hist(hist)
114 | thinkplot.Save(root='first_agepreg_hist',
115 | xlabel='years',
116 | ylabel='frequency')
117 |
118 | hist = thinkstats2.Hist(live.prglngth, label='prglngth')
119 | thinkplot.Hist(hist)
120 | thinkplot.Save(root='first_prglngth_hist',
121 | xlabel='weeks',
122 | ylabel='frequency',
123 | axis=[-1, 53, 0, 5000])
124 |
125 |
126 | def MakeComparison(firsts, others):
127 | """Plots histograms of pregnancy length for first babies and others.
128 |
129 | firsts: DataFrame
130 | others: DataFrame
131 | """
132 | first_hist = thinkstats2.Hist(firsts.prglngth, label='first')
133 | other_hist = thinkstats2.Hist(others.prglngth, label='other')
134 |
135 | width = 0.45
136 | thinkplot.PrePlot(2)
137 | thinkplot.Hist(first_hist, align='right', width=width)
138 | thinkplot.Hist(other_hist, align='left', width=width)
139 |
140 | thinkplot.Save(root='first_nsfg_hist',
141 | title='Histogram',
142 | xlabel='weeks',
143 | ylabel='frequency',
144 | axis=[27, 46, 0, 2700])
145 |
146 |
147 | def main(script):
148 | live, firsts, others = MakeFrames()
149 |
150 | MakeHists(live)
151 | PrintExtremes(live)
152 | MakeComparison(firsts, others)
153 | Summarize(live, firsts, others)
154 |
155 |
156 | if __name__ == '__main__':
157 | import sys
158 | main(*sys.argv)
159 |
160 |
161 |
--------------------------------------------------------------------------------
/scipy/nsfg.py:
--------------------------------------------------------------------------------
1 | """This file contains code for use with "Think Stats",
2 | by Allen B. Downey, available from greenteapress.com
3 |
4 | Copyright 2010 Allen B. Downey
5 | License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6 | """
7 |
8 | from __future__ import print_function
9 |
10 | from collections import defaultdict
11 | import numpy as np
12 | import sys
13 |
14 | import thinkstats2
15 |
16 |
17 | def ReadFemPreg(dct_file='2002FemPreg.dct',
18 | dat_file='2002FemPreg.dat.gz'):
19 | """Reads the NSFG pregnancy data.
20 |
21 | dct_file: string file name
22 | dat_file: string file name
23 |
24 | returns: DataFrame
25 | """
26 | dct = thinkstats2.ReadStataDct(dct_file)
27 | df = dct.ReadFixedWidth(dat_file, compression='gzip')
28 | CleanFemPreg(df)
29 | return df
30 |
31 |
32 | def CleanFemPreg(df):
33 | """Recodes variables from the pregnancy frame.
34 |
35 | df: DataFrame
36 | """
37 | # mother's age is encoded in centiyears; convert to years
38 | df.agepreg /= 100.0
39 |
40 | # birthwgt_lb contains at least one bogus value (51 lbs)
41 | # replace with NaN
42 | df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan
43 |
44 | # replace 'not ascertained', 'refused', 'don't know' with NaN
45 | na_vals = [97, 98, 99]
46 | df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
47 | df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
48 | df.hpagelb.replace(na_vals, np.nan, inplace=True)
49 |
50 | df.babysex.replace([7, 9], np.nan, inplace=True)
51 | df.nbrnaliv.replace([9], np.nan, inplace=True)
52 |
53 | # birthweight is stored in two columns, lbs and oz.
54 | # convert to a single column in lb
55 | # NOTE: creating a new column requires dictionary syntax,
56 | # not attribute assignment (like df.totalwgt_lb)
57 | df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0
58 |
59 | # due to a bug in ReadStataDct, the last variable gets clipped;
60 | # so for now set it to NaN
61 | df.cmintvw = np.nan
62 |
63 |
64 | def MakePregMap(df):
65 | """Make a map from caseid to list of preg indices.
66 |
67 | df: DataFrame
68 |
69 | returns: dict that maps from caseid to list of indices into preg df
70 | """
71 | d = defaultdict(list)
72 | for index, caseid in df.caseid.iteritems():
73 | d[caseid].append(index)
74 | return d
75 |
76 |
77 | def main(script):
78 | """Tests the functions in this module.
79 |
80 | script: string script name
81 | """
82 | df = ReadFemPreg()
83 | print(df.shape)
84 |
85 | assert len(df) == 13593
86 |
87 | assert df.caseid[13592] == 12571
88 | assert df.pregordr.value_counts()[1] == 5033
89 | assert df.nbrnaliv.value_counts()[1] == 8981
90 | assert df.babysex.value_counts()[1] == 4641
91 | assert df.birthwgt_lb.value_counts()[7] == 3049
92 | assert df.birthwgt_oz.value_counts()[0] == 1037
93 | assert df.prglngth.value_counts()[39] == 4744
94 | assert df.outcome.value_counts()[1] == 9148
95 | assert df.birthord.value_counts()[1] == 4413
96 | assert df.agepreg.value_counts()[22.75] == 100
97 | assert df.totalwgt_lb.value_counts()[7.5] == 302
98 |
99 | weights = df.finalwgt.value_counts()
100 | key = max(weights.keys())
101 | assert df.finalwgt.value_counts()[key] == 6
102 |
103 | print('%s: All tests passed.' % script)
104 |
105 | if __name__ == '__main__':
106 | main(*sys.argv)
107 |
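A hedged usage sketch of `MakePregMap` (assumes the NSFG data files sit in the working directory; the caseid below is illustrative, and a missing key simply yields an empty list because the map is a defaultdict):

```python
import nsfg

df = nsfg.ReadFemPreg()
preg_map = nsfg.MakePregMap(df)

# All row indices for one respondent; the caseid is illustrative and a
# missing key yields [] because preg_map is a defaultdict(list).
print(preg_map[10229])
```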
--------------------------------------------------------------------------------
/scipy/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/scipy/tests/__init__.py
--------------------------------------------------------------------------------
/spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/data-science-ipython-notebooks/5b3c00d462c6e9200315afe46d0093948621eb95/spark/__init__.py
--------------------------------------------------------------------------------
/spark/hdfs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# HDFS"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Run an HDFS command:"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": false
29 | },
30 | "outputs": [],
31 | "source": [
32 | "!hdfs"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "Run a file system command on the file systems (FsShell):"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "!hdfs dfs"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "List the user's home directory:"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [],
67 | "source": [
68 | "!hdfs dfs -ls"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "List the HDFS root directory:"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "!hdfs dfs -ls /"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Copy a local file to the user's directory on HDFS:"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "!hdfs dfs -put file.txt file.txt"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "Display the contents of the specified HDFS file:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "!hdfs dfs -cat file.txt"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Print the last 10 lines of the file to the terminal:"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [],
139 | "source": [
140 | "!hdfs dfs -cat file.txt | tail -n 10"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "View a directory and all of its files:"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "collapsed": false
155 | },
156 | "outputs": [],
157 | "source": [
158 | "!hdfs dfs -cat dir/* | less"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "Copy an HDFS file to local:"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "!hdfs dfs -get file.txt file.txt"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "Create a directory on HDFS:"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "!hdfs dfs -mkdir dir"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "Recursively delete the specified directory and all of its contents:"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "collapsed": false
209 | },
210 | "outputs": [],
211 | "source": [
212 | "!hdfs dfs -rm -r dir"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "Specify HDFS file in Spark (paths are relative to the user's home HDFS directory):"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "collapsed": false
227 | },
228 | "outputs": [],
229 | "source": [
230 | "data = sc.textFile (\"hdfs://hdfs-host:port/path/file.txt\")"
231 | ]
232 | }
233 | ],
234 | "metadata": {
235 | "kernelspec": {
236 | "display_name": "Python 2",
237 | "language": "python",
238 | "name": "python2"
239 | },
240 | "language_info": {
241 | "codemirror_mode": {
242 | "name": "ipython",
243 | "version": 2
244 | },
245 | "file_extension": ".py",
246 | "mimetype": "text/x-python",
247 | "name": "python",
248 | "nbconvert_exporter": "python",
249 | "pygments_lexer": "ipython2",
250 | "version": "2.7.10"
251 | }
252 | },
253 | "nbformat": 4,
254 | "nbformat_minor": 0
255 | }
256 |
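As a hedged follow-on in the same PySpark session (assumes `sc` is a live SparkContext and the HDFS path exists), the returned RDD can be inspected immediately:

```python
# Continuing from the cell above: data is an RDD of lines
data = sc.textFile("hdfs://hdfs-host:port/path/file.txt")

print(data.count())  # number of lines in the file
print(data.first())  # first line

# Keep only lines containing ERROR, then pull back a small sample
errors = data.filter(lambda line: 'ERROR' in line)
print(errors.take(5))
```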
--------------------------------------------------------------------------------