├── .gitattributes ├── 9781484234495.jpg ├── Airline2.py ├── Airline_friendly_patient_efficient cabin crew.py ├── Airline_worst_experience_ever.py ├── Arima.py ├── Contributing.md ├── LICENSE.txt ├── NumericalDataClassification.py ├── README.md ├── Recursive_Levenshtein.py ├── Text in clustering.py ├── airline1.py ├── arma.py ├── autoregressive model.py ├── classifier1.py ├── classifier2.py ├── clustering_with_k_mean.py ├── code.R ├── customsearch.py ├── errata.md ├── knn_classifier.py ├── least square estimation linear regression.py ├── log_reg.py ├── log_reg_regular.py ├── moving average.py ├── rnn_keras_timeseries_stock.py ├── se_test1.py └── src ├── MainBDAS.java ├── RootBDAS.java ├── Utility.java ├── WordCounterBDAS.java └── testBDAS.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484234495.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/advanced-data-analytics-using-python/2133b66317ddddee65c9e5c2d861effa722e4daf/9781484234495.jpg -------------------------------------------------------------------------------- /Airline2.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('Air India did a poor job of queue management both times.', 'staff service'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | 7 | tests = ['Food is good.', 'Colombo to Moscow via Delhi.'] 8 | for c in tests: 9 | print c,'\t',cl.classify(c) -------------------------------------------------------------------------------- /Airline_friendly_patient_efficient cabin crew.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('This flight from Bodhgaya to Delhi was the latest of about 20 domestic flights sectors completed with Air India over the last 3 - 4 years.', 'pos'),('The level of service has remained consistent in all areas, with some aspects of service stronger than others.', 'pos'),('This flight was originally due to depart Bodhgaya at 14.35h arriving in Delhi at 16.25h.', 'pos'),('An initial delay to a revised 16.00h departure was notified by e-mail several hours in advance.', 'pos'),('We had no idea how long the delay would be until we saw our aircraft land.', 'neg'),('Except for an expensive coffee kiosk there were no customer facilities while waiting.', 'neg')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | print(cl.classify('Check-in queue management at Bodhgaya airport was poor but agent service was good once you eventually got to the check-in desk.')) -------------------------------------------------------------------------------- /Airline_worst_experience_ever.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers 
import NaiveBayesClassifier 2 | 3 | train = [("It's a very long and exhausting flight!", 'pos'),('I traveled alone with my 9 month old son from Frankfurt to Delhi and onto Sydney!', 'pos'),('At the end they wrote a manual boarding pass.', 'pos'),("I think everybody knows with a baby it's even harder!", 'pos'),('After this 2 hours they gave me just the boarding pass until Delhi and I had to run carrying my son and all the luggage to the boarding as this had started already.', 'pos'),("The first problems started when they weren't be able to print the boarding pass for my son.", 'neg'),("They kept me 2 hours in front of the check-in that I couldn't care properly for my son.", 'neg'),('No toilets, no food again - just hard chairs to wait on.', 'neg'),("They didn't care.", 'neg'),('I got more and more stressed.', 'pos')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | print(cl.classify("Worst experience ever with Air India!")) -------------------------------------------------------------------------------- /Arima.py: -------------------------------------------------------------------------------- 1 | from pandas import read_csv 2 | from pandas import datetime 3 | from matplotlib import pyplot 4 | from statsmodels.tsa.arima_model import ARIMA 5 | from sklearn.metrics import mean_squared_error 6 | 7 | def parser(p): 8 | return datetime.strptime('190'+p, '%Y-%m') 9 | 10 | series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser) 11 | P = series.values 12 | size = int(len(P) * 0.66) 13 | train, test = P[0:size], P[size:len(P)] 14 | history = [p for p in train] 15 | predictions = list() 16 | for t in range(len(test)): 17 | model = ARIMA(history, order=(5,1,0)) 18 | model_fit = model.fit(disp=0) 19 | output = model_fit.forecast() 20 | yhat = output[0] 21 | predictions.append(yhat) 22 | obs = test[t] 23 | history.append(obs) 24 | print('predicted=%f, expected=%f' % (yhat, obs)) 25 | error = mean_squared_error(test, predictions) 26 | print('Test MSE: %.3f' % error) 27 | # plot 28 | pyplot.plot(test) 29 | pyplot.plot(predictions, color='red') 30 | pyplot.show() -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! 
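A note on Arima.py above: it targets the pandas and statsmodels APIs current when the book was written; `from pandas import datetime`, the `squeeze` and `date_parser` arguments of `read_csv`, and `statsmodels.tsa.arima_model.ARIMA` have since been deprecated or removed. A minimal sketch of the same rolling one-step forecast against the newer `statsmodels.tsa.arima.model.ARIMA` API, assuming the same `shampoo-sales.csv` layout and (5,1,0) order as the original script, would look roughly like this:

```python
# Hypothetical port of Arima.py to current pandas/statsmodels releases.
# File name, column layout, and the (5,1,0) order are taken from the original script.
from pandas import read_csv, to_datetime
from statsmodels.tsa.arima.model import ARIMA  # replaces statsmodels.tsa.arima_model.ARIMA
from sklearn.metrics import mean_squared_error

series = read_csv('shampoo-sales.csv', header=0, index_col=0).squeeze("columns")
series.index = to_datetime(['190' + d for d in series.index], format='%Y-%m')

P = series.values
size = int(len(P) * 0.66)
train, test = P[:size], P[size:]
history = list(train)
predictions = []
for t in range(len(test)):
    model_fit = ARIMA(history, order=(5, 1, 0)).fit()  # the new fit() takes no disp argument
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(test[t])
print('Test MSE: %.3f' % mean_squared_error(test, predictions))
```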
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2018 Sayan Mukhopadhyay 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /NumericalDataClassification.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | import operator 4 | import math 5 | 6 | 7 | def euclideanDistance(instance1, instance2, length): 8 | distance = 0 9 | for x in range(length): 10 | distance += pow((instance1[x] - instance2[x]), 2) 11 | return math.sqrt(distance) 12 | 13 | def getNeighbors(trainingSet, testInstance, k): 14 | distances = [] 15 | length = len(testInstance)-1 16 | for x in range(len(trainingSet)): 17 | dist = euclideanDistance(testInstance, trainingSet[x], length) 18 | distances.append((trainingSet[x], dist)) 19 | distances.sort(key=operator.itemgetter(1)) 20 | neighbors = [] 21 | for x in range(k): 22 | neighbors.append(distances[x][0]) 23 | return neighbors 24 | 25 | trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']] 26 | testInstance = [5, 5, 5] 27 | k = 1 28 | neighbors = getNeighbors(trainSet, testInstance, 1) 29 | print(neighbors) 30 | 31 | exit(0) 32 | 33 | def loadDataset(filename, split, trainingSet=[] , testSet=[]): 34 | with open(filename, 'rb') as csvfile: 35 | lines = csv.reader(csvfile) 36 | dataset = list(lines) 37 | for x in range(len(dataset)-1): 38 | for y in range(4): 39 | dataset[x][y] = float(dataset[x][y]) 40 | if random.random() < split: 41 | trainingSet.append(dataset[x]) 42 | else: 43 | testSet.append(dataset[x]) 44 | trainingSet=[] 45 | testSet=[] 46 | loadDataset('irisdata.txt', 0.66, trainingSet, testSet) 47 | print 'Train: ' + repr(len(trainingSet)) 48 | print 'Test: ' + repr(len(testSet)) 49 | 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Advanced Data Analytics 
Using Python*](https://www.apress.com/9781484234495) by Sayan Mukhopadhyay (Apress, 2018). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484234495.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /Recursive_Levenshtein.py: -------------------------------------------------------------------------------- 1 | def LD(s, t): 2 | if s == "": 3 | return len(t) 4 | if t == "": 5 | return len(s) 6 | if s[-1] == t[-1]: 7 | cost = 0 8 | else: 9 | cost = 1 10 | 11 | res = min([LD(s[:-1], t)+1, 12 | LD(s, t[:-1])+1, 13 | LD(s[:-1], t[:-1]) + cost]) 14 | return res 15 | print(LD("Python", "Peithen")) -------------------------------------------------------------------------------- /Text in clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def LD(s, t): 5 | if s == "": 6 | return len(t) 7 | if t == "": 8 | return len(s) 9 | if s[-1] == t[-1]: 10 | cost = 0 11 | else: 12 | cost = 1 13 | 14 | res = min([LD(s[:-1], t)+1, 15 | LD(s, t[:-1])+1, 16 | LD(s[:-1], t[:-1]) + cost]) 17 | return res 18 | 19 | def find_centre(x, X, mu): 20 | min = 100 21 | cent = 0 22 | for c in mu: 23 | dist = LD(x, X[c]) 24 | if dist < min: 25 | min = dist 26 | cent = c 27 | return cent 28 | 29 | 30 | def cluster_points(X, mu): 31 | clusters = {} 32 | for x in X: 33 | bestmukey = find_centre(x, X, mu) 34 | try: 35 | clusters[bestmukey].append(x) 36 | except KeyError: 37 | clusters[bestmukey] = [x] 38 | return clusters 39 | 40 | def reevaluate_centers(mu, clusters): 41 | newmu = [] 42 | keys = sorted(clusters.keys()) 43 | for k in keys: 44 | newmu.append(k) 45 | return newmu 46 | 47 | def has_converged(mu, oldmu): 48 | return sorted(mu) == sorted(oldmu) 49 | 50 | def find_centers(X, K): 51 | oldmu = random.sample(range(0,5), K) 52 | mu = random.sample(range(0,5), K) 53 | while not has_converged(mu, oldmu): 54 | oldmu = mu 55 | # Assign all points in X to clusters 56 | clusters = cluster_points(X, mu) 57 | # Reevaluate centers 58 | mu = reevaluate_centers(oldmu, clusters) 59 | return(mu, clusters) 60 | 61 | X = ['Delhi','Dehli', 'Delli','Kolkata','Kalkata','Kalkota'] 62 | 63 | print(find_centers(X,2)) 64 | -------------------------------------------------------------------------------- /airline1.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import random 4 | 5 | url = "http://www.airlinequality.com/airline-reviews/air-india/page/2/" 6 | 7 | agent = "Mozilla/5.0 (Windows NT 6.2) Firefox/40.1" 8 | 9 | headers = {'user-agent': agent} 10 | r = requests.get(url, headers=headers) 11 | 12 | data = r.content 13 | print(data) 14 | exit() 15 | soup = BeautifulSoup(data) 16 | for div in soup.findAll("div", { "class" : "text_content" }): 17 | print(str(div)) -------------------------------------------------------------------------------- /arma.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | import statsmodels.api as 
sm 6 | 7 | from statsmodels.graphics.api import qqplot 8 | 9 | print sm.datasets.sunspots.NOTE 10 | 11 | # Number of Observation - 309 (Annual 1700 - 2008) 12 | # Number of Variable – 1 13 | # Variable name definitions:: 14 | # SUNACTIVITY - Number of sunspots for each year 15 | # The data file contains a 'YEAR' variable that is not returned by load. 16 | 17 | dta = sm.datasets.sunspots.load_pandas().data 18 | 19 | dta.index = pandas.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) 20 | del dta["YEAR"] 21 | 22 | -------------------------------------------------------------------------------- /autoregressive model.py: -------------------------------------------------------------------------------- 1 | from pandas import Series 2 | from matplotlib import pyplot 3 | from statsmodels.tsa.ar_model import AR 4 | from sklearn.metrics import mean_squared_error 5 | series = Series.from_csv('daily-minimum-temperatures.csv', header=0) 6 | 7 | J = series.values 8 | train, test = J[1:len(J)-7], J[len(J)-7:] 9 | 10 | model = AR(train) 11 | model_fit = model.fit() 12 | print('Lag: %s' % model_fit.k_ar) 13 | print('Coefficients: %s' % model_fit.params) 14 | 15 | predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False) 16 | for t in range(len(predictions)): 17 | print('predicted=%f, expected=%f' % (predictions[t], test[t])) 18 | error = mean_squared_error(test, predictions) 19 | print('Test MSE: %.3f' % error) 20 | 21 | pyplot.plot(test) 22 | pyplot.plot(predictions, color='red') 23 | pyplot.show() 24 | -------------------------------------------------------------------------------- /classifier1.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('Air India did a poor job of queue management both times.', 'staff service'),('I love this sandwich.', 'pos'),('this is an amazing place!', 'pos'),('I feel very good about these beers.', 'pos'),('this is my best work.', 'pos'),("what an awesome view", 'pos'),('I do not like this restaurant', 'neg'),('I am tired of this stuff.', 'neg'),("I can't deal with this", 'neg'),('he is my sworn enemy!', 'neg'),('my boss is horrible.', 'neg'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | 7 | tests = ['Food is good.', 'Colombo to Moscow via Delhi.'] 8 | for c in tests: 9 | print c,'\t',cl.classify(c) -------------------------------------------------------------------------------- /classifier2.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('I love this sandwich.', 'pos'),('Air India did a poor job of queue management both times.', 'staff service'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 
'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown'), 4 | ('this is an amazing place!', 'pos'),('I feel very good about these beers.', 'pos'),('this is my best work.', 'pos'),("what an awesome view", 'pos'),('I do not like this restaurant', 'neg'),('I am tired of this stuff.', 'neg'),("I can't deal with this", 'neg'),('he is my sworn enemy!', 'neg'),('my boss is horrible.', 'neg')] 5 | 6 | cl = NaiveBayesClassifier(train) 7 | print (cl.classify("This is an amazing library!")) -------------------------------------------------------------------------------- /clustering_with_k_mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def cluster_points(X, mu): 5 | clusters = {} 6 | for x in X: 7 | bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \ 8 | for i in enumerate(mu)], key=lambda t:t[1])[0] 9 | try: 10 | clusters[bestmukey].append(x) 11 | except KeyError: 12 | clusters[bestmukey] = [x] 13 | return clusters 14 | 15 | def reevaluate_centers(mu, clusters): 16 | newmu = [] 17 | keys = sorted(clusters.keys()) 18 | for k in keys: 19 | newmu.append(np.mean(clusters[k], axis = 0)) 20 | return newmu 21 | 22 | def has_converged(mu, oldmu): 23 | return (set([tuple(b) for b in mu]) == set([tuple(b) for b in oldmu])) 24 | 25 | def find_centers(X, K): 26 | oldmu = random.sample(X, K) 27 | mu = random.sample(X, K) 28 | while not has_converged(mu, oldmu): 29 | oldmu = mu 30 | 31 | clusters = cluster_points(X, mu) 32 | 33 | mu = reevaluate_centers(oldmu, clusters) 34 | return(mu, clusters) 35 | 36 | X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(10)]) 37 | 38 | print(find_centers(X,2)) 39 | -------------------------------------------------------------------------------- /code.R: -------------------------------------------------------------------------------- 1 | asm_weekwise<-read.csv("F:/souravda/New ASM Weekwise.csv",header=TRUE) 2 | 3 | asm_weekwise$Week <- NULL 4 | 5 | library(MASS, lib.loc="F:/souravda/lib/") 6 | library(tseries, lib.loc="F:/souravda/lib/") 7 | library(forecast, lib.loc="F:/souravda/lib/") 8 | 9 | #asm_weekwise[is.na(asm_weekwise)] <- 0 10 | #asm_weekwise[asm_weekwise <= 0] <- mean(as.matrix(asm_weekwise)) 11 | 12 | 13 | 14 | 15 | weekjoyforecastvalues <- data.frame( "asm" = integer(), "value" = integer(), stringsAsFactors=FALSE) 16 | 17 | for(i in 1:ncol(asm_weekwise)) 18 | { 19 | asmname<-names(asm_weekwise)[i] 20 | temparimadata<-asm_weekwise[,i] 21 | temparimadata[is.na(temparimadata)] <- 0 22 | temparimadata[temparimadata <=0] <- mean(as.matrix(temparimadata)) 23 | m <- mean(as.matrix(temparimadata)) 24 | #print(m) 25 | s <- sd(temparimadata) 26 | #print(s) 27 | temparimadata <- (temparimadata - m) 28 | temparimadata <- (temparimadata / s) 29 | temparima<-auto.arima(temparimadata, stationary = FALSE, seasonal = TRUE, allowdrift = TRUE, allowmean = FALSE, biasadj = FALSE) 30 | tempforecast<-forecast(temparima,h=12) 31 | #tempforecast <- (tempforecast * s) 32 | #print(tempforecast) 33 | temp_forecasted_data<-sum(data.frame(tempforecast$upper[,1])*s + m) 34 | weekjoyforecastvalues[nrow(weekjoyforecastvalues) + 1, ] <- c( asmname, temp_forecasted_data) 35 | } 36 | 37 | weekjoyforecastvalues$value<-as.integer(weekjoyforecastvalues$value) 38 | 39 | cat(weekjoyforecastvalues$value,sep="\n") 40 | 41 | (sum(weekjoyforecastvalues$value)- 103000000)/103000000 #53782605)/53782605 
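The R script code.R above standardizes each ASM series, fits an automatic ARIMA, forecasts 12 weeks ahead, and sums the rescaled upper forecast bound. For readers who prefer to stay in Python, a rough equivalent can be sketched with pmdarima's `auto_arima`; the CSV name and the 103000000 target are carried over from code.R, and pmdarima itself is an assumed extra dependency:

```python
# Rough Python analogue of code.R, assuming one column per ASM and weekly rows.
import pandas as pd
import pmdarima as pm  # assumed dependency; provides an auto_arima similar to R's forecast package

asm_weekwise = pd.read_csv('New ASM Weekwise.csv').drop(columns=['Week'])

forecasts = {}
for col in asm_weekwise.columns:
    y = asm_weekwise[col].fillna(0)
    y[y <= 0] = y.mean()                      # same missing/zero handling as the R loop
    m, s = y.mean(), y.std()
    z = (y - m) / s                           # standardise before fitting
    model = pm.auto_arima(z, seasonal=True, stationary=False)
    _, conf = model.predict(n_periods=12, return_conf_int=True)
    forecasts[col] = float((conf[:, 1] * s + m).sum())  # upper bound, rescaled and summed

total = sum(forecasts.values())
print((total - 103000000) / 103000000)
```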
-------------------------------------------------------------------------------- /customsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pprint, json, urllib2 5 | import nltk, sys, urllib 6 | from bs4 import BeautifulSoup 7 | import csv 8 | 9 | from googleapiclient.discovery import build 10 | 11 | def link_score(link): 12 | if ('cv' in link or 'resume' in link) and 'job' not in link: 13 | return True 14 | 15 | def process_file(): 16 | try: 17 | 18 | with open('data1.json','r') as fl: 19 | data = json.load(fl) 20 | all_links = [] 21 | # pprint.pprint(len(data['items'])) 22 | for item in data['items']: 23 | # print item['formattedUrl'] 24 | all_links.append(item['formattedUrl']) 25 | return all_links 26 | except: 27 | return [] 28 | 29 | def main(istart, search_query): 30 | service = build("customsearch", "v1", 31 | developerKey="AIzaSyApK0athSzeKSUa8vCNWZe2R1IygAv4bP4") 32 | 33 | res = service.cse().list( 34 | q= search_query, 35 | cx='007420266948142075924:dsrt3pl0cju', 36 | num=10, 37 | gl='in', #in for india comment this for whole web 38 | start = istart, 39 | ).execute() 40 | import json 41 | with open('data1.json', 'w') as fp: 42 | json.dump(res, fp) 43 | # pprint.pprint(type(res)) 44 | # pprint.pprint(res) 45 | 46 | def get_email_ph(link_text, pdf=None): 47 | if pdf==True: 48 | 49 | from textract import process 50 | text = process(link_text) 51 | else: 52 | text = link_text 53 | # print text 54 | import re 55 | email = [] 56 | ph = [] 57 | valid_ph = re.compile("[789][0-9]{9}$") 58 | valid = re.compile("[A-Za-z]+[@]{1}[A-Za-z]+\.[a-z]+") 59 | for token in re.split(r'[,\s]',text): 60 | # for token in nltk.tokenize(text): 61 | # print token 62 | a = valid.match(token) 63 | b = valid_ph.match(token) 64 | if a != None: 65 | print a.group() 66 | email.append(a.group()) 67 | if b != None: 68 | print b.group() 69 | ph.append(b.group()) 70 | return email, ph 71 | 72 | def process_pdf_link(link): 73 | html = urllib2.urlopen(link) 74 | file = open("document.pdf", 'w') 75 | file.write(html.read()) 76 | file.close() 77 | return get_email_ph("document.pdf", pdf=True) 78 | 79 | def process_doc_link(link): 80 | testfile = urllib.URLopener() 81 | testfile.retrieve(link, "document.doc") 82 | return get_email_ph("document.doc", pdf=False) 83 | 84 | def process_docx_link(link): 85 | testfile = urllib.URLopener() 86 | testfile.retrieve(link, "document.docx") 87 | return get_email_ph("document.docx", pdf=False) 88 | 89 | def process_links(all_links): 90 | with open('email_ph.csv', 'wb') as csvfile: 91 | spamwriter = csv.writer(csvfile, delimiter=',') 92 | 93 | for link in all_links: 94 | if link[:4] !='http': 95 | link = "http://"+link 96 | print link 97 | try: 98 | if link[-3:] == 'pdf': 99 | try: 100 | email, ph = process_pdf_link(link) 101 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 102 | except: 103 | print "error",link 104 | print sys.exc_info() 105 | elif link[-4:] == 'docx': 106 | try: 107 | email, ph = process_docx_link(link) 108 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 109 | except: 110 | print "error",link 111 | print sys.exc_info() 112 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 113 | elif link[-3:] == 'doc': 114 | try: 115 | email, ph = process_doc_link(link) 116 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 117 | except: 118 | print "error",link 119 | print sys.exc_info() 120 | spamwriter.writerow([link, ' 
'.join(email), ' '.join(ph)]) 121 | else: 122 | try: 123 | html = urllib2.urlopen(link) 124 | email, ph = get_email_ph(BeautifulSoup(html.read()).get_text(), pdf=False) 125 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 126 | except: 127 | print "error",link 128 | print sys.exc_info() 129 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 130 | except: 131 | pass 132 | print "error",link 133 | print sys.exc_info() 134 | 135 | if __name__ == '__main__': 136 | 137 | # if len(sys.argv) <2 : 138 | # print "Error : please pass query words e.g. python customsearch.py java developer" 139 | # sys.exit() 140 | # else: 141 | # search_query = " ".join(sys.argv[1:]) 142 | 143 | # print search_query 144 | search_query = ' ASP .NET, C#, WebServices, HTML Chicago USA biodata cv' 145 | # 146 | # links = ['http://www.michaelminella.com/resume.html', 147 | # 'www.indeed.com/resumes/Java-J2EE-Developer', 148 | # 'www.slideshare.net/raghavanm/java-j2-eecvguide', 149 | # 'www.gcreddy.com/2013/10/java-3-years-resume.html', 150 | # 'www.naschenweng.info/cv/', 151 | # 'www.shinkarenko.org/cv/IlyaShinkarenkoCV.pdf', 152 | # 'stackoverflow.com/cv/anujpatel', 153 | # 'www.hrishikesh.karambelkar.co.in/resume-hrishikesh-karambelkar.doc', 154 | # 'www.oocities.org/rkbalgi/resume.pdf', 155 | # 'adam.kahtava.com/resume/curriculum-vitae/software-developer/'] 156 | # 157 | all_links = [] 158 | # all_links.extend(links) 159 | for i in range(1,90,10): 160 | main(i, search_query) 161 | all_links.extend(process_file()) 162 | 163 | process_links(all_links) 164 | # import csv 165 | # with open('email_ph.csv', 'wb') as csvfile: 166 | # spamwriter = csv.writer(csvfile, delimiter=',') 167 | # for i in range(1,90,10): 168 | # main(i) 169 | # all_links = process_file() 170 | # for link in all_links: 171 | # if link_score(link): 172 | # print link 173 | # if link[:4] !='http': 174 | # link = "http://"+link 175 | # if link[-3:] == 'pdf': 176 | # html = urllib2.urlopen(link) 177 | # file = open("document.pdf", 'w') 178 | # file.write(html.read()) 179 | # file.close() 180 | # print("Completed") 181 | # email, ph = get_email_ph("document.pdf", pdf=True) 182 | # spamwriter.writerow([link, email, ph]) 183 | # elif link[-3:] == 'doc': 184 | # try: 185 | # email=[] 186 | # ph = [] 187 | # html = urllib2.urlopen(link) 188 | # file = open("document.doc", 'w') 189 | # file.write(html.read()) 190 | # file.close() 191 | # print("Completed") 192 | # testfile = urllib.URLopener() 193 | # testfile.retrieve(link, "document.doc") 194 | # email, ph = get_email_ph("document.doc", pdf=True) 195 | # spamwriter.writerow([link, email, ph]) 196 | # except: 197 | # spamwriter.writerow([link, email, ph]) 198 | # else: 199 | # try: 200 | # email = [] 201 | # ph = [] 202 | # html = urllib2.urlopen(link) 203 | # # file = open("document.pdf", 'w') 204 | # # file.write(html.read()) 205 | # # file.close() 206 | # # print("Completed") 207 | # email, ph = get_email_ph(html.read(), pdf=False) 208 | # spamwriter.writerow([link, email, ph]) 209 | # except: 210 | # spamwriter.writerow([link, email, ph]) 211 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 
12 | 13 | *** -------------------------------------------------------------------------------- /knn_classifier.py: -------------------------------------------------------------------------------- 1 | # USAGE 2 | # python knn_classifier.py --dataset kaggle_dogs_vs_cats 3 | 4 | # import the necessary packages 5 | from sklearn.neighbors import KNeighborsClassifier 6 | from sklearn.cross_validation import train_test_split 7 | from imutils import paths 8 | import numpy as np 9 | import argparse 10 | import imutils 11 | import cv2 12 | import os 13 | 14 | def image_to_feature_vector(image, size=(32, 32)): 15 | # resize the image to a fixed size, then flatten the image into 16 | # a list of raw pixel intensities 17 | return cv2.resize(image, size).flatten() 18 | 19 | def extract_color_histogram(image, bins=(8, 8, 8)): 20 | # extract a 3D color histogram from the HSV color space using 21 | # the supplied number of `bins` per channel 22 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 23 | hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, 24 | [0, 180, 0, 256, 0, 256]) 25 | 26 | # handle normalizing the histogram if we are using OpenCV 2.4.X 27 | if imutils.is_cv2(): 28 | hist = cv2.normalize(hist) 29 | 30 | # otherwise, perform "in place" normalization in OpenCV 3 (I 31 | # personally hate the way this is done 32 | else: 33 | cv2.normalize(hist, hist) 34 | 35 | # return the flattened histogram as the feature vector 36 | return hist.flatten() 37 | 38 | # construct the argument parse and parse the arguments 39 | ap = argparse.ArgumentParser() 40 | ap.add_argument("-d", "--dataset", required=True, 41 | help="path to input dataset") 42 | ap.add_argument("-k", "--neighbors", type=int, default=1, 43 | help="# of nearest neighbors for classification") 44 | ap.add_argument("-j", "--jobs", type=int, default=-1, 45 | help="# of jobs for k-NN distance (-1 uses all available cores)") 46 | args = vars(ap.parse_args()) 47 | 48 | # grab the list of images that we'll be describing 49 | print("[INFO] describing images...") 50 | imagePaths = list(paths.list_images(args["dataset"])) 51 | 52 | # initialize the raw pixel intensities matrix, the features matrix, 53 | # and labels list 54 | rawImages = [] 55 | features = [] 56 | labels = [] 57 | 58 | # loop over the input images 59 | for (i, imagePath) in enumerate(imagePaths): 60 | # load the image and extract the class label (assuming that our 61 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 62 | image = cv2.imread(imagePath) 63 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 64 | 65 | # extract raw pixel intensity "features", followed by a color 66 | # histogram to characterize the color distribution of the pixels 67 | # in the image 68 | pixels = image_to_feature_vector(image) 69 | hist = extract_color_histogram(image) 70 | 71 | # update the raw images, features, and labels matricies, 72 | # respectively 73 | rawImages.append(pixels) 74 | features.append(hist) 75 | labels.append(label) 76 | 77 | # show an update every 1,000 images 78 | if i > 0 and i % 1000 == 0: 79 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 80 | 81 | # show some information on the memory consumed by the raw images 82 | # matrix and features matrix 83 | rawImages = np.array(rawImages) 84 | features = np.array(features) 85 | labels = np.array(labels) 86 | print("[INFO] pixels matrix: {:.2f}MB".format( 87 | rawImages.nbytes / (1024 * 1000.0))) 88 | print("[INFO] features matrix: {:.2f}MB".format( 89 | features.nbytes / (1024 * 1000.0))) 90 | 91 | # partition 
the data into training and testing splits, using 75% 92 | # of the data for training and the remaining 25% for testing 93 | (trainRI, testRI, trainRL, testRL) = train_test_split( 94 | rawImages, labels, test_size=0.25, random_state=42) 95 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split( 96 | features, labels, test_size=0.25, random_state=42) 97 | 98 | # train and evaluate a k-NN classifer on the raw pixel intensities 99 | print("[INFO] evaluating raw pixel accuracy...") 100 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], 101 | n_jobs=args["jobs"]) 102 | model.fit(trainRI, trainRL) 103 | acc = model.score(testRI, testRL) 104 | print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100)) 105 | 106 | # train and evaluate a k-NN classifer on the histogram 107 | # representations 108 | print("[INFO] evaluating histogram accuracy...") 109 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], 110 | n_jobs=args["jobs"]) 111 | model.fit(trainFeat, trainLabels) 112 | acc = model.score(testFeat, testLabels) 113 | print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100))s -------------------------------------------------------------------------------- /least square estimation linear regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import statsmodels.api as sm 4 | 5 | df = pd.read_csv('longley.csv', index_col=0) 6 | #print df 7 | b = df.Employed 8 | A = df.GNP 9 | A = sm.add_constant(A) 10 | 11 | est = sm.OLS(b, A) 12 | est = est.fit() 13 | print est.summary() 14 | -------------------------------------------------------------------------------- /log_reg.py: -------------------------------------------------------------------------------- 1 | from numpy import loadtxt, where, zeros, e, array, log, ones, mean, where 2 | from pylab import scatter, show, legend, xlabel, ylabel, plot 3 | from scipy.optimize import fmin_bfgs 4 | 5 | 6 | def sigmoid(X): 7 | '''Compute the sigmoid function ''' 8 | #d = zeros(shape=(X.shape)) 9 | 10 | den = 1.0 + e ** (-1.0 * X) 11 | 12 | d = 1.0 / den 13 | 14 | return d 15 | 16 | 17 | def compute_cost(theta, X, y): 18 | ''' 19 | Comput cost for logistic regression 20 | ''' 21 | #Number of training samples 22 | 23 | theta.shape = (1, 3) 24 | 25 | m = y.size 26 | 27 | h = sigmoid(X.dot(theta.T)) 28 | 29 | J = (1.0 / m) * ((-y.T.dot(log(h))) - ((1.0 - y.T).dot(log(1.0 - h)))) 30 | 31 | return - 1 * J.sum() 32 | 33 | 34 | def compute_grad(theta, X, y): 35 | 36 | #print theta.shape 37 | 38 | theta.shape = (1, 3) 39 | 40 | grad = zeros(3) 41 | 42 | h = sigmoid(X.dot(theta.T)) 43 | 44 | delta = h - y 45 | 46 | l = grad.size 47 | 48 | for i in range(l): 49 | sumdelta = delta.T.dot(X[:, i]) 50 | grad[i] = (1.0 / m) * sumdelta * - 1 51 | 52 | theta.shape = (3,) 53 | 54 | return grad 55 | 56 | 57 | #load the dataset 58 | data = loadtxt('ex2data1.txt', delimiter=',') 59 | 60 | X = data[:, 0:2] 61 | y = data[:, 2] 62 | 63 | pos = where(y == 1) 64 | neg = where(y == 0) 65 | scatter(X[pos, 0], X[pos, 1], marker='o', c='b') 66 | scatter(X[neg, 0], X[neg, 1], marker='x', c='r') 67 | xlabel('Exam 1 score') 68 | ylabel('Exam 2 score') 69 | legend(['Not Admitted', 'Admitted']) 70 | #show() 71 | 72 | m, n = X.shape 73 | 74 | y.shape = (m, 1) 75 | 76 | #Add intercept term to x and X_test 77 | it = ones(shape=(m, 3)) 78 | it[:, 1:3] = X 79 | 80 | 81 | def decorated_cost(it, y): 82 | def f(theta): 83 | return compute_cost(theta, it, y) 84 | 85 | def 
fprime(theta): 86 | return compute_grad(theta, it, y) 87 | 88 | #Initialize theta parameters 89 | theta = zeros(3) 90 | 91 | return fmin_bfgs(f, theta, fprime, disp=True, maxiter=400) 92 | 93 | decorated_cost(it, y) 94 | theta = [-25.161272, 0.206233, 0.201470] 95 | 96 | 97 | #Plotting the decision boundary 98 | plot_x = array([min(it[:, 1]) - 2, max(it[:, 2]) + 2]) 99 | plot_y = (- 1.0 / theta[2]) * (theta[1] * plot_x + theta[0]) 100 | plot(plot_x, plot_y) 101 | legend(['Decision Boundary', 'Not admitted', 'Admitted']) 102 | #show() 103 | 104 | prob = sigmoid(array([1.0, 45.0, 85.0]).dot(array(theta).T)) 105 | print 'For a student with scores 45 and 85, we predict and admission ' + \ 106 | 'probability of %f' % prob 107 | 108 | 109 | def predict(theta, X): 110 | '''Predict whether the label 111 | is 0 or 1 using learned logistic 112 | regression parameters ''' 113 | m, n = X.shape 114 | p = zeros(shape=(m, 1)) 115 | 116 | h = sigmoid(X.dot(theta.T)) 117 | 118 | for it in range(0, h.shape[0]): 119 | if h[it] > 0.5: 120 | p[it, 0] = 1 121 | else: 122 | p[it, 0] = 0 123 | 124 | return p 125 | 126 | #Compute accuracy on our training set 127 | p = predict(array(theta), it) 128 | print 'Train Accuracy: %f' % ((y[where(p == y)].size / float(y.size)) * 100.0) 129 | -------------------------------------------------------------------------------- /log_reg_regular.py: -------------------------------------------------------------------------------- 1 | from numpy import loadtxt, where, zeros, e, array, log, ones, append, linspace 2 | from pylab import scatter, show, legend, xlabel, ylabel, contour, title 3 | from scipy.optimize import fmin_bfgs 4 | 5 | 6 | def sigmoid(X): 7 | '''Compute the sigmoid function ''' 8 | #d = zeros(shape=(X.shape)) 9 | 10 | den = 1.0 + e ** (-1.0 * X) 11 | 12 | d = 1.0 / den 13 | 14 | return d 15 | 16 | 17 | def cost_function_reg(theta, X, y, l): 18 | '''Compute the cost and partial derivatives as grads 19 | ''' 20 | 21 | h = sigmoid(X.dot(theta)) 22 | 23 | thetaR = theta[1:, 0] 24 | 25 | J = (1.0 / m) * ((-y.T.dot(log(h))) - ((1 - y.T).dot(log(1.0 - h)))) \ 26 | + (l / (2.0 * m)) * (thetaR.T.dot(thetaR)) 27 | 28 | delta = h - y 29 | sumdelta = delta.T.dot(X[:, 1]) 30 | grad1 = (1.0 / m) * sumdelta 31 | 32 | XR = X[:, 1:X.shape[1]] 33 | sumdelta = delta.T.dot(XR) 34 | 35 | grad = (1.0 / m) * (sumdelta + l * thetaR) 36 | 37 | out = zeros(shape=(grad.shape[0], grad.shape[1] + 1)) 38 | 39 | out[:, 0] = grad1 40 | out[:, 1:] = grad 41 | 42 | return J.flatten(), out.T.flatten() 43 | 44 | 45 | def map_feature(x1, x2): 46 | ''' 47 | Maps the two input features to quadratic features. 48 | 49 | Returns a new feature array with more features, comprising of 50 | X1, X2, X1 ** 2, X2 ** 2, X1*X2, X1*X2 ** 2, etc... 
51 | 52 | Inputs X1, X2 must be the same size 53 | ''' 54 | x1.shape = (x1.size, 1) 55 | x2.shape = (x2.size, 1) 56 | degree = 6 57 | out = ones(shape=(x1[:, 0].size, 1)) 58 | 59 | m, n = out.shape 60 | 61 | for i in range(1, degree + 1): 62 | for j in range(i + 1): 63 | r = (x1 ** (i - j)) * (x2 ** j) 64 | out = append(out, r, axis=1) 65 | 66 | return out 67 | 68 | #load the dataset 69 | data = loadtxt('ex2data2.txt', delimiter=',') 70 | 71 | X = data[:, 0:2] 72 | y = data[:, 2] 73 | 74 | pos = where(y == 1) 75 | neg = where(y == 0) 76 | scatter(X[pos, 0], X[pos, 1], marker='o', c='b') 77 | scatter(X[neg, 0], X[neg, 1], marker='x', c='r') 78 | xlabel('Microchip Test 1') 79 | ylabel('Microchip Test 2') 80 | legend(['y = 1', 'y = 0']) 81 | #show() 82 | 83 | m, n = X.shape 84 | 85 | y.shape = (m, 1) 86 | 87 | it = map_feature(X[:, 0], X[:, 1]) 88 | 89 | #Initialize theta parameters 90 | initial_theta = zeros(shape=(it.shape[1], 1)) 91 | 92 | #Set regularization parameter lambda to 1 93 | l = 1 94 | 95 | # Compute and display initial cost and gradient for regularized logistic 96 | # regression 97 | cost, grad = cost_function_reg(initial_theta, it, y, l) 98 | 99 | def decorated_cost(theta): 100 | return cost_function_reg(theta, it, y, l) 101 | 102 | print fmin_bfgs(decorated_cost, initial_theta, maxfun=400) 103 | 104 | 105 | 106 | 107 | theta = [ 108 | 1.273005, 109 | 0.624876, 110 | 1.177376, 111 | -2.020142, 112 | -0.912616, 113 | -1.429907, 114 | 0.125668, 115 | -0.368551, 116 | -0.360033, 117 | -0.171068, 118 | -1.460894, 119 | -0.052499, 120 | -0.618889, 121 | -0.273745, 122 | -1.192301, 123 | -0.240993, 124 | -0.207934, 125 | -0.047224, 126 | -0.278327, 127 | -0.296602, 128 | -0.453957, 129 | -1.045511, 130 | 0.026463, 131 | -0.294330, 132 | 0.014381, 133 | -0.328703, 134 | -0.143796, 135 | -0.924883, 136 | ] 137 | 138 | #Plot Boundary 139 | u = linspace(-1, 1.5, 50) 140 | v = linspace(-1, 1.5, 50) 141 | z = zeros(shape=(len(u), len(v))) 142 | for i in range(len(u)): 143 | for j in range(len(v)): 144 | z[i, j] = (map_feature(array(u[i]), array(v[j])).dot(array(theta))) 145 | 146 | z = z.T 147 | contour(u, v, z) 148 | title('lambda = %f' % l) 149 | xlabel('Microchip Test 1') 150 | ylabel('Microchip Test 2') 151 | legend(['y = 1', 'y = 0', 'Decision boundary']) 152 | show() 153 | 154 | 155 | def predict(theta, X): 156 | '''Predict whether the label 157 | is 0 or 1 using learned logistic 158 | regression parameters ''' 159 | m, n = X.shape 160 | p = zeros(shape=(m, 1)) 161 | 162 | h = sigmoid(X.dot(theta.T)) 163 | 164 | for it in range(0, h.shape[0]): 165 | if h[it] > 0.5: 166 | p[it, 0] = 1 167 | else: 168 | p[it, 0] = 0 169 | 170 | return p 171 | 172 | 173 | #% Compute accuracy on our training set 174 | p = predict(array(theta), it) 175 | print 'Train Accuracy: %f' % ((y[where(p == y)].size / float(y.size)) * 100.0) 176 | -------------------------------------------------------------------------------- /moving average.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def running_mean(l, N): 4 | # Also works for the(strictly invalid) cases when N is even. 
5 | if (N//2)*2 == N: 6 | N = N - 1 7 | front = np.zeros(N//2) 8 | back = np.zeros(N//2) 9 | 10 | for i in range(1, (N//2)*2, 2): 11 | front[i//2] = np.convolve(l[:i], np.ones((i,))/i, mode = 'valid') 12 | for i in range(1, (N//2)*2, 2): 13 | back[i//2] = np.convolve(l[-i:], np.ones((i,))/i, mode = 'valid') 14 | return np.concatenate([front, np.convolve(l, np.ones((N,))/N, mode = 'valid'), back[::-1]]) 15 | 16 | 17 | print running_mean(2,21) -------------------------------------------------------------------------------- /rnn_keras_timeseries_stock.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 1, 2017 3 | 4 | ''' 5 | #import matplotlib.pyplot as plt 6 | import numpy as np 7 | import time 8 | import csv 9 | from keras.layers.core import Dense, Activation, Dropout 10 | from keras.layers.recurrent import LSTM 11 | from keras.models import Sequential 12 | import sys 13 | np.random.seed(1234) 14 | 15 | 16 | def read_data(path_to_dataset, 17 | sequence_length=50, 18 | ratio=1.0): 19 | 20 | max_values = ratio * 2049280 21 | 22 | with open(path_to_dataset) as f: 23 | data = csv.reader(f, delimiter=",") 24 | power = [] 25 | nb_of_values = 0 26 | for line in data: 27 | #print(line) 28 | #if nb_of_values == 3500: 29 | # break 30 | try: 31 | power.append(float(line[1])) 32 | nb_of_values += 1 33 | except ValueError: 34 | pass 35 | # 2049280.0 is the total number of valid values, i.e. ratio = 1.0 36 | if nb_of_values >= max_values: 37 | break 38 | return power 39 | 40 | def convert_to_categorical_increasing(current, future): 41 | change = (future-current)*100/current 42 | if change > 0.005: 43 | return 1 44 | else: 45 | return 0 46 | 47 | def convert_to_categorical_decreasing(current, future): 48 | change = (future-current)*100/current 49 | if change < 0.005: 50 | return 0 51 | else: 52 | return 1 53 | 54 | def create_matrix(y_train): 55 | y = [[0 for i in xrange(2)] for j in xrange(len(y_train))] 56 | for i in range(len(y_train)): 57 | #print y_train[i] 58 | y[i][y_train[i]] = 1 59 | return y 60 | 61 | 62 | def process_data(power, sequence_length, ratio, increasing, error): 63 | #print("Data loaded from csv. 
Formatting...") 64 | #fig = plt.figure() 65 | #plt.plot(power) 66 | #plt.show() 67 | result = [] 68 | if not error: 69 | for i in range(len(power)-1): 70 | if increasing: 71 | power[i] = convert_to_categorical_increasing(power[i], power[i+1]) 72 | else: 73 | power[i] = convert_to_categorical_decreasing(power[i], power[i+1]) 74 | for index in range(len(power) - sequence_length-1): 75 | result.append(power[index: index + sequence_length]) 76 | result = np.array(result) # shape (2049230, 50) 77 | 78 | #result = np.log(result+1) 79 | #print result 80 | #exit(0) 81 | # print ("Shift : ", result_mean) 82 | #print ("Data : ", result.shape) 83 | 84 | row = int(round(0.9 * result.shape[0])) 85 | #:print row 86 | train = result[:row, :] 87 | np.random.shuffle(train) 88 | X_train = train[:, :-1] 89 | y_train_temp = train[:, -1] 90 | #print y_train_temp 91 | y_train = create_matrix(y_train_temp) 92 | X_test = result[row:, :-1] 93 | y_test = result[row:, -1] 94 | 95 | X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) 96 | X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) 97 | 98 | return [X_train, y_train, X_test, y_test] 99 | 100 | 101 | def build_model(): 102 | model = Sequential() 103 | layers = [1, 100, 50, 2] 104 | 105 | model.add(LSTM( 106 | layers[1], 107 | input_shape=(None, layers[0]), 108 | return_sequences=True)) 109 | model.add(Dropout(0.2)) 110 | 111 | model.add(LSTM( 112 | layers[2], 113 | return_sequences=False)) 114 | model.add(Dropout(0.2)) 115 | 116 | model.add(Dense( 117 | layers[3])) 118 | model.add(Activation('softmax')) 119 | 120 | start = time.time() 121 | model.compile(loss="categorical_crossentropy", optimizer="adam") 122 | #print ("Compilation Time : ", time.time() - start) 123 | return model 124 | 125 | 126 | def run_network(data=None, increasing=False, error=False): 127 | global_start_time = time.time() 128 | epochs = 2 129 | ratio = 0.5 130 | sequence_length = 100 131 | 132 | X_train, y_train, X_test, y_test = process_data( 133 | data, sequence_length, ratio,increasing, error) 134 | 135 | #print ('\nData Loaded. 
Compiling...\n') 136 | 137 | model = build_model() 138 | 139 | try: 140 | model.fit( 141 | X_train, y_train, 142 | batch_size=512, nb_epoch=epochs, validation_split=0.05, verbose=0) 143 | predicted = model.predict(X_test) 144 | #predicted = np.reshape(predicted, (predicted.size,)) 145 | except KeyboardInterrupt: 146 | #:print ('Training duration (s) : ', time.time() - global_start_time) 147 | return model, y_test, 0 148 | 149 | try: 150 | fig = plt.figure() 151 | ax = fig.add_subplot(111) 152 | ax.plot(y_test[:100]*result_max) 153 | plt.plot(predicted[:100]*result_max) 154 | plt.show() 155 | except Exception as e: 156 | pass 157 | #print (str(e)) 158 | #print ('Training duration (s) : ', time.time() - global_start_time) 159 | 160 | return y_test, predicted 161 | 162 | 163 | if __name__ == '__main__': 164 | path_to_dataset = 'data/CLSB Comdty.csv'#'data/20170818/CLSB Comdty.csv' 165 | data = read_data(path_to_dataset) 166 | #print len(data) 167 | success = 0 168 | fail = 0 169 | success1 = 0 170 | fail1 = 0 171 | false_low = 0 172 | false_high = 0 173 | error = [] 174 | err_predicted = {} 175 | mean_std_inc = 0 176 | mean_std_dec = 0 177 | mean_std_err = 0 178 | e = False 179 | count = 0 180 | count_bad = 0 181 | for i in range(0,len(data)-1000,89): 182 | count = count + 1 183 | if count_bad >=2: 184 | count_bad = 0 185 | K = 1 186 | continue 187 | #success = 0 188 | #fail = 0 189 | #false_low = 0 190 | #false_high = 0 191 | d1 = data[i:i+1001] 192 | d2 = data[i:i+1001] 193 | y_test_increasing, predicted_increasing = run_network(d1, True, False) 194 | y_test_decreasing, predicted_decreasing = run_network(d2, False, False) 195 | if count > 11 and len(error) >= 1000: 196 | err_test, err_predicted = run_network(error, True, True) 197 | #print "error predicted" 198 | prob_increasing = predicted_increasing[:,1] 199 | increasing_mean = prob_increasing.mean() 200 | increasing_std = prob_increasing.std() 201 | prob_decreasing = predicted_decreasing[:,0] 202 | decreasing_mean = prob_decreasing.mean() 203 | decreasing_std = prob_decreasing.std() 204 | if i > 0: 205 | mean_std_inc = (mean_std_inc + increasing_std)/2 206 | mean_std_dec = (mean_std_dec + decreasing_std)/2 207 | else: 208 | mean_std_inc = increasing_std 209 | mean_std_dec = decreasing_std 210 | 211 | y_test_decreasing -= 1 212 | prob_err = [] 213 | prob_err_mean = 0 214 | prob_err_std = 0 215 | if count > 11 and len(error) >= 1000: 216 | prob_err = err_predicted[:,0] 217 | prob_err_mean = prob_err.mean() 218 | prob_err_std = prob_err.std() 219 | error = error[90:] 220 | #print "mean calculated" 221 | #print prob_err 222 | #if i > 11 and len(error_increasing) >= 1000 and len(err_predicted_increasing)>=90: 223 | if True: 224 | mean_std_err = 0 225 | K = 1 226 | K1 = 1 227 | if success != 0: 228 | acc_with = success*100/(success + fail) 229 | #acc_wo = success1*100/(success1 + fail1) 230 | print acc_with 231 | if acc_with < 58 and success != 0: 232 | K = K/2 233 | count_bad = count_bad + 1 234 | #mean_std_inc = 0 235 | #mean_std_dec = 0 236 | if K1 == 1: 237 | K1 = 1.5 238 | else: 239 | K1 = 1 240 | if acc_with > 58: 241 | K = 1/2 242 | #mean_std_err = 0 243 | if mean_std_err == 0: 244 | mean_std_err = prob_err_std 245 | else: 246 | mean_std_err = (prob_err_std + mean_std_err)/2 247 | for j in range(len(y_test_decreasing)-1): 248 | ac_status = y_test_increasing[j] + y_test_decreasing[j] 249 | pr_status = 0 250 | if True: 251 | inc = (prob_increasing[j] - increasing_mean + K1*mean_std_inc) 252 | dec = (prob_decreasing[j] - decreasing_mean + 
K1*mean_std_dec) 253 | #print inc,dec 254 | if inc > 0 or dec > 0: 255 | if inc > dec: 256 | pr_status = 1 257 | else: 258 | pr_status = -1 259 | else: 260 | pr_status = 0 261 | if ac_status != pr_status: 262 | error.append(0) 263 | else: 264 | error.append(1) 265 | if ac_status != 0: 266 | if inc > 0 and dec > 0: 267 | continue 268 | if count >= 10 and len(error) >= 1000 and len(err_predicted)>=90: 269 | if ac_status == pr_status: 270 | success1 = success1 + 1 271 | else: 272 | fail1 = fail1 + 1 273 | if (prob_err[j] - prob_err_mean) > K*mean_std_err: 274 | pr_status = -1*pr_status 275 | else: 276 | if ac_status == pr_status: 277 | success = success + 1 278 | else: 279 | fail = fail + 1 280 | #else: 281 | # success = success1 282 | # fail = fail1 283 | #print ac_status,',',pr_status, ',',prob_increasing[j],',',prob_decreasing[j] 284 | print success,',',fail,',',count #success1,',',fail1,',', count #,',',false_high,',',false_low #,',',increasing_std,',',decreasing_std,',',increasing_mean,',',decreasing_mean 285 | -------------------------------------------------------------------------------- /se_test1.py: -------------------------------------------------------------------------------- 1 | # To install the Python client library: 2 | # pip install -U selenium 3 | 4 | # Import the Selenium 2 namespace (aka "webdriver") 5 | from selenium import webdriver 6 | 7 | # Google Chrome 8 | driver = webdriver.Chrome('C:\Users\User\Downloads\chromedriver_win32\chromedriver.exe') 9 | 10 | # ------------------------------ 11 | # The actual test scenario: Test the codepad.org code execution service. 12 | 13 | # Go to codepad.org 14 | driver.get('http://codepad.org') 15 | 16 | # Select the Python language option 17 | python_link = driver.find_elements_by_xpath("//input[@name='lang' and @value='Python']")[0] 18 | python_link.click() 19 | 20 | # Enter some text! 21 | text_area = driver.find_element_by_id('textarea') 22 | text_area.send_keys("print 'Good,' + ' Morning!'") 23 | 24 | # Submit the form! 25 | submit_switch = driver.find_element_by_name('submit') 26 | submit_switch.click() 27 | 28 | # Make this an actual test. Isn't Python beautiful? 29 | assert "Good, Morning!" in driver.get_page_source() 30 | 31 | # Close the browser! 
32 | driver.quit() -------------------------------------------------------------------------------- /src/MainBDAS.java: -------------------------------------------------------------------------------- 1 | 2 | import java.io.IOException; 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 16 | 17 | import java.io.File; 18 | 19 | import javax.xml.parsers.DocumentBuilder; 20 | import javax.xml.parsers.DocumentBuilderFactory; 21 | 22 | import org.w3c.dom.Document; 23 | 24 | 25 | /** 26 | * 27 | */ 28 | 29 | /** 30 | * @author SayanM 31 | * 32 | */ 33 | public class MainBDAS { 34 | 35 | public static class MapperBDAS extends Mapper { 36 | 37 | protected void map(LongWritable key, Text value, Context context) 38 | throws IOException, InterruptedException { 39 | String classname = context.getConfiguration().get("classname"); 40 | 41 | try { 42 | RootBDAS instance = (RootBDAS) Class.forName(classname).getConstructor().newInstance(); 43 | String line = value.toString(); 44 | HashMap> result = instance.mapper_task(line); 45 | for(String k : result.keySet()) 46 | { 47 | for(String v : result.get(k)) 48 | { 49 | context.write(new Text(k), new Text(v)); 50 | } 51 | } 52 | } catch (Exception e) { 53 | // TODO Auto-generated catch block 54 | e.printStackTrace(); 55 | } 56 | 57 | 58 | } 59 | 60 | } 61 | 62 | public static class ReducerBDAS extends Reducer { 63 | 64 | protected void reduce(Text key, Iterable values, 65 | Context context) throws IOException, InterruptedException { 66 | String classname = context.getConfiguration().get("classname"); 67 | 68 | try { 69 | RootBDAS instance = (RootBDAS) Class.forName(classname).getConstructor().newInstance(); 70 | ArrayList vals = new ArrayList(); 71 | for(Text v : values) 72 | { 73 | vals.add(v.toString()); 74 | } 75 | HashMap> result = instance.reducer_task(key.toString(), vals); 76 | for(String k : result.keySet()) 77 | { 78 | for(String v : result.get(k)) 79 | { 80 | context.write(new Text(k), new Text(v)); 81 | } 82 | } 83 | } catch (Exception e) { 84 | // TODO Auto-generated catch block 85 | e.printStackTrace(); 86 | } 87 | 88 | } 89 | 90 | } 91 | 92 | public static void main(String[] args) throws Exception { 93 | // TODO Auto-generated method stub 94 | 95 | Job job = new Job(); 96 | 97 | job.setJarByClass(MainBDAS.class); 98 | job.setJobName("MapReduceBDAS"); 99 | 100 | job.setOutputKeyClass(Text.class); 101 | job.setOutputValueClass(Text.class); 102 | 103 | job.setInputFormatClass(TextInputFormat.class); 104 | job.setOutputFormatClass(TextOutputFormat.class); 105 | 106 | 107 | FileInputFormat.setInputPaths(job, new Path(args[0])); 108 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 109 | job.setMapperClass(MapperBDAS.class); 110 | job.setReducerClass(ReducerBDAS.class); 111 | 112 | File file = new File("Config.xml"); 113 | DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); 114 | DocumentBuilder db = dbf.newDocumentBuilder(); 115 | Document doc = db.parse(file); 116 | 
doc.getDocumentElement().normalize(); 117 | 118 | String classname = Utility.getClassName(doc); 119 | 120 | job.getConfiguration().set("classname", classname); 121 | 122 | System.out.println(job.waitForCompletion(true)); 123 | 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/RootBDAS.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | public abstract class RootBDAS { 13 | abstract HashMap> mapper_task(String line); 14 | abstract HashMap> reducer_task(String key, ArrayList values); 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/Utility.java: -------------------------------------------------------------------------------- 1 | import org.w3c.dom.Document; 2 | import org.w3c.dom.NodeList; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | public class Utility { 13 | 14 | public static String getClassName(Document doc) 15 | { 16 | NodeList nodeLst = doc.getElementsByTagName("ClassName"); 17 | return nodeLst.item(0).getNodeValue(); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/WordCounterBDAS.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | 13 | 14 | public final class WordCounterBDAS extends RootBDAS{ 15 | 16 | @Override 17 | HashMap> mapper_task(String line) { 18 | // TODO Auto-generated method stub 19 | String[] words = line.split(" "); 20 | HashMap> result = new HashMap>(); 21 | for(String w : words) 22 | { 23 | if(result.containsKey(w)) 24 | { 25 | ArrayList vals = result.get(w); 26 | vals.add("1"); 27 | result.put(w, vals); 28 | } 29 | else 30 | { 31 | ArrayList vals = new ArrayList(); 32 | vals.add("1"); 33 | result.put(w, vals); 34 | } 35 | } 36 | return result; 37 | } 38 | 39 | @Override 40 | HashMap> reducer_task(String key, ArrayList values) { 41 | // TODO Auto-generated method stub 42 | HashMap> result = new HashMap>(); 43 | ArrayList tempres = new ArrayList(); 44 | tempres.add(values.size()+ ""); 45 | result.put(key, tempres); 46 | return result; 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/testBDAS.java: -------------------------------------------------------------------------------- 1 | import static org.junit.Assert.*; 2 | 3 | import org.junit.After; 4 | import org.junit.AfterClass; 5 | import org.junit.Before; 6 | import org.junit.BeforeClass; 7 | import org.junit.Test; 8 | 9 | 10 | public class testBDAS { 11 | 12 | public void testMapper(){ 13 | 14 | } 15 | 16 | public void testReducer(){ 17 | 18 | } 19 | 20 | } 21 | --------------------------------------------------------------------------------
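The Java sources under src/ form a small plug-in harness: MainBDAS reads a ClassName element from Config.xml, instantiates the named RootBDAS subclass (for example WordCounterBDAS), and runs it as the Hadoop mapper and reducer; testBDAS is left as empty stubs. The same word count can also be run without compiling Java by using Hadoop Streaming with two short Python scripts. The sketch below is illustrative only — the file names mapper.py and reducer.py and the streaming invocation are not part of the repository:

```python
#!/usr/bin/env python
# mapper.py - emit (word, 1) for every word on stdin, mirroring WordCounterBDAS.mapper_task
import sys

for line in sys.stdin:
    for word in line.split():
        print('%s\t%d' % (word, 1))
```

```python
#!/usr/bin/env python
# reducer.py - sum the counts per word, mirroring WordCounterBDAS.reducer_task
# (Hadoop Streaming hands the mapper output to the reducer sorted by key)
import sys

current_word, count = None, 0
for line in sys.stdin:
    word, value = line.rstrip('\n').split('\t', 1)
    if word != current_word:
        if current_word is not None:
            print('%s\t%d' % (current_word, count))
        current_word, count = word, 0
    count += int(value)
if current_word is not None:
    print('%s\t%d' % (current_word, count))
```

A typical invocation (the streaming jar path varies by Hadoop distribution) is `hadoop jar hadoop-streaming.jar -input <in> -output <out> -mapper mapper.py -reducer reducer.py -file mapper.py -file reducer.py`.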