├── .gitattributes ├── 9781484234495.jpg ├── Airline2.py ├── Airline_friendly_patient_efficient cabin crew.py ├── Airline_worst_experience_ever.py ├── Arima.py ├── Contributing.md ├── LICENSE.txt ├── NumericalDataClassification.py ├── README.md ├── Recursive_Levenshtein.py ├── Text in clustering.py ├── airline1.py ├── arma.py ├── autoregressive model.py ├── classifier1.py ├── classifier2.py ├── clustering_with_k_mean.py ├── code.R ├── customsearch.py ├── errata.md ├── knn_classifier.py ├── least square estimation linear regression.py ├── log_reg.py ├── log_reg_regular.py ├── moving average.py ├── rnn_keras_timeseries_stock.py ├── se_test1.py └── src ├── MainBDAS.java ├── RootBDAS.java ├── Utility.java ├── WordCounterBDAS.java └── testBDAS.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484234495.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/advanced-data-analytics-using-python/2133b66317ddddee65c9e5c2d861effa722e4daf/9781484234495.jpg -------------------------------------------------------------------------------- /Airline2.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('Air India did a poor job of queue management both times.', 'staff service'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | 7 | tests = ['Food is good.', 'Colombo to Moscow via Delhi.'] 8 | for c in tests: 9 | print c,'\t',cl.classify(c) -------------------------------------------------------------------------------- /Airline_friendly_patient_efficient cabin crew.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('This flight from Bodhgaya to Delhi was the latest of about 20 domestic flights sectors completed with Air India over the last 3 - 4 years.', 'pos'),('The level of service has remained consistent in all areas, with some aspects of service stronger than others.', 'pos'),('This flight was originally due to depart Bodhgaya at 14.35h arriving in Delhi at 16.25h.', 'pos'),('An initial delay to a revised 16.00h departure was notified by e-mail several hours in advance.', 'pos'),('We had no idea how long the delay would be until we saw our aircraft land.', 'neg'),('Except for an expensive coffee kiosk there were no customer facilities while waiting.', 'neg')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | print(cl.classify('Check-in queue management at Bodhgaya airport was poor but agent service was good once you eventually got to the check-in desk.')) -------------------------------------------------------------------------------- /Airline_worst_experience_ever.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers 
import NaiveBayesClassifier 2 | 3 | train = [("It's a very long and exhausting flight!", 'pos'),('I traveled alone with my 9 month old son from Frankfurt to Delhi and onto Sydney!', 'pos'),('At the end they wrote a manual boarding pass.', 'pos'),("I think everybody knows with a baby it's even harder!", 'pos'),('After this 2 hours they gave me just the boarding pass until Delhi and I had to run carrying my son and all the luggage to the boarding as this had started already.', 'pos'),("The first problems started when they weren't be able to print the boarding pass for my son.", 'neg'),("They kept me 2 hours in front of the check-in that I couldn't care properly for my son.", 'neg'),('No toilets, no food again - just hard chairs to wait on.', 'neg'),("They didn't care.", 'neg'),('I got more and more stressed.', 'pos')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | print(cl.classify("Worst experience ever with Air India!")) -------------------------------------------------------------------------------- /Arima.py: -------------------------------------------------------------------------------- 1 | from pandas import read_csv 2 | from pandas import datetime 3 | from matplotlib import pyplot 4 | from statsmodels.tsa.arima_model import ARIMA 5 | from sklearn.metrics import mean_squared_error 6 | 7 | def parser(p): 8 | return datetime.strptime('190'+p, '%Y-%m') 9 | 10 | series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser) 11 | P = series.values 12 | size = int(len(P) * 0.66) 13 | train, test = P[0:size], P[size:len(P)] 14 | history = [p for p in train] 15 | predictions = list() 16 | for t in range(len(test)): 17 | model = ARIMA(history, order=(5,1,0)) 18 | model_fit = model.fit(disp=0) 19 | output = model_fit.forecast() 20 | yhat = output[0] 21 | predictions.append(yhat) 22 | obs = test[t] 23 | history.append(obs) 24 | print('predicted=%f, expected=%f' % (yhat, obs)) 25 | error = mean_squared_error(test, predictions) 26 | print('Test MSE: %.3f' % error) 27 | # plot 28 | pyplot.plot(test) 29 | pyplot.plot(predictions, color='red') 30 | pyplot.show() -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! 
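A note on Arima.py above: it targets the pandas and statsmodels APIs current when the book was written; `from pandas import datetime`, the `squeeze` and `date_parser` arguments of `read_csv`, and `statsmodels.tsa.arima_model.ARIMA` have since been deprecated or removed. A minimal sketch of the same rolling one-step forecast against the newer `statsmodels.tsa.arima.model.ARIMA` API, assuming the same `shampoo-sales.csv` layout and (5,1,0) order as the original script, would look roughly like this:

```python
# Hypothetical port of Arima.py to current pandas/statsmodels releases.
# File name, column layout, and the (5,1,0) order are taken from the original script.
from pandas import read_csv, to_datetime
from statsmodels.tsa.arima.model import ARIMA  # replaces statsmodels.tsa.arima_model.ARIMA
from sklearn.metrics import mean_squared_error

series = read_csv('shampoo-sales.csv', header=0, index_col=0).squeeze("columns")
series.index = to_datetime(['190' + d for d in series.index], format='%Y-%m')

P = series.values
size = int(len(P) * 0.66)
train, test = P[:size], P[size:]
history = list(train)
predictions = []
for t in range(len(test)):
    model_fit = ARIMA(history, order=(5, 1, 0)).fit()  # the new fit() takes no disp argument
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(test[t])
print('Test MSE: %.3f' % mean_squared_error(test, predictions))
```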
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2018 Sayan Mukhopadhyay 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /NumericalDataClassification.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | import operator 4 | import math 5 | 6 | 7 | def euclideanDistance(instance1, instance2, length): 8 | distance = 0 9 | for x in range(length): 10 | distance += pow((instance1[x] - instance2[x]), 2) 11 | return math.sqrt(distance) 12 | 13 | def getNeighbors(trainingSet, testInstance, k): 14 | distances = [] 15 | length = len(testInstance)-1 16 | for x in range(len(trainingSet)): 17 | dist = euclideanDistance(testInstance, trainingSet[x], length) 18 | distances.append((trainingSet[x], dist)) 19 | distances.sort(key=operator.itemgetter(1)) 20 | neighbors = [] 21 | for x in range(k): 22 | neighbors.append(distances[x][0]) 23 | return neighbors 24 | 25 | trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']] 26 | testInstance = [5, 5, 5] 27 | k = 1 28 | neighbors = getNeighbors(trainSet, testInstance, 1) 29 | print(neighbors) 30 | 31 | exit(0) 32 | 33 | def loadDataset(filename, split, trainingSet=[] , testSet=[]): 34 | with open(filename, 'rb') as csvfile: 35 | lines = csv.reader(csvfile) 36 | dataset = list(lines) 37 | for x in range(len(dataset)-1): 38 | for y in range(4): 39 | dataset[x][y] = float(dataset[x][y]) 40 | if random.random() < split: 41 | trainingSet.append(dataset[x]) 42 | else: 43 | testSet.append(dataset[x]) 44 | trainingSet=[] 45 | testSet=[] 46 | loadDataset('irisdata.txt', 0.66, trainingSet, testSet) 47 | print 'Train: ' + repr(len(trainingSet)) 48 | print 'Test: ' + repr(len(testSet)) 49 | 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Advanced Data Analytics 
Using Python*](https://www.apress.com/9781484234495) by Sayan Mukhopadhyay (Apress, 2018). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484234495.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /Recursive_Levenshtein.py: -------------------------------------------------------------------------------- 1 | def LD(s, t): 2 | if s == "": 3 | return len(t) 4 | if t == "": 5 | return len(s) 6 | if s[-1] == t[-1]: 7 | cost = 0 8 | else: 9 | cost = 1 10 | 11 | res = min([LD(s[:-1], t)+1, 12 | LD(s, t[:-1])+1, 13 | LD(s[:-1], t[:-1]) + cost]) 14 | return res 15 | print(LD("Python", "Peithen")) -------------------------------------------------------------------------------- /Text in clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def LD(s, t): 5 | if s == "": 6 | return len(t) 7 | if t == "": 8 | return len(s) 9 | if s[-1] == t[-1]: 10 | cost = 0 11 | else: 12 | cost = 1 13 | 14 | res = min([LD(s[:-1], t)+1, 15 | LD(s, t[:-1])+1, 16 | LD(s[:-1], t[:-1]) + cost]) 17 | return res 18 | 19 | def find_centre(x, X, mu): 20 | min = 100 21 | cent = 0 22 | for c in mu: 23 | dist = LD(x, X[c]) 24 | if dist < min: 25 | min = dist 26 | cent = c 27 | return cent 28 | 29 | 30 | def cluster_points(X, mu): 31 | clusters = {} 32 | for x in X: 33 | bestmukey = find_centre(x, X, mu) 34 | try: 35 | clusters[bestmukey].append(x) 36 | except KeyError: 37 | clusters[bestmukey] = [x] 38 | return clusters 39 | 40 | def reevaluate_centers(mu, clusters): 41 | newmu = [] 42 | keys = sorted(clusters.keys()) 43 | for k in keys: 44 | newmu.append(k) 45 | return newmu 46 | 47 | def has_converged(mu, oldmu): 48 | return sorted(mu) == sorted(oldmu) 49 | 50 | def find_centers(X, K): 51 | oldmu = random.sample(range(0,5), K) 52 | mu = random.sample(range(0,5), K) 53 | while not has_converged(mu, oldmu): 54 | oldmu = mu 55 | # Assign all points in X to clusters 56 | clusters = cluster_points(X, mu) 57 | # Reevaluate centers 58 | mu = reevaluate_centers(oldmu, clusters) 59 | return(mu, clusters) 60 | 61 | X = ['Delhi','Dehli', 'Delli','Kolkata','Kalkata','Kalkota'] 62 | 63 | print(find_centers(X,2)) 64 | -------------------------------------------------------------------------------- /airline1.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import random 4 | 5 | url = "http://www.airlinequality.com/airline-reviews/air-india/page/2/" 6 | 7 | agent = "Mozilla/5.0 (Windows NT 6.2) Firefox/40.1" 8 | 9 | headers = {'user-agent': agent} 10 | r = requests.get(url, headers=headers) 11 | 12 | data = r.content 13 | print(data) 14 | exit() 15 | soup = BeautifulSoup(data) 16 | for div in soup.findAll("div", { "class" : "text_content" }): 17 | print(str(div)) -------------------------------------------------------------------------------- /arma.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import stats 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | import statsmodels.api as 
sm 6 | 7 | from statsmodels.graphics.api import qqplot 8 | 9 | print sm.datasets.sunspots.NOTE 10 | 11 | # Number of Observation - 309 (Annual 1700 - 2008) 12 | # Number of Variable – 1 13 | # Variable name definitions:: 14 | # SUNACTIVITY - Number of sunspots for each year 15 | # The data file contains a 'YEAR' variable that is not returned by load. 16 | 17 | dta = sm.datasets.sunspots.load_pandas().data 18 | 19 | dta.index = pandas.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) 20 | del dta["YEAR"] 21 | 22 | -------------------------------------------------------------------------------- /autoregressive model.py: -------------------------------------------------------------------------------- 1 | from pandas import Series 2 | from matplotlib import pyplot 3 | from statsmodels.tsa.ar_model import AR 4 | from sklearn.metrics import mean_squared_error 5 | series = Series.from_csv('daily-minimum-temperatures.csv', header=0) 6 | 7 | J = series.values 8 | train, test = J[1:len(J)-7], J[len(J)-7:] 9 | 10 | model = AR(train) 11 | model_fit = model.fit() 12 | print('Lag: %s' % model_fit.k_ar) 13 | print('Coefficients: %s' % model_fit.params) 14 | 15 | predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False) 16 | for t in range(len(predictions)): 17 | print('predicted=%f, expected=%f' % (predictions[t], test[t])) 18 | error = mean_squared_error(test, predictions) 19 | print('Test MSE: %.3f' % error) 20 | 21 | pyplot.plot(test) 22 | pyplot.plot(predictions, color='red') 23 | pyplot.show() 24 | -------------------------------------------------------------------------------- /classifier1.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('Air India did a poor job of queue management both times.', 'staff service'),('I love this sandwich.', 'pos'),('this is an amazing place!', 'pos'),('I feel very good about these beers.', 'pos'),('this is my best work.', 'pos'),("what an awesome view", 'pos'),('I do not like this restaurant', 'neg'),('I am tired of this stuff.', 'neg'),("I can't deal with this", 'neg'),('he is my sworn enemy!', 'neg'),('my boss is horrible.', 'neg'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown')] 4 | 5 | cl = NaiveBayesClassifier(train) 6 | 7 | tests = ['Food is good.', 'Colombo to Moscow via Delhi.'] 8 | for c in tests: 9 | print c,'\t',cl.classify(c) -------------------------------------------------------------------------------- /classifier2.py: -------------------------------------------------------------------------------- 1 | from textblob.classifiers import NaiveBayesClassifier 2 | 3 | train = [('I love this sandwich.', 'pos'),('Air India did a poor job of queue management both times.', 'staff service'),("The 'cleaning' by flight attendants involved regularly spraying air freshener in the lavatories.", 'staff'),('The food tasted decent.', 'food'),('Flew Air India direct from New York to Delhi round trip.', 'route'),('Colombo to Moscow via Delhi.', 'route'),('Flew Birmingham to Delhi with Air India.', 'route'),('Without toilet, food or anything!', 
'food'),('Cabin crew announcements included a sincere apology for the delay.', 'cabin flown'), 4 | ('this is an amazing place!', 'pos'),('I feel very good about these beers.', 'pos'),('this is my best work.', 'pos'),("what an awesome view", 'pos'),('I do not like this restaurant', 'neg'),('I am tired of this stuff.', 'neg'),("I can't deal with this", 'neg'),('he is my sworn enemy!', 'neg'),('my boss is horrible.', 'neg')] 5 | 6 | cl = NaiveBayesClassifier(train) 7 | print (cl.classify("This is an amazing library!")) -------------------------------------------------------------------------------- /clustering_with_k_mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def cluster_points(X, mu): 5 | clusters = {} 6 | for x in X: 7 | bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \ 8 | for i in enumerate(mu)], key=lambda t:t[1])[0] 9 | try: 10 | clusters[bestmukey].append(x) 11 | except KeyError: 12 | clusters[bestmukey] = [x] 13 | return clusters 14 | 15 | def reevaluate_centers(mu, clusters): 16 | newmu = [] 17 | keys = sorted(clusters.keys()) 18 | for k in keys: 19 | newmu.append(np.mean(clusters[k], axis = 0)) 20 | return newmu 21 | 22 | def has_converged(mu, oldmu): 23 | return (set([tuple(b) for b in mu]) == set([tuple(b) for b in oldmu])) 24 | 25 | def find_centers(X, K): 26 | oldmu = random.sample(X, K) 27 | mu = random.sample(X, K) 28 | while not has_converged(mu, oldmu): 29 | oldmu = mu 30 | 31 | clusters = cluster_points(X, mu) 32 | 33 | mu = reevaluate_centers(oldmu, clusters) 34 | return(mu, clusters) 35 | 36 | X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(10)]) 37 | 38 | print(find_centers(X,2)) 39 | -------------------------------------------------------------------------------- /code.R: -------------------------------------------------------------------------------- 1 | asm_weekwise<-read.csv("F:/souravda/New ASM Weekwise.csv",header=TRUE) 2 | 3 | asm_weekwise$Week <- NULL 4 | 5 | library(MASS, lib.loc="F:/souravda/lib/") 6 | library(tseries, lib.loc="F:/souravda/lib/") 7 | library(forecast, lib.loc="F:/souravda/lib/") 8 | 9 | #asm_weekwise[is.na(asm_weekwise)] <- 0 10 | #asm_weekwise[asm_weekwise <= 0] <- mean(as.matrix(asm_weekwise)) 11 | 12 | 13 | 14 | 15 | weekjoyforecastvalues <- data.frame( "asm" = integer(), "value" = integer(), stringsAsFactors=FALSE) 16 | 17 | for(i in 1:ncol(asm_weekwise)) 18 | { 19 | asmname<-names(asm_weekwise)[i] 20 | temparimadata<-asm_weekwise[,i] 21 | temparimadata[is.na(temparimadata)] <- 0 22 | temparimadata[temparimadata <=0] <- mean(as.matrix(temparimadata)) 23 | m <- mean(as.matrix(temparimadata)) 24 | #print(m) 25 | s <- sd(temparimadata) 26 | #print(s) 27 | temparimadata <- (temparimadata - m) 28 | temparimadata <- (temparimadata / s) 29 | temparima<-auto.arima(temparimadata, stationary = FALSE, seasonal = TRUE, allowdrift = TRUE, allowmean = FALSE, biasadj = FALSE) 30 | tempforecast<-forecast(temparima,h=12) 31 | #tempforecast <- (tempforecast * s) 32 | #print(tempforecast) 33 | temp_forecasted_data<-sum(data.frame(tempforecast$upper[,1])*s + m) 34 | weekjoyforecastvalues[nrow(weekjoyforecastvalues) + 1, ] <- c( asmname, temp_forecasted_data) 35 | } 36 | 37 | weekjoyforecastvalues$value<-as.integer(weekjoyforecastvalues$value) 38 | 39 | cat(weekjoyforecastvalues$value,sep="\n") 40 | 41 | (sum(weekjoyforecastvalues$value)- 103000000)/103000000 #53782605)/53782605 
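The R script code.R above standardizes each ASM series, fits an automatic ARIMA, forecasts 12 weeks ahead, and sums the rescaled upper forecast bound. For readers who prefer to stay in Python, a rough equivalent can be sketched with pmdarima's `auto_arima`; the CSV name and the 103000000 target are carried over from code.R, and pmdarima itself is an assumed extra dependency:

```python
# Rough Python analogue of code.R, assuming one column per ASM and weekly rows.
import pandas as pd
import pmdarima as pm  # assumed dependency; provides an auto_arima similar to R's forecast package

asm_weekwise = pd.read_csv('New ASM Weekwise.csv').drop(columns=['Week'])

forecasts = {}
for col in asm_weekwise.columns:
    y = asm_weekwise[col].fillna(0)
    y[y <= 0] = y.mean()                      # same missing/zero handling as the R loop
    m, s = y.mean(), y.std()
    z = (y - m) / s                           # standardise before fitting
    model = pm.auto_arima(z, seasonal=True, stationary=False)
    _, conf = model.predict(n_periods=12, return_conf_int=True)
    forecasts[col] = float((conf[:, 1] * s + m).sum())  # upper bound, rescaled and summed

total = sum(forecasts.values())
print((total - 103000000) / 103000000)
```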
-------------------------------------------------------------------------------- /customsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pprint, json, urllib2 5 | import nltk, sys, urllib 6 | from bs4 import BeautifulSoup 7 | import csv 8 | 9 | from googleapiclient.discovery import build 10 | 11 | def link_score(link): 12 | if ('cv' in link or 'resume' in link) and 'job' not in link: 13 | return True 14 | 15 | def process_file(): 16 | try: 17 | 18 | with open('data1.json','r') as fl: 19 | data = json.load(fl) 20 | all_links = [] 21 | # pprint.pprint(len(data['items'])) 22 | for item in data['items']: 23 | # print item['formattedUrl'] 24 | all_links.append(item['formattedUrl']) 25 | return all_links 26 | except: 27 | return [] 28 | 29 | def main(istart, search_query): 30 | service = build("customsearch", "v1", 31 | developerKey="AIzaSyApK0athSzeKSUa8vCNWZe2R1IygAv4bP4") 32 | 33 | res = service.cse().list( 34 | q= search_query, 35 | cx='007420266948142075924:dsrt3pl0cju', 36 | num=10, 37 | gl='in', #in for india comment this for whole web 38 | start = istart, 39 | ).execute() 40 | import json 41 | with open('data1.json', 'w') as fp: 42 | json.dump(res, fp) 43 | # pprint.pprint(type(res)) 44 | # pprint.pprint(res) 45 | 46 | def get_email_ph(link_text, pdf=None): 47 | if pdf==True: 48 | 49 | from textract import process 50 | text = process(link_text) 51 | else: 52 | text = link_text 53 | # print text 54 | import re 55 | email = [] 56 | ph = [] 57 | valid_ph = re.compile("[789][0-9]{9}$") 58 | valid = re.compile("[A-Za-z]+[@]{1}[A-Za-z]+\.[a-z]+") 59 | for token in re.split(r'[,\s]',text): 60 | # for token in nltk.tokenize(text): 61 | # print token 62 | a = valid.match(token) 63 | b = valid_ph.match(token) 64 | if a != None: 65 | print a.group() 66 | email.append(a.group()) 67 | if b != None: 68 | print b.group() 69 | ph.append(b.group()) 70 | return email, ph 71 | 72 | def process_pdf_link(link): 73 | html = urllib2.urlopen(link) 74 | file = open("document.pdf", 'w') 75 | file.write(html.read()) 76 | file.close() 77 | return get_email_ph("document.pdf", pdf=True) 78 | 79 | def process_doc_link(link): 80 | testfile = urllib.URLopener() 81 | testfile.retrieve(link, "document.doc") 82 | return get_email_ph("document.doc", pdf=False) 83 | 84 | def process_docx_link(link): 85 | testfile = urllib.URLopener() 86 | testfile.retrieve(link, "document.docx") 87 | return get_email_ph("document.docx", pdf=False) 88 | 89 | def process_links(all_links): 90 | with open('email_ph.csv', 'wb') as csvfile: 91 | spamwriter = csv.writer(csvfile, delimiter=',') 92 | 93 | for link in all_links: 94 | if link[:4] !='http': 95 | link = "http://"+link 96 | print link 97 | try: 98 | if link[-3:] == 'pdf': 99 | try: 100 | email, ph = process_pdf_link(link) 101 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 102 | except: 103 | print "error",link 104 | print sys.exc_info() 105 | elif link[-4:] == 'docx': 106 | try: 107 | email, ph = process_docx_link(link) 108 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 109 | except: 110 | print "error",link 111 | print sys.exc_info() 112 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 113 | elif link[-3:] == 'doc': 114 | try: 115 | email, ph = process_doc_link(link) 116 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 117 | except: 118 | print "error",link 119 | print sys.exc_info() 120 | spamwriter.writerow([link, ' 
'.join(email), ' '.join(ph)]) 121 | else: 122 | try: 123 | html = urllib2.urlopen(link) 124 | email, ph = get_email_ph(BeautifulSoup(html.read()).get_text(), pdf=False) 125 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 126 | except: 127 | print "error",link 128 | print sys.exc_info() 129 | spamwriter.writerow([link, ' '.join(email), ' '.join(ph)]) 130 | except: 131 | pass 132 | print "error",link 133 | print sys.exc_info() 134 | 135 | if __name__ == '__main__': 136 | 137 | # if len(sys.argv) <2 : 138 | # print "Error : please pass query words e.g. python customsearch.py java developer" 139 | # sys.exit() 140 | # else: 141 | # search_query = " ".join(sys.argv[1:]) 142 | 143 | # print search_query 144 | search_query = ' ASP .NET, C#, WebServices, HTML Chicago USA biodata cv' 145 | # 146 | # links = ['http://www.michaelminella.com/resume.html', 147 | # 'www.indeed.com/resumes/Java-J2EE-Developer', 148 | # 'www.slideshare.net/raghavanm/java-j2-eecvguide', 149 | # 'www.gcreddy.com/2013/10/java-3-years-resume.html', 150 | # 'www.naschenweng.info/cv/', 151 | # 'www.shinkarenko.org/cv/IlyaShinkarenkoCV.pdf', 152 | # 'stackoverflow.com/cv/anujpatel', 153 | # 'www.hrishikesh.karambelkar.co.in/resume-hrishikesh-karambelkar.doc', 154 | # 'www.oocities.org/rkbalgi/resume.pdf', 155 | # 'adam.kahtava.com/resume/curriculum-vitae/software-developer/'] 156 | # 157 | all_links = [] 158 | # all_links.extend(links) 159 | for i in range(1,90,10): 160 | main(i, search_query) 161 | all_links.extend(process_file()) 162 | 163 | process_links(all_links) 164 | # import csv 165 | # with open('email_ph.csv', 'wb') as csvfile: 166 | # spamwriter = csv.writer(csvfile, delimiter=',') 167 | # for i in range(1,90,10): 168 | # main(i) 169 | # all_links = process_file() 170 | # for link in all_links: 171 | # if link_score(link): 172 | # print link 173 | # if link[:4] !='http': 174 | # link = "http://"+link 175 | # if link[-3:] == 'pdf': 176 | # html = urllib2.urlopen(link) 177 | # file = open("document.pdf", 'w') 178 | # file.write(html.read()) 179 | # file.close() 180 | # print("Completed") 181 | # email, ph = get_email_ph("document.pdf", pdf=True) 182 | # spamwriter.writerow([link, email, ph]) 183 | # elif link[-3:] == 'doc': 184 | # try: 185 | # email=[] 186 | # ph = [] 187 | # html = urllib2.urlopen(link) 188 | # file = open("document.doc", 'w') 189 | # file.write(html.read()) 190 | # file.close() 191 | # print("Completed") 192 | # testfile = urllib.URLopener() 193 | # testfile.retrieve(link, "document.doc") 194 | # email, ph = get_email_ph("document.doc", pdf=True) 195 | # spamwriter.writerow([link, email, ph]) 196 | # except: 197 | # spamwriter.writerow([link, email, ph]) 198 | # else: 199 | # try: 200 | # email = [] 201 | # ph = [] 202 | # html = urllib2.urlopen(link) 203 | # # file = open("document.pdf", 'w') 204 | # # file.write(html.read()) 205 | # # file.close() 206 | # # print("Completed") 207 | # email, ph = get_email_ph(html.read(), pdf=False) 208 | # spamwriter.writerow([link, email, ph]) 209 | # except: 210 | # spamwriter.writerow([link, email, ph]) 211 | -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 
12 | 13 | *** -------------------------------------------------------------------------------- /knn_classifier.py: -------------------------------------------------------------------------------- 1 | # USAGE 2 | # python knn_classifier.py --dataset kaggle_dogs_vs_cats 3 | 4 | # import the necessary packages 5 | from sklearn.neighbors import KNeighborsClassifier 6 | from sklearn.cross_validation import train_test_split 7 | from imutils import paths 8 | import numpy as np 9 | import argparse 10 | import imutils 11 | import cv2 12 | import os 13 | 14 | def image_to_feature_vector(image, size=(32, 32)): 15 | # resize the image to a fixed size, then flatten the image into 16 | # a list of raw pixel intensities 17 | return cv2.resize(image, size).flatten() 18 | 19 | def extract_color_histogram(image, bins=(8, 8, 8)): 20 | # extract a 3D color histogram from the HSV color space using 21 | # the supplied number of `bins` per channel 22 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 23 | hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, 24 | [0, 180, 0, 256, 0, 256]) 25 | 26 | # handle normalizing the histogram if we are using OpenCV 2.4.X 27 | if imutils.is_cv2(): 28 | hist = cv2.normalize(hist) 29 | 30 | # otherwise, perform "in place" normalization in OpenCV 3 (I 31 | # personally hate the way this is done 32 | else: 33 | cv2.normalize(hist, hist) 34 | 35 | # return the flattened histogram as the feature vector 36 | return hist.flatten() 37 | 38 | # construct the argument parse and parse the arguments 39 | ap = argparse.ArgumentParser() 40 | ap.add_argument("-d", "--dataset", required=True, 41 | help="path to input dataset") 42 | ap.add_argument("-k", "--neighbors", type=int, default=1, 43 | help="# of nearest neighbors for classification") 44 | ap.add_argument("-j", "--jobs", type=int, default=-1, 45 | help="# of jobs for k-NN distance (-1 uses all available cores)") 46 | args = vars(ap.parse_args()) 47 | 48 | # grab the list of images that we'll be describing 49 | print("[INFO] describing images...") 50 | imagePaths = list(paths.list_images(args["dataset"])) 51 | 52 | # initialize the raw pixel intensities matrix, the features matrix, 53 | # and labels list 54 | rawImages = [] 55 | features = [] 56 | labels = [] 57 | 58 | # loop over the input images 59 | for (i, imagePath) in enumerate(imagePaths): 60 | # load the image and extract the class label (assuming that our 61 | # path as the format: /path/to/dataset/{class}.{image_num}.jpg 62 | image = cv2.imread(imagePath) 63 | label = imagePath.split(os.path.sep)[-1].split(".")[0] 64 | 65 | # extract raw pixel intensity "features", followed by a color 66 | # histogram to characterize the color distribution of the pixels 67 | # in the image 68 | pixels = image_to_feature_vector(image) 69 | hist = extract_color_histogram(image) 70 | 71 | # update the raw images, features, and labels matricies, 72 | # respectively 73 | rawImages.append(pixels) 74 | features.append(hist) 75 | labels.append(label) 76 | 77 | # show an update every 1,000 images 78 | if i > 0 and i % 1000 == 0: 79 | print("[INFO] processed {}/{}".format(i, len(imagePaths))) 80 | 81 | # show some information on the memory consumed by the raw images 82 | # matrix and features matrix 83 | rawImages = np.array(rawImages) 84 | features = np.array(features) 85 | labels = np.array(labels) 86 | print("[INFO] pixels matrix: {:.2f}MB".format( 87 | rawImages.nbytes / (1024 * 1000.0))) 88 | print("[INFO] features matrix: {:.2f}MB".format( 89 | features.nbytes / (1024 * 1000.0))) 90 | 91 | # partition 
the data into training and testing splits, using 75% 92 | # of the data for training and the remaining 25% for testing 93 | (trainRI, testRI, trainRL, testRL) = train_test_split( 94 | rawImages, labels, test_size=0.25, random_state=42) 95 | (trainFeat, testFeat, trainLabels, testLabels) = train_test_split( 96 | features, labels, test_size=0.25, random_state=42) 97 | 98 | # train and evaluate a k-NN classifer on the raw pixel intensities 99 | print("[INFO] evaluating raw pixel accuracy...") 100 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], 101 | n_jobs=args["jobs"]) 102 | model.fit(trainRI, trainRL) 103 | acc = model.score(testRI, testRL) 104 | print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100)) 105 | 106 | # train and evaluate a k-NN classifer on the histogram 107 | # representations 108 | print("[INFO] evaluating histogram accuracy...") 109 | model = KNeighborsClassifier(n_neighbors=args["neighbors"], 110 | n_jobs=args["jobs"]) 111 | model.fit(trainFeat, trainLabels) 112 | acc = model.score(testFeat, testLabels) 113 | print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100))s -------------------------------------------------------------------------------- /least square estimation linear regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import statsmodels.api as sm 4 | 5 | df = pd.read_csv('longley.csv', index_col=0) 6 | #print df 7 | b = df.Employed 8 | A = df.GNP 9 | A = sm.add_constant(A) 10 | 11 | est = sm.OLS(b, A) 12 | est = est.fit() 13 | print est.summary() 14 | -------------------------------------------------------------------------------- /log_reg.py: -------------------------------------------------------------------------------- 1 | from numpy import loadtxt, where, zeros, e, array, log, ones, mean, where 2 | from pylab import scatter, show, legend, xlabel, ylabel, plot 3 | from scipy.optimize import fmin_bfgs 4 | 5 | 6 | def sigmoid(X): 7 | '''Compute the sigmoid function ''' 8 | #d = zeros(shape=(X.shape)) 9 | 10 | den = 1.0 + e ** (-1.0 * X) 11 | 12 | d = 1.0 / den 13 | 14 | return d 15 | 16 | 17 | def compute_cost(theta, X, y): 18 | ''' 19 | Comput cost for logistic regression 20 | ''' 21 | #Number of training samples 22 | 23 | theta.shape = (1, 3) 24 | 25 | m = y.size 26 | 27 | h = sigmoid(X.dot(theta.T)) 28 | 29 | J = (1.0 / m) * ((-y.T.dot(log(h))) - ((1.0 - y.T).dot(log(1.0 - h)))) 30 | 31 | return - 1 * J.sum() 32 | 33 | 34 | def compute_grad(theta, X, y): 35 | 36 | #print theta.shape 37 | 38 | theta.shape = (1, 3) 39 | 40 | grad = zeros(3) 41 | 42 | h = sigmoid(X.dot(theta.T)) 43 | 44 | delta = h - y 45 | 46 | l = grad.size 47 | 48 | for i in range(l): 49 | sumdelta = delta.T.dot(X[:, i]) 50 | grad[i] = (1.0 / m) * sumdelta * - 1 51 | 52 | theta.shape = (3,) 53 | 54 | return grad 55 | 56 | 57 | #load the dataset 58 | data = loadtxt('ex2data1.txt', delimiter=',') 59 | 60 | X = data[:, 0:2] 61 | y = data[:, 2] 62 | 63 | pos = where(y == 1) 64 | neg = where(y == 0) 65 | scatter(X[pos, 0], X[pos, 1], marker='o', c='b') 66 | scatter(X[neg, 0], X[neg, 1], marker='x', c='r') 67 | xlabel('Exam 1 score') 68 | ylabel('Exam 2 score') 69 | legend(['Not Admitted', 'Admitted']) 70 | #show() 71 | 72 | m, n = X.shape 73 | 74 | y.shape = (m, 1) 75 | 76 | #Add intercept term to x and X_test 77 | it = ones(shape=(m, 3)) 78 | it[:, 1:3] = X 79 | 80 | 81 | def decorated_cost(it, y): 82 | def f(theta): 83 | return compute_cost(theta, it, y) 84 | 85 | def 
fprime(theta): 86 | return compute_grad(theta, it, y) 87 | 88 | #Initialize theta parameters 89 | theta = zeros(3) 90 | 91 | return fmin_bfgs(f, theta, fprime, disp=True, maxiter=400) 92 | 93 | decorated_cost(it, y) 94 | theta = [-25.161272, 0.206233, 0.201470] 95 | 96 | 97 | #Plotting the decision boundary 98 | plot_x = array([min(it[:, 1]) - 2, max(it[:, 2]) + 2]) 99 | plot_y = (- 1.0 / theta[2]) * (theta[1] * plot_x + theta[0]) 100 | plot(plot_x, plot_y) 101 | legend(['Decision Boundary', 'Not admitted', 'Admitted']) 102 | #show() 103 | 104 | prob = sigmoid(array([1.0, 45.0, 85.0]).dot(array(theta).T)) 105 | print 'For a student with scores 45 and 85, we predict and admission ' + \ 106 | 'probability of %f' % prob 107 | 108 | 109 | def predict(theta, X): 110 | '''Predict whether the label 111 | is 0 or 1 using learned logistic 112 | regression parameters ''' 113 | m, n = X.shape 114 | p = zeros(shape=(m, 1)) 115 | 116 | h = sigmoid(X.dot(theta.T)) 117 | 118 | for it in range(0, h.shape[0]): 119 | if h[it] > 0.5: 120 | p[it, 0] = 1 121 | else: 122 | p[it, 0] = 0 123 | 124 | return p 125 | 126 | #Compute accuracy on our training set 127 | p = predict(array(theta), it) 128 | print 'Train Accuracy: %f' % ((y[where(p == y)].size / float(y.size)) * 100.0) 129 | -------------------------------------------------------------------------------- /log_reg_regular.py: -------------------------------------------------------------------------------- 1 | from numpy import loadtxt, where, zeros, e, array, log, ones, append, linspace 2 | from pylab import scatter, show, legend, xlabel, ylabel, contour, title 3 | from scipy.optimize import fmin_bfgs 4 | 5 | 6 | def sigmoid(X): 7 | '''Compute the sigmoid function ''' 8 | #d = zeros(shape=(X.shape)) 9 | 10 | den = 1.0 + e ** (-1.0 * X) 11 | 12 | d = 1.0 / den 13 | 14 | return d 15 | 16 | 17 | def cost_function_reg(theta, X, y, l): 18 | '''Compute the cost and partial derivatives as grads 19 | ''' 20 | 21 | h = sigmoid(X.dot(theta)) 22 | 23 | thetaR = theta[1:, 0] 24 | 25 | J = (1.0 / m) * ((-y.T.dot(log(h))) - ((1 - y.T).dot(log(1.0 - h)))) \ 26 | + (l / (2.0 * m)) * (thetaR.T.dot(thetaR)) 27 | 28 | delta = h - y 29 | sumdelta = delta.T.dot(X[:, 1]) 30 | grad1 = (1.0 / m) * sumdelta 31 | 32 | XR = X[:, 1:X.shape[1]] 33 | sumdelta = delta.T.dot(XR) 34 | 35 | grad = (1.0 / m) * (sumdelta + l * thetaR) 36 | 37 | out = zeros(shape=(grad.shape[0], grad.shape[1] + 1)) 38 | 39 | out[:, 0] = grad1 40 | out[:, 1:] = grad 41 | 42 | return J.flatten(), out.T.flatten() 43 | 44 | 45 | def map_feature(x1, x2): 46 | ''' 47 | Maps the two input features to quadratic features. 48 | 49 | Returns a new feature array with more features, comprising of 50 | X1, X2, X1 ** 2, X2 ** 2, X1*X2, X1*X2 ** 2, etc... 
51 | 52 | Inputs X1, X2 must be the same size 53 | ''' 54 | x1.shape = (x1.size, 1) 55 | x2.shape = (x2.size, 1) 56 | degree = 6 57 | out = ones(shape=(x1[:, 0].size, 1)) 58 | 59 | m, n = out.shape 60 | 61 | for i in range(1, degree + 1): 62 | for j in range(i + 1): 63 | r = (x1 ** (i - j)) * (x2 ** j) 64 | out = append(out, r, axis=1) 65 | 66 | return out 67 | 68 | #load the dataset 69 | data = loadtxt('ex2data2.txt', delimiter=',') 70 | 71 | X = data[:, 0:2] 72 | y = data[:, 2] 73 | 74 | pos = where(y == 1) 75 | neg = where(y == 0) 76 | scatter(X[pos, 0], X[pos, 1], marker='o', c='b') 77 | scatter(X[neg, 0], X[neg, 1], marker='x', c='r') 78 | xlabel('Microchip Test 1') 79 | ylabel('Microchip Test 2') 80 | legend(['y = 1', 'y = 0']) 81 | #show() 82 | 83 | m, n = X.shape 84 | 85 | y.shape = (m, 1) 86 | 87 | it = map_feature(X[:, 0], X[:, 1]) 88 | 89 | #Initialize theta parameters 90 | initial_theta = zeros(shape=(it.shape[1], 1)) 91 | 92 | #Set regularization parameter lambda to 1 93 | l = 1 94 | 95 | # Compute and display initial cost and gradient for regularized logistic 96 | # regression 97 | cost, grad = cost_function_reg(initial_theta, it, y, l) 98 | 99 | def decorated_cost(theta): 100 | return cost_function_reg(theta, it, y, l) 101 | 102 | print fmin_bfgs(decorated_cost, initial_theta, maxfun=400) 103 | 104 | 105 | 106 | 107 | theta = [ 108 | 1.273005, 109 | 0.624876, 110 | 1.177376, 111 | -2.020142, 112 | -0.912616, 113 | -1.429907, 114 | 0.125668, 115 | -0.368551, 116 | -0.360033, 117 | -0.171068, 118 | -1.460894, 119 | -0.052499, 120 | -0.618889, 121 | -0.273745, 122 | -1.192301, 123 | -0.240993, 124 | -0.207934, 125 | -0.047224, 126 | -0.278327, 127 | -0.296602, 128 | -0.453957, 129 | -1.045511, 130 | 0.026463, 131 | -0.294330, 132 | 0.014381, 133 | -0.328703, 134 | -0.143796, 135 | -0.924883, 136 | ] 137 | 138 | #Plot Boundary 139 | u = linspace(-1, 1.5, 50) 140 | v = linspace(-1, 1.5, 50) 141 | z = zeros(shape=(len(u), len(v))) 142 | for i in range(len(u)): 143 | for j in range(len(v)): 144 | z[i, j] = (map_feature(array(u[i]), array(v[j])).dot(array(theta))) 145 | 146 | z = z.T 147 | contour(u, v, z) 148 | title('lambda = %f' % l) 149 | xlabel('Microchip Test 1') 150 | ylabel('Microchip Test 2') 151 | legend(['y = 1', 'y = 0', 'Decision boundary']) 152 | show() 153 | 154 | 155 | def predict(theta, X): 156 | '''Predict whether the label 157 | is 0 or 1 using learned logistic 158 | regression parameters ''' 159 | m, n = X.shape 160 | p = zeros(shape=(m, 1)) 161 | 162 | h = sigmoid(X.dot(theta.T)) 163 | 164 | for it in range(0, h.shape[0]): 165 | if h[it] > 0.5: 166 | p[it, 0] = 1 167 | else: 168 | p[it, 0] = 0 169 | 170 | return p 171 | 172 | 173 | #% Compute accuracy on our training set 174 | p = predict(array(theta), it) 175 | print 'Train Accuracy: %f' % ((y[where(p == y)].size / float(y.size)) * 100.0) 176 | -------------------------------------------------------------------------------- /moving average.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def running_mean(l, N): 4 | # Also works for the(strictly invalid) cases when N is even. 
5 | if (N//2)*2 == N: 6 | N = N - 1 7 | front = np.zeros(N//2) 8 | back = np.zeros(N//2) 9 | 10 | for i in range(1, (N//2)*2, 2): 11 | front[i//2] = np.convolve(l[:i], np.ones((i,))/i, mode = 'valid') 12 | for i in range(1, (N//2)*2, 2): 13 | back[i//2] = np.convolve(l[-i:], np.ones((i,))/i, mode = 'valid') 14 | return np.concatenate([front, np.convolve(l, np.ones((N,))/N, mode = 'valid'), back[::-1]]) 15 | 16 | 17 | print running_mean(2,21) -------------------------------------------------------------------------------- /rnn_keras_timeseries_stock.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Aug 1, 2017 3 | 4 | ''' 5 | #import matplotlib.pyplot as plt 6 | import numpy as np 7 | import time 8 | import csv 9 | from keras.layers.core import Dense, Activation, Dropout 10 | from keras.layers.recurrent import LSTM 11 | from keras.models import Sequential 12 | import sys 13 | np.random.seed(1234) 14 | 15 | 16 | def read_data(path_to_dataset, 17 | sequence_length=50, 18 | ratio=1.0): 19 | 20 | max_values = ratio * 2049280 21 | 22 | with open(path_to_dataset) as f: 23 | data = csv.reader(f, delimiter=",") 24 | power = [] 25 | nb_of_values = 0 26 | for line in data: 27 | #print(line) 28 | #if nb_of_values == 3500: 29 | # break 30 | try: 31 | power.append(float(line[1])) 32 | nb_of_values += 1 33 | except ValueError: 34 | pass 35 | # 2049280.0 is the total number of valid values, i.e. ratio = 1.0 36 | if nb_of_values >= max_values: 37 | break 38 | return power 39 | 40 | def convert_to_categorical_increasing(current, future): 41 | change = (future-current)*100/current 42 | if change > 0.005: 43 | return 1 44 | else: 45 | return 0 46 | 47 | def convert_to_categorical_decreasing(current, future): 48 | change = (future-current)*100/current 49 | if change < 0.005: 50 | return 0 51 | else: 52 | return 1 53 | 54 | def create_matrix(y_train): 55 | y = [[0 for i in xrange(2)] for j in xrange(len(y_train))] 56 | for i in range(len(y_train)): 57 | #print y_train[i] 58 | y[i][y_train[i]] = 1 59 | return y 60 | 61 | 62 | def process_data(power, sequence_length, ratio, increasing, error): 63 | #print("Data loaded from csv. 
Formatting...") 64 | #fig = plt.figure() 65 | #plt.plot(power) 66 | #plt.show() 67 | result = [] 68 | if not error: 69 | for i in range(len(power)-1): 70 | if increasing: 71 | power[i] = convert_to_categorical_increasing(power[i], power[i+1]) 72 | else: 73 | power[i] = convert_to_categorical_decreasing(power[i], power[i+1]) 74 | for index in range(len(power) - sequence_length-1): 75 | result.append(power[index: index + sequence_length]) 76 | result = np.array(result) # shape (2049230, 50) 77 | 78 | #result = np.log(result+1) 79 | #print result 80 | #exit(0) 81 | # print ("Shift : ", result_mean) 82 | #print ("Data : ", result.shape) 83 | 84 | row = int(round(0.9 * result.shape[0])) 85 | #:print row 86 | train = result[:row, :] 87 | np.random.shuffle(train) 88 | X_train = train[:, :-1] 89 | y_train_temp = train[:, -1] 90 | #print y_train_temp 91 | y_train = create_matrix(y_train_temp) 92 | X_test = result[row:, :-1] 93 | y_test = result[row:, -1] 94 | 95 | X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) 96 | X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) 97 | 98 | return [X_train, y_train, X_test, y_test] 99 | 100 | 101 | def build_model(): 102 | model = Sequential() 103 | layers = [1, 100, 50, 2] 104 | 105 | model.add(LSTM( 106 | layers[1], 107 | input_shape=(None, layers[0]), 108 | return_sequences=True)) 109 | model.add(Dropout(0.2)) 110 | 111 | model.add(LSTM( 112 | layers[2], 113 | return_sequences=False)) 114 | model.add(Dropout(0.2)) 115 | 116 | model.add(Dense( 117 | layers[3])) 118 | model.add(Activation('softmax')) 119 | 120 | start = time.time() 121 | model.compile(loss="categorical_crossentropy", optimizer="adam") 122 | #print ("Compilation Time : ", time.time() - start) 123 | return model 124 | 125 | 126 | def run_network(data=None, increasing=False, error=False): 127 | global_start_time = time.time() 128 | epochs = 2 129 | ratio = 0.5 130 | sequence_length = 100 131 | 132 | X_train, y_train, X_test, y_test = process_data( 133 | data, sequence_length, ratio,increasing, error) 134 | 135 | #print ('\nData Loaded. 
Compiling...\n') 136 | 137 | model = build_model() 138 | 139 | try: 140 | model.fit( 141 | X_train, y_train, 142 | batch_size=512, nb_epoch=epochs, validation_split=0.05, verbose=0) 143 | predicted = model.predict(X_test) 144 | #predicted = np.reshape(predicted, (predicted.size,)) 145 | except KeyboardInterrupt: 146 | #:print ('Training duration (s) : ', time.time() - global_start_time) 147 | return model, y_test, 0 148 | 149 | try: 150 | fig = plt.figure() 151 | ax = fig.add_subplot(111) 152 | ax.plot(y_test[:100]*result_max) 153 | plt.plot(predicted[:100]*result_max) 154 | plt.show() 155 | except Exception as e: 156 | pass 157 | #print (str(e)) 158 | #print ('Training duration (s) : ', time.time() - global_start_time) 159 | 160 | return y_test, predicted 161 | 162 | 163 | if __name__ == '__main__': 164 | path_to_dataset = 'data/CLSB Comdty.csv'#'data/20170818/CLSB Comdty.csv' 165 | data = read_data(path_to_dataset) 166 | #print len(data) 167 | success = 0 168 | fail = 0 169 | success1 = 0 170 | fail1 = 0 171 | false_low = 0 172 | false_high = 0 173 | error = [] 174 | err_predicted = {} 175 | mean_std_inc = 0 176 | mean_std_dec = 0 177 | mean_std_err = 0 178 | e = False 179 | count = 0 180 | count_bad = 0 181 | for i in range(0,len(data)-1000,89): 182 | count = count + 1 183 | if count_bad >=2: 184 | count_bad = 0 185 | K = 1 186 | continue 187 | #success = 0 188 | #fail = 0 189 | #false_low = 0 190 | #false_high = 0 191 | d1 = data[i:i+1001] 192 | d2 = data[i:i+1001] 193 | y_test_increasing, predicted_increasing = run_network(d1, True, False) 194 | y_test_decreasing, predicted_decreasing = run_network(d2, False, False) 195 | if count > 11 and len(error) >= 1000: 196 | err_test, err_predicted = run_network(error, True, True) 197 | #print "error predicted" 198 | prob_increasing = predicted_increasing[:,1] 199 | increasing_mean = prob_increasing.mean() 200 | increasing_std = prob_increasing.std() 201 | prob_decreasing = predicted_decreasing[:,0] 202 | decreasing_mean = prob_decreasing.mean() 203 | decreasing_std = prob_decreasing.std() 204 | if i > 0: 205 | mean_std_inc = (mean_std_inc + increasing_std)/2 206 | mean_std_dec = (mean_std_dec + decreasing_std)/2 207 | else: 208 | mean_std_inc = increasing_std 209 | mean_std_dec = decreasing_std 210 | 211 | y_test_decreasing -= 1 212 | prob_err = [] 213 | prob_err_mean = 0 214 | prob_err_std = 0 215 | if count > 11 and len(error) >= 1000: 216 | prob_err = err_predicted[:,0] 217 | prob_err_mean = prob_err.mean() 218 | prob_err_std = prob_err.std() 219 | error = error[90:] 220 | #print "mean calculated" 221 | #print prob_err 222 | #if i > 11 and len(error_increasing) >= 1000 and len(err_predicted_increasing)>=90: 223 | if True: 224 | mean_std_err = 0 225 | K = 1 226 | K1 = 1 227 | if success != 0: 228 | acc_with = success*100/(success + fail) 229 | #acc_wo = success1*100/(success1 + fail1) 230 | print acc_with 231 | if acc_with < 58 and success != 0: 232 | K = K/2 233 | count_bad = count_bad + 1 234 | #mean_std_inc = 0 235 | #mean_std_dec = 0 236 | if K1 == 1: 237 | K1 = 1.5 238 | else: 239 | K1 = 1 240 | if acc_with > 58: 241 | K = 1/2 242 | #mean_std_err = 0 243 | if mean_std_err == 0: 244 | mean_std_err = prob_err_std 245 | else: 246 | mean_std_err = (prob_err_std + mean_std_err)/2 247 | for j in range(len(y_test_decreasing)-1): 248 | ac_status = y_test_increasing[j] + y_test_decreasing[j] 249 | pr_status = 0 250 | if True: 251 | inc = (prob_increasing[j] - increasing_mean + K1*mean_std_inc) 252 | dec = (prob_decreasing[j] - decreasing_mean + 
K1*mean_std_dec) 253 | #print inc,dec 254 | if inc > 0 or dec > 0: 255 | if inc > dec: 256 | pr_status = 1 257 | else: 258 | pr_status = -1 259 | else: 260 | pr_status = 0 261 | if ac_status != pr_status: 262 | error.append(0) 263 | else: 264 | error.append(1) 265 | if ac_status != 0: 266 | if inc > 0 and dec > 0: 267 | continue 268 | if count >= 10 and len(error) >= 1000 and len(err_predicted)>=90: 269 | if ac_status == pr_status: 270 | success1 = success1 + 1 271 | else: 272 | fail1 = fail1 + 1 273 | if (prob_err[j] - prob_err_mean) > K*mean_std_err: 274 | pr_status = -1*pr_status 275 | else: 276 | if ac_status == pr_status: 277 | success = success + 1 278 | else: 279 | fail = fail + 1 280 | #else: 281 | # success = success1 282 | # fail = fail1 283 | #print ac_status,',',pr_status, ',',prob_increasing[j],',',prob_decreasing[j] 284 | print success,',',fail,',',count #success1,',',fail1,',', count #,',',false_high,',',false_low #,',',increasing_std,',',decreasing_std,',',increasing_mean,',',decreasing_mean 285 | -------------------------------------------------------------------------------- /se_test1.py: -------------------------------------------------------------------------------- 1 | # To install the Python client library: 2 | # pip install -U selenium 3 | 4 | # Import the Selenium 2 namespace (aka "webdriver") 5 | from selenium import webdriver 6 | 7 | # Google Chrome 8 | driver = webdriver.Chrome('C:\Users\User\Downloads\chromedriver_win32\chromedriver.exe') 9 | 10 | # ------------------------------ 11 | # The actual test scenario: Test the codepad.org code execution service. 12 | 13 | # Go to codepad.org 14 | driver.get('http://codepad.org') 15 | 16 | # Select the Python language option 17 | python_link = driver.find_elements_by_xpath("//input[@name='lang' and @value='Python']")[0] 18 | python_link.click() 19 | 20 | # Enter some text! 21 | text_area = driver.find_element_by_id('textarea') 22 | text_area.send_keys("print 'Good,' + ' Morning!'") 23 | 24 | # Submit the form! 25 | submit_switch = driver.find_element_by_name('submit') 26 | submit_switch.click() 27 | 28 | # Make this an actual test. Isn't Python beautiful? 29 | assert "Good, Morning!" in driver.get_page_source() 30 | 31 | # Close the browser! 
32 | driver.quit() -------------------------------------------------------------------------------- /src/MainBDAS.java: -------------------------------------------------------------------------------- 1 | 2 | import java.io.IOException; 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 16 | 17 | import java.io.File; 18 | 19 | import javax.xml.parsers.DocumentBuilder; 20 | import javax.xml.parsers.DocumentBuilderFactory; 21 | 22 | import org.w3c.dom.Document; 23 | 24 | 25 | /** 26 | * 27 | */ 28 | 29 | /** 30 | * @author SayanM 31 | * 32 | */ 33 | public class MainBDAS { 34 | 35 | public static class MapperBDAS extends Mapper { 36 | 37 | protected void map(LongWritable key, Text value, Context context) 38 | throws IOException, InterruptedException { 39 | String classname = context.getConfiguration().get("classname"); 40 | 41 | try { 42 | RootBDAS instance = (RootBDAS) Class.forName(classname).getConstructor().newInstance(); 43 | String line = value.toString(); 44 | HashMap> result = instance.mapper_task(line); 45 | for(String k : result.keySet()) 46 | { 47 | for(String v : result.get(k)) 48 | { 49 | context.write(new Text(k), new Text(v)); 50 | } 51 | } 52 | } catch (Exception e) { 53 | // TODO Auto-generated catch block 54 | e.printStackTrace(); 55 | } 56 | 57 | 58 | } 59 | 60 | } 61 | 62 | public static class ReducerBDAS extends Reducer { 63 | 64 | protected void reduce(Text key, Iterable values, 65 | Context context) throws IOException, InterruptedException { 66 | String classname = context.getConfiguration().get("classname"); 67 | 68 | try { 69 | RootBDAS instance = (RootBDAS) Class.forName(classname).getConstructor().newInstance(); 70 | ArrayList vals = new ArrayList(); 71 | for(Text v : values) 72 | { 73 | vals.add(v.toString()); 74 | } 75 | HashMap> result = instance.reducer_task(key.toString(), vals); 76 | for(String k : result.keySet()) 77 | { 78 | for(String v : result.get(k)) 79 | { 80 | context.write(new Text(k), new Text(v)); 81 | } 82 | } 83 | } catch (Exception e) { 84 | // TODO Auto-generated catch block 85 | e.printStackTrace(); 86 | } 87 | 88 | } 89 | 90 | } 91 | 92 | public static void main(String[] args) throws Exception { 93 | // TODO Auto-generated method stub 94 | 95 | Job job = new Job(); 96 | 97 | job.setJarByClass(MainBDAS.class); 98 | job.setJobName("MapReduceBDAS"); 99 | 100 | job.setOutputKeyClass(Text.class); 101 | job.setOutputValueClass(Text.class); 102 | 103 | job.setInputFormatClass(TextInputFormat.class); 104 | job.setOutputFormatClass(TextOutputFormat.class); 105 | 106 | 107 | FileInputFormat.setInputPaths(job, new Path(args[0])); 108 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 109 | job.setMapperClass(MapperBDAS.class); 110 | job.setReducerClass(ReducerBDAS.class); 111 | 112 | File file = new File("Config.xml"); 113 | DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); 114 | DocumentBuilder db = dbf.newDocumentBuilder(); 115 | Document doc = db.parse(file); 116 | 
doc.getDocumentElement().normalize(); 117 | 118 | String classname = Utility.getClassName(doc); 119 | 120 | job.getConfiguration().set("classname", classname); 121 | 122 | System.out.println(job.waitForCompletion(true)); 123 | 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/RootBDAS.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | public abstract class RootBDAS { 13 | abstract HashMap> mapper_task(String line); 14 | abstract HashMap> reducer_task(String key, ArrayList values); 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/Utility.java: -------------------------------------------------------------------------------- 1 | import org.w3c.dom.Document; 2 | import org.w3c.dom.NodeList; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | public class Utility { 13 | 14 | public static String getClassName(Document doc) 15 | { 16 | NodeList nodeLst = doc.getElementsByTagName("ClassName"); 17 | return nodeLst.item(0).getNodeValue(); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/WordCounterBDAS.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | 4 | /** 5 | * 6 | */ 7 | 8 | /** 9 | * @author SayanM 10 | * 11 | */ 12 | 13 | 14 | public final class WordCounterBDAS extends RootBDAS{ 15 | 16 | @Override 17 | HashMap> mapper_task(String line) { 18 | // TODO Auto-generated method stub 19 | String[] words = line.split(" "); 20 | HashMap> result = new HashMap>(); 21 | for(String w : words) 22 | { 23 | if(result.containsKey(w)) 24 | { 25 | ArrayList vals = result.get(w); 26 | vals.add("1"); 27 | result.put(w, vals); 28 | } 29 | else 30 | { 31 | ArrayList vals = new ArrayList(); 32 | vals.add("1"); 33 | result.put(w, vals); 34 | } 35 | } 36 | return result; 37 | } 38 | 39 | @Override 40 | HashMap> reducer_task(String key, ArrayList values) { 41 | // TODO Auto-generated method stub 42 | HashMap> result = new HashMap>(); 43 | ArrayList tempres = new ArrayList(); 44 | tempres.add(values.size()+ ""); 45 | result.put(key, tempres); 46 | return result; 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/testBDAS.java: -------------------------------------------------------------------------------- 1 | import static org.junit.Assert.*; 2 | 3 | import org.junit.After; 4 | import org.junit.AfterClass; 5 | import org.junit.Before; 6 | import org.junit.BeforeClass; 7 | import org.junit.Test; 8 | 9 | 10 | public class testBDAS { 11 | 12 | public void testMapper(){ 13 | 14 | } 15 | 16 | public void testReducer(){ 17 | 18 | } 19 | 20 | } 21 | --------------------------------------------------------------------------------
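The Java sources under src/ form a small plug-in harness: MainBDAS reads a ClassName element from Config.xml, instantiates the named RootBDAS subclass (for example WordCounterBDAS), and runs it as the Hadoop mapper and reducer; testBDAS is left as empty stubs. The same word count can also be run without compiling Java by using Hadoop Streaming with two short Python scripts. The sketch below is illustrative only — the file names mapper.py and reducer.py and the streaming invocation are not part of the repository:

```python
#!/usr/bin/env python
# mapper.py - emit (word, 1) for every word on stdin, mirroring WordCounterBDAS.mapper_task
import sys

for line in sys.stdin:
    for word in line.split():
        print('%s\t%d' % (word, 1))
```

```python
#!/usr/bin/env python
# reducer.py - sum the counts per word, mirroring WordCounterBDAS.reducer_task
# (Hadoop Streaming hands the mapper output to the reducer sorted by key)
import sys

current_word, count = None, 0
for line in sys.stdin:
    word, value = line.rstrip('\n').split('\t', 1)
    if word != current_word:
        if current_word is not None:
            print('%s\t%d' % (current_word, count))
        current_word, count = word, 0
    count += int(value)
if current_word is not None:
    print('%s\t%d' % (current_word, count))
```

A typical invocation (the streaming jar path varies by Hadoop distribution) is `hadoop jar hadoop-streaming.jar -input <in> -output <out> -mapper mapper.py -reducer reducer.py -file mapper.py -file reducer.py`.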