9 |
10 | #### #2 Press Command (⌘) + S to save the snippet for future use.
11 |
12 | Sidenote: Right click on the snippet title to rename it to whatever you would like.
13 |
14 | #### #3 To use your new image snippet, run a Google Image Search and scroll down until the page has loaded as many images as you would like to save.
15 |
16 | #### #4 Navigate to Chrome DevTools > Sources > Snippets, then right click your saved snippet and click 'Run'.
17 |
18 | #### #5 A .txt file of all the image URLs will automatically download.
19 |
20 |
21 |
22 |
23 | # Download Images From URLs
24 |
25 | ## Use Brew wget
26 |
27 | #### #1 Install Homebrew
28 |
29 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
30 |
31 | #### #2 Install wget
32 |
33 | brew install wget
34 |
35 | #### #3 Add your urls.txt to a new folder, then navigate to that folder in your terminal (using the 'ls' and 'cd' commands). For a refresher, see https://wsvincent.com/terminal-command-line-for-beginners/
36 |
37 | #### #4 Run 'wget -i urls.txt' in your terminal (substituting whatever name your URLs file has).
38 |
39 | #### #5 All images will be downloaded into the folder.
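If you'd rather not install Homebrew and wget, a rough Python alternative does the same job. This is a minimal sketch, assuming a urls.txt file (one image URL per line) sits in the folder you run it from; the `images/` output folder name is just an example.

```python
# download_images.py - hypothetical stand-in for `wget -i urls.txt`
import os
from urllib.request import urlretrieve

with open("urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

os.makedirs("images", exist_ok=True)
for i, url in enumerate(urls):
    # fall back to a generated name when the URL has no usable filename
    name = os.path.basename(url.split("?")[0]) or "image_{}.jpg".format(i)
    try:
        urlretrieve(url, os.path.join("images", name))
    except Exception as e:
        print("Skipping {}: {}".format(url, e))  # some hosts block scripted downloads
```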
40 |
41 |
42 |
43 |
44 | # WOOOOHOOOOOOOO!!!! Training Data For AAALLLLL
45 |
--------------------------------------------------------------------------------
/NLP/contractions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Aug 01 01:11:02 2016
4 |
5 | @author: DIP
6 | """
7 |
8 | CONTRACTION_MAP = {
9 | "ain't": "is not",
10 | "aren't": "are not",
11 | "can't": "cannot",
12 | "can't've": "cannot have",
13 | "'cause": "because",
14 | "could've": "could have",
15 | "couldn't": "could not",
16 | "couldn't've": "could not have",
17 | "didn't": "did not",
18 | "doesn't": "does not",
19 | "don't": "do not",
20 | "hadn't": "had not",
21 | "hadn't've": "had not have",
22 | "hasn't": "has not",
23 | "haven't": "have not",
24 | "he'd": "he would",
25 | "he'd've": "he would have",
26 | "he'll": "he will",
27 |     "he'll've": "he will have",
28 | "he's": "he is",
29 | "how'd": "how did",
30 | "how'd'y": "how do you",
31 | "how'll": "how will",
32 | "how's": "how is",
33 | "I'd": "I would",
34 | "I'd've": "I would have",
35 | "I'll": "I will",
36 | "I'll've": "I will have",
37 | "I'm": "I am",
38 | "I've": "I have",
39 | "i'd": "i would",
40 | "i'd've": "i would have",
41 | "i'll": "i will",
42 | "i'll've": "i will have",
43 | "i'm": "i am",
44 | "i've": "i have",
45 | "isn't": "is not",
46 | "it'd": "it would",
47 | "it'd've": "it would have",
48 | "it'll": "it will",
49 | "it'll've": "it will have",
50 | "it's": "it is",
51 | "let's": "let us",
52 | "ma'am": "madam",
53 | "mayn't": "may not",
54 | "might've": "might have",
55 | "mightn't": "might not",
56 | "mightn't've": "might not have",
57 | "must've": "must have",
58 | "mustn't": "must not",
59 | "mustn't've": "must not have",
60 | "needn't": "need not",
61 | "needn't've": "need not have",
62 | "o'clock": "of the clock",
63 | "oughtn't": "ought not",
64 | "oughtn't've": "ought not have",
65 | "shan't": "shall not",
66 | "sha'n't": "shall not",
67 | "shan't've": "shall not have",
68 | "she'd": "she would",
69 | "she'd've": "she would have",
70 | "she'll": "she will",
71 | "she'll've": "she will have",
72 | "she's": "she is",
73 | "should've": "should have",
74 | "shouldn't": "should not",
75 | "shouldn't've": "should not have",
76 | "so've": "so have",
77 | "so's": "so as",
78 | "that'd": "that would",
79 | "that'd've": "that would have",
80 | "that's": "that is",
81 | "there'd": "there would",
82 | "there'd've": "there would have",
83 | "there's": "there is",
84 | "they'd": "they would",
85 | "they'd've": "they would have",
86 | "they'll": "they will",
87 | "they'll've": "they will have",
88 | "they're": "they are",
89 | "they've": "they have",
90 | "to've": "to have",
91 | "wasn't": "was not",
92 | "we'd": "we would",
93 | "we'd've": "we would have",
94 | "we'll": "we will",
95 | "we'll've": "we will have",
96 | "we're": "we are",
97 | "we've": "we have",
98 | "weren't": "were not",
99 | "what'll": "what will",
100 | "what'll've": "what will have",
101 | "what're": "what are",
102 | "what's": "what is",
103 | "what've": "what have",
104 | "when's": "when is",
105 | "when've": "when have",
106 | "where'd": "where did",
107 | "where's": "where is",
108 | "where've": "where have",
109 | "who'll": "who will",
110 | "who'll've": "who will have",
111 | "who's": "who is",
112 | "who've": "who have",
113 | "why's": "why is",
114 | "why've": "why have",
115 | "will've": "will have",
116 | "won't": "will not",
117 | "won't've": "will not have",
118 | "would've": "would have",
119 | "wouldn't": "would not",
120 | "wouldn't've": "would not have",
121 | "y'all": "you all",
122 | "y'all'd": "you all would",
123 | "y'all'd've": "you all would have",
124 | "y'all're": "you all are",
125 | "y'all've": "you all have",
126 | "you'd": "you would",
127 | "you'd've": "you would have",
128 | "you'll": "you will",
129 | "you'll've": "you will have",
130 | "you're": "you are",
131 | "you've": "you have"
132 | }
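# --- Hypothetical usage sketch (not part of the original file) ---
# A quick way to apply this map; normalization.py's expand_contractions()
# is the fuller, case-aware version.
#
#   import re
#   text = "I can't go because it's late"
#   pattern = re.compile('|'.join(re.escape(k) for k in CONTRACTION_MAP))
#   expanded = pattern.sub(lambda m: CONTRACTION_MAP[m.group(0)], text)
#   # expanded -> "I cannot go because it is late"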
--------------------------------------------------------------------------------
/AwesomeResources/README.md:
--------------------------------------------------------------------------------
1 | # Awesome Machine Learning Resources for SEO
2 |
3 | A curated list of libraries with application to SEO
4 |
5 |
6 | ## Contents
7 |
8 | - [Machine Learning Frameworks](#machine-learning-frameworks)
9 | - [Deep Learning Frameworks](#deep-learning-frameworks)
10 | - [Deep Learning Projects](#deep-learning-projects)
11 | - [Examples](#examples)
12 | - [Natural Language Processing (NLP)](#nlp)
13 | - [Public Data Sets](#public-data-sets)
14 |
15 | ## Machine Learning Frameworks
16 |
17 | - [scikit-learn](http://scikit-learn.org/stable/) - scikit-learn: machine learning in Python.
18 | - [vowpal_porpoise](https://github.com/josephreisinger/vowpal_porpoise) - Wrapper for vowpal_wabbit.
19 | - [Xgboost](https://xgboost.readthedocs.io/en/latest/) - Scalable, Portable and Distributed Gradient Boosting.
20 |
21 |
22 | ## Deep Learning Frameworks
23 |
24 | - [Pytorch](https://github.com/pytorch/pytorch) - Tensors and Dynamic neural networks in Python with strong GPU acceleration
25 | - [Tensorflow](https://github.com/tensorflow/tensorflow) - Computation using data flow graphs for scalable machine learning.
26 | - [Keras](https://keras.io) - High-level neural networks API.
27 | - [chainer](https://github.com/chainer/chainer) - A flexible framework of neural networks for deep learning.
28 |
29 |
30 | ## Deep Learning Projects
31 |
32 | - [fairseq-py](https://github.com/facebookresearch/fairseq-py) - Sequence-to-Sequence Toolkit.
33 | - [DrQA](https://github.com/facebookresearch/DrQA) - Reading Wikipedia to Answer Open-Domain Questions.
34 | - [tensorflow-wavenet](https://github.com/ibab/tensorflow-wavenet) - DeepMind's WaveNet.
35 |
36 |
37 | ## Examples
38 |
39 | - [Seedbank](https://research.google.com/seedbank/) - Collection of interactive machine learning models.
40 | - [Google CodeLabs](https://codelabs.developers.google.com/?cat=TensorFlow) - Guided Tensorflow tutorials.
41 | - [Tensorflow Workshops](https://github.com/tensorflow/workshops) - Colab Notebook examples.
42 | - [Tensorflow.js](https://js.tensorflow.org/) - Interactive tensorflow.js demos.
43 | - [Tensorflow Wide & Deep](https://github.com/tensorflow/models/tree/master/official/wide_deep) - Predicting income with the census income dataset example.
44 | - [What If Tool](https://pair-code.github.io/what-if-tool/) - Inspect the inner workings of a model, no code required.
45 | - [PAIR](https://ai.google/research/teams/brain/pair) - People + AI Research.
46 | - [Facets](https://pair-code.github.io/facets/) - Interactive data visualization.
47 | - [Beat Blender](https://experiments.withgoogle.com/ai/beat-blender/view/) - Make beats with machine learning.
48 | - [Quick, Draw!](https://quickdraw.withgoogle.com/) - Give Google your drawing training data. :)
49 | - [Breast Cancer Detection](https://colab.research.google.com/drive/1ANmq66IO-nKoYWOTC1eIyqvtNxlR7bkn) - Incredible example of how machine learning can help detect cancer.
50 |
51 | ## NLP
52 |
53 | - [gensim](https://github.com/piskvorky/gensim) - Topic Modeling.
54 | - [nltk](http://www.nltk.org) - Natural Language Toolkit.
55 | - [pattern](https://github.com/clips/pattern) - Web mining module.
56 | - [goose3](https://github.com/goose3/goose3) - A Python 3 compatible version of the Goose web text extractor.
57 | - [SpaCy](https://github.com/explosion/spaCy) - This library is pretty awesome, though it can be hard to install on Windows.
58 | - [jellyfish](https://github.com/jamesturk/jellyfish) - Approximate and phonetic matching of strings.
59 | - [facebook/fastText](https://github.com/facebookresearch/fastText) - Library for fast text representation and classification.
60 | - [google/sentencepiece](https://github.com/google/sentencepiece) - Unsupervised text tokenizer for Neural Network-based text generation.
61 |
62 |
63 | ## Public Data Sets
64 |
65 | - [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets)
66 |
67 |
68 | Started based on [Awesome Python Data Science](https://github.com/thomasjpfan/awesome-python-data-science).
69 |
--------------------------------------------------------------------------------
/NLP/normalization.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Aug 26 20:45:10 2016
4 |
5 | @author: DIP
6 | """
7 |
8 | from contractions import CONTRACTION_MAP
9 | import re
10 | import nltk
11 | import string
12 | from nltk.stem import WordNetLemmatizer
13 | import html
14 | import unicodedata
15 |
16 | stopword_list = nltk.corpus.stopwords.words('english')
17 |
18 | wnl = WordNetLemmatizer()
19 |
20 |
21 | def tokenize_text(text):
22 | tokens = nltk.word_tokenize(text)
23 | tokens = [token.strip() for token in tokens]
24 | return tokens
25 |
26 | def expand_contractions(text, contraction_mapping):
27 |
28 | contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
29 | flags=re.IGNORECASE|re.DOTALL)
30 | def expand_match(contraction):
31 | match = contraction.group(0)
32 | first_char = match[0]
33 | expanded_contraction = contraction_mapping.get(match)\
34 | if contraction_mapping.get(match)\
35 | else contraction_mapping.get(match.lower())
36 | expanded_contraction = first_char+expanded_contraction[1:]
37 | return expanded_contraction
38 |
39 | expanded_text = contractions_pattern.sub(expand_match, text)
40 | expanded_text = re.sub("'", "", expanded_text)
41 | return expanded_text
42 |
43 |
44 | from nltk.corpus import wordnet as wn
45 | import en_core_web_sm
46 | nlp = en_core_web_sm.load()
47 |
48 | # Annotate text tokens with POS tags
49 | def pos_tag_text(text):
50 |
51 | def penn_to_wn_tags(pos_tag):
52 | if pos_tag.startswith('ADJ'):
53 | return wn.ADJ
54 | elif pos_tag.startswith('VERB'):
55 | return wn.VERB
56 | elif pos_tag.startswith('NOUN'):
57 | return wn.NOUN
58 | elif pos_tag.startswith('ADV'):
59 | return wn.ADV
60 | else:
61 | return None
62 |
63 | tagged_text = nlp(text)
64 | tagged_lower_text = [(str(word).lower(), penn_to_wn_tags(word.pos_))
65 | for word in
66 | tagged_text]
67 | return tagged_lower_text
68 |
69 | # lemmatize text based on POS tags
70 | def lemmatize_text(text):
71 |
72 | pos_tagged_text = pos_tag_text(text)
73 | lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
74 | else word
75 | for word, pos_tag in pos_tagged_text]
76 | lemmatized_text = ' '.join(lemmatized_tokens)
77 | return lemmatized_text
78 |
79 |
80 | def remove_special_characters(text):
81 | tokens = tokenize_text(text)
82 | pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
83 | filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
84 | filtered_text = ' '.join(filtered_tokens)
85 | return filtered_text
86 |
87 | def remove_stopwords(text):
88 | tokens = tokenize_text(text)
89 | filtered_tokens = [token for token in tokens if token not in stopword_list]
90 | filtered_text = ' '.join(filtered_tokens)
91 | return filtered_text
92 |
93 | def sort_terms(text):
94 | tokens = tokenize_text(text)
95 | tokens.sort()
96 | filtered_text = ' '.join(tokens)
97 | return filtered_text
98 |
99 | def keep_text_characters(text):
100 | filtered_tokens = []
101 | tokens = tokenize_text(text)
102 | for token in tokens:
103 | if re.search('[a-zA-Z]', token):
104 | filtered_tokens.append(token)
105 | filtered_text = ' '.join(filtered_tokens)
106 | return filtered_text
107 |
108 | def unescape_html(text):
109 |
110 |     return html.unescape(text)
111 |
112 |
113 | def normalize_corpus(corpus, lemmatize=True,
114 | only_text_chars=False,
115 | tokenize=False, sort_text=False):
116 |
117 | normalized_corpus = []
118 | for text in corpus:
119 |         text = html.unescape(text)
120 | text = expand_contractions(text, CONTRACTION_MAP)
121 | if lemmatize:
122 | text = lemmatize_text(text)
123 | else:
124 | text = text.lower()
125 | text = remove_special_characters(text)
126 | text = remove_stopwords(text)
127 | if sort_text:
128 | text = sort_terms(text)
129 | if only_text_chars:
130 | text = keep_text_characters(text)
131 |
132 | if tokenize:
133 | text = tokenize_text(text)
134 | normalized_corpus.append(text)
135 | else:
136 | normalized_corpus.append(text)
137 |
138 | return normalized_corpus
139 |
140 |
141 | def parse_document(document):
142 |     # Python 3: decode bytes input; there is no separate 'unicode' type any more
143 |     if isinstance(document, bytes):
144 |         document = unicodedata.normalize('NFKD', document.decode('utf-8', 'ignore'))
145 |     elif not isinstance(document, str):
146 |         raise ValueError('Document is not a string!')
147 |
148 |     document = re.sub('\n', ' ', document)
149 | document = document.strip()
150 | sentences = nltk.sent_tokenize(document)
151 | sentences = [sentence.strip() for sentence in sentences]
152 | return sentences
153 |
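# --- Hypothetical usage sketch (not part of the original file) ---
# Assumes the NLTK 'punkt', 'stopwords' and 'wordnet' data and the spaCy
# 'en_core_web_sm' model are already downloaded/installed.
#
#   corpus = ["I can't believe they're ranking for that query!",
#             "HTML &amp; entities get unescaped first."]
#   cleaned = normalize_corpus(corpus, lemmatize=True, only_text_chars=True)
#   # cleaned -> list of lowercased, lemmatized strings with punctuation,
#   #            stopwords and non-alphabetic tokens removed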
--------------------------------------------------------------------------------
/api/google_search_console/gsc.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function, unicode_literals
2 |
3 | import sys
4 | import httplib2
5 | import pandas as pd
6 | import time
7 | import re
8 | from tqdm import tqdm
9 | import datetime as dt
10 | import os
11 | from datetime import timedelta, date
12 | from urllib.error import HTTPError
13 | from apiclient import errors
14 | from apiclient.discovery import build
15 |
16 | from config import config
17 | from .errors import *
18 |
19 | class GscClient(object):
20 |
21 |     def __init__(self, *args):
22 |         # Accept an optional pre-authorized httplib2.Http client; otherwise authenticate.
23 |         client = args[0] if len(args) > 0 else None
24 |         if not isinstance(client, httplib2.Http):
25 |             from api.google import authenticate
26 |             try:
27 |                 client = authenticate()
28 |             except Exception:
29 |                 raise GscConfigError('Make sure that CLIENT_ID and CLIENT_SECRET are set in config.py')
30 |
31 | self.DATA_FOLDER = config['DATA_FOLDER']
32 | self.ROW_LIMIT = config['ROW_LIMIT']
33 | self.client = client
34 |
35 | # Call GSC Service
36 | def get_gsc_service(self):
37 |
38 | webmasters_service = build('webmasters', 'v3', http=self.client)
39 |
40 | return webmasters_service
41 |
42 | @staticmethod
43 | def daterange(start_date, end_date):
44 | for n in range(int ((end_date - start_date).days)):
45 | yield start_date + timedelta(n)
46 |
47 | @staticmethod
48 | def execute_request(service, property_uri, request):
49 | """
50 | Executes a searchAnalytics.query request.
51 | Args:
52 | service: The webmasters service to use when executing the query.
53 | property_uri: The site or app URI to request data for.
54 | request: The request to be executed.
55 | Returns:
56 | An array of response rows.
57 | """
58 | return service.searchanalytics().query(
59 | siteUrl=property_uri, body=request).execute()
60 |
61 |
62 |
63 | '''
64 | Parameters:
65 |
66 | Positional:
67 |         clienturl: (str) The domain URL property name in Google Search Console.
68 |         days: (int) How many days of history to pull.
69 |
70 |     Keyword:
71 |         threshold_type: (str) 'click' or 'impression'. Default: 'impression'
72 |         threshold: (int) For each day, keep paging through results until fewer than this many clicks or impressions are returned. Default: 1
73 |         pos_limit: (int) Omit rows whose average position is greater than this limit. Default: None
74 |         country: (str) Three-letter country code to filter on. Default: 'usa'
75 |         output_fn: (str) Name of the output CSV file. If not set, a unique name will be generated.
76 | '''
77 | def get_site_data(self, clienturl, days, **data):
78 |
79 | thresholdtype = data.get('threshold_type', 'impression')
80 | threshold = data.get('threshold', 1)
81 | poslimit = data.get('pos_limit', None)
82 | country = data.get('country', 'usa')
83 | outputFn = data.get('output_fn', "".join([self.DATA_FOLDER, "/", "gsc_", re.sub('[^0-9a-zA-Z]+', '_', clienturl), dt.date.today().strftime("%Y_%m"), ".csv"]))
84 |
85 | if (self.DATA_FOLDER + "/") not in outputFn and os.path.isdir(self.DATA_FOLDER):
86 | outputFn = "".join([self.DATA_FOLDER, "/",outputFn])
87 |
88 | start_date = (dt.date.today()-dt.timedelta(days = (days+3) ))
89 | end_date = (dt.date.today()-dt.timedelta(days = 3))
90 |
91 | row_limit = self.ROW_LIMIT
92 |
93 | if os.path.isfile(outputFn):
94 | print('Reloading Existing: ' + outputFn)
95 | df = pd.read_csv(outputFn, encoding = "utf-8")
96 | if poslimit is not None:
97 | return df[df.position <= poslimit]
98 | return df
99 |
100 | output = []
101 |
102 | print("Building new {} file".format(outputFn));
103 | print('Getting Webmaster Service')
104 | webmasters_service = self.get_gsc_service()
105 | time.sleep(1)
106 |
107 | pbar = tqdm(total=int((end_date - start_date).days), desc='Pulling Google Search Console Data', file=sys.stdout)
108 |
109 | for single_date in self.daterange(start_date, end_date):
110 |
111 | month_date = str(single_date.strftime("%Y-%m"))
112 | single_date = str(single_date)
113 | pbar.update()
114 |
115 | try:
116 | n = 0
117 |                 Count = 11  # seeded above any reasonable threshold so the while loop runs at least once
118 | startRow = 0
119 | while (Count >= threshold):
120 |
121 | #print("-----Executing------- " + str(startRow))
122 | request = {
123 | 'startDate': single_date,
124 | 'endDate': single_date,
125 | 'dimensions': ['query', 'page'],
126 | 'dimensionFilterGroups': [
127 | {
128 | 'filters': [
129 | {
130 | 'dimension': 'country',
131 | 'expression': country
132 | }
133 | ]
134 | }
135 | ],
136 | 'rowLimit': row_limit,
137 | 'startRow': int(startRow)
138 | }
139 | try:
140 | response = self.execute_request(webmasters_service, clienturl, request)
141 | except Exception as e:
142 | print("API Error:", str(e))
143 | time.sleep(30)
144 | continue
145 |
146 | startRow = startRow + (row_limit)
147 | tCount, NewOutput = self.handle_response(response, clienturl, thresholdtype, threshold, month_date)
148 | output = output + NewOutput
149 |
150 | n = n + 1
151 | if (n % 3 == 0):
152 | time.sleep(1)
153 | Count = int(tCount)
154 |
155 |
156 | except Exception as e:
157 | raise GscApiError(str(e))
158 |
159 |
160 | pbar.close()
161 |
162 | df = pd.DataFrame(output)
163 | print("Total rows found: {}. Saving to csv.".format(str( len(df) ) ) );
164 | df.to_csv(outputFn, header=True, index=False, encoding='utf-8')
165 |
166 | if poslimit:
167 | return df[df.position <= poslimit]
168 |
169 | return df
170 |
171 | @staticmethod
172 | def handle_response(response, clienturl, thresholdtype, threshold, month_date):
173 |
174 | output = []
175 | tCount = -1
176 |
177 | if 'rows' not in response:
178 | return int(tCount), output
179 |
180 | rows = response['rows']
181 | row_format = '{:<20}' + '{:>20}' * 4
182 | for row in rows:
183 | keys = ''
184 |
185 | if 'keys' in row:
186 |
187 | if thresholdtype == 'click':
188 | tcheck = int(row['clicks'])
189 | else:
190 | tcheck = int(row['impressions'])
191 |
192 | if tcheck < int(threshold):
193 | continue
194 |
195 | query = str(row['keys'][0])
196 | page = str(row['keys'][1])
197 |                 entry = {'clientID': clienturl, 'query': query, 'page': page,
198 |                          'clicks': row['clicks'], 'impressions': row['impressions'], 'ctr': row['ctr'],
199 |                          'position': int(row['position']), 'month': str(month_date)}
200 |
201 |                 output.append(entry)
202 | tCount = tcheck
203 |
204 | return int(tCount), output
205 |
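# --- Hypothetical usage sketch (not part of the original file) ---
# Assumes config.py defines DATA_FOLDER / ROW_LIMIT and that api.google.authenticate()
# can return an authorized httplib2.Http client for the Search Console account.
#
#   from api.google_search_console.gsc import GscClient
#
#   gsc = GscClient()  # no client passed, so it falls back to authenticate()
#   df = gsc.get_site_data('https://example.com/', 30,
#                          threshold_type='impression',
#                          threshold=1,
#                          pos_limit=20,
#                          country='usa')
#   print(df.head())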
--------------------------------------------------------------------------------
/AwesomeResources/ML Problem Framing Worksheet.md:
--------------------------------------------------------------------------------
1 | # ML Problem Framing Worksheet
2 |
3 | (This worksheet was transcribed into Markdown from the original provided by Kshitij Gautam. Neil Martinsen-Burrell also helped modify the current doc.)
4 |
5 | ## Exercise 1: Start Clearly and Simply
6 |
7 | **Write what you'd like the machine learned model to do.**
8 |
9 | _We want the machine learned model to..._
10 |
11 |
12 | **Example**: We want the machine learned model to predict how popular a video just
13 | uploaded now will become in the future.
14 |
15 | **Tips**: At this point, the statement can be qualitative, but make sure this
16 | captures your real goal, not an indirect goal.
17 |
18 | ## Exercise 2: Your Ideal Outcome
19 |
20 | **Your ML model is intended to produce some desirable outcome. What is this
21 | outcome, independent of the model itself? Note that this outcome may be quite
22 | different from how you assess the model and its quality.**
23 |
24 | _Our ideal outcome is..._
25 |
26 | **Example**: Our ideal outcome is to only transcode popular videos to minimize
27 | server resource utilization.
28 |
29 | **Example**: Our ideal outcome is to suggest videos that people find useful,
30 | entertaining, and worth their time.
31 |
32 | **Tips**: You don't need to limit yourself to metrics for which your product
33 | has already been optimizing. Instead, try to focus on the larger objective of
34 | your product or service.
35 |
36 | ## Exercise 3: Your Success Metrics
37 |
38 | **Write down your metrics for success and failure with the ML system. The
39 | failure metrics are important. Both metrics should be phrased independently of
40 | the evaluation metrics of the model. Talk about the anticipated outcomes
41 | instead.**
42 |
43 | _Our success metrics are..._
44 |
45 | _Our key results for the success metrics are..._
46 |
47 | _Our ML model is deemed a failure if..._
48 |
49 | **Example**: Our success metrics are CPU resource utilization. Our KR for the
50 | success metric is to achieve a 35% reduced cost for transcoding. Our ML model
51 | is a failure if the CPU resource cost reduction is less than the CPU costs for
52 | training and serving the model.
53 |
54 | **Example**: Our success metrics are the number of popular videos properly
55 | predicted. Our KR for the success metric is to properly predict the top 95% of videos 28
56 | days after being uploaded. Our ML model is a failure if the number of videos
57 | properly predicted is no better than current heuristics.
58 |
59 | **Tips**: Are the metrics measurable? How will you measure them? (It's okay if
60 | this is via a live experiment. Some metrics can't be measured offline.) When
61 | are you able to measure them? (How long will it take to know whether your new
62 | system is a success or failure?) Consider long-term engineering and
63 | maintenance costs. Failure may not only be caused by non-achievement of the
64 | success metric.
65 |
66 | ## Exercise 4: Your Output
67 |
68 | **Write the output that you want your ML model to produce.**
69 |
70 | _The output from our ML model will be..._
71 |
72 | _It is defined as..._
73 |
74 | **Example**: The output from our ML model will be one of the 3 classes of
75 | videos (very popular, somewhat popular, not popular) defined as the top 3, 7,
76 | or 90 percentile of watch time 28 days after uploading.
77 |
78 | **Tips**: The output must be quantifiable with a definition that the model can
79 | produce. Are you able to obtain example outputs to use for training data?
80 | (How and from what source?) Your output examples may need to be engineered
81 | (like above where watch time is turned into a percentile). If it is difficult
82 | to obtain example outputs for training, you may need to reformulate your
83 | problem.
84 |
85 | ## Exercise 5: Using the Output
86 |
87 | **Write when your output must be obtained from the ML model and how it is used
88 | in your product.**
89 |
90 | _The output from the ML model will be made..._
91 |
92 | _The output will be used for..._
93 |
94 | **Example**: The prediction of a video's popularity will be made as soon as a
95 | new video is uploaded. The output will be used for determining the transcoding
96 | output for the video.
97 |
98 | **Tips**: Consider how you will use the model output. Will it be presented to
99 | a user in a UI? Consumed by subsequent business logic? Do you have latency
100 | requirements? The latency of data from remote services might make them
101 | infeasible to use. Remember the Oracle Test: if you always had the correct
102 | answer, how would you use that in your product?
103 |
104 |
105 | ## Exercise 6: Your Heuristics
106 |
107 |
108 | **Write how you would solve the problem if you didn't use ML. What heuristics
109 | might you use?**
110 |
111 | _If we didn't use ML, we would..._
112 |
113 | **Example**: If we didn't use ML, we would assume new videos uploaded by
114 | creators who had uploaded popular videos in the past will become popular
115 | again.
116 |
117 | **Tips**: Think about a scenario where you need to deliver the product
118 | tomorrow and you can only hardcode the business logic. What would you do?
119 |
120 | ## Exercise 7a: Formulate Your Problem as an ML Problem
121 |
122 | **Write down what you think is the best technical solution for your problem.**
123 |
124 | _Our problem is best framed as:_
125 | - _Binary classification_
126 | - _Unidimensional Regression_
127 | - _Multi-class, single-label classification_
128 | - _Multi-class, multi-label classification_
129 | - _Multidimensional regression_
130 | - _Clustering (unsupervised)_
131 | - _Other:_
132 |
133 | _which predicts..._
134 |
135 | **Example**: Our problem is best framed as 3-class, single label
136 | classification which predicts whether a video will be in one of three classes
137 | (very popular, somewhat popular, not popular) 28 days after being uploaded.
138 |
139 | ## Exercise 7b: Cast Your Problem as a Simpler Problem
140 |
141 | **Restate your problem as a binary classification or unidimensional
142 | regression.**
143 |
144 | _Our problem is best framed as:_
145 | - _Binary classification_
146 | - _Unidimensional regression_
147 |
148 | **Example**: We will predict whether uploaded videos will become very popular or
149 | not. OR We will predict how popular an uploaded video will be in terms of the
150 | number of views it will receive in a 28 day window.
151 |
152 | ## Exercise 8: Design your Data for the Model
153 |
154 | **Write the data you want the ML model to use to make the predictions.**
155 |
156 | _Input 1:_
157 |
158 | _Input 2:_
159 |
160 | _Input 3:_
161 |
162 | **Example**: Input 1: Title, Input 2: Uploader, Input 3: Upload time, Input 4:
163 | Uploader's recent videos
164 |
165 | **Tips**: Only include information available at the time the prediction is
166 | made. Each input can be a number or a list of numbers or strings. If your
167 | input has a different structure, consider whether that is the best representation for
168 | your data. (Split a list into two separate inputs? Flatten nested structures?)
169 |
170 | ## Exercise 9: Where the Data Comes From
171 |
172 | **Write down where each input comes from. Assess how much work it will be to
173 | develop a data pipeline to construct each column for one row.**
174 |
175 | _Input 1:_
176 |
177 | _Input 2:_
178 |
179 | _Input 3:_
180 |
181 | **Example**: Input 1: Title, part of VideoUploadEvent record, Input 2:
182 | Uploader, same, Input 3: Upload time, same, Input 4: Recent videos, list from
183 | a separate system.
184 |
185 | **Tips**: When does the example output become available for training purposes?
186 | Make sure all your inputs are available at serving time in exactly the format
187 | you specified.
188 |
189 | ## Exercise 10: Easily Obtained Inputs
190 |
191 | **Among the inputs you listed in Exercise 8, pick 1-3 that are easy to obtain
192 | and would produce a reasonable initial outcome.**
193 |
194 | _Input 1:_
195 |
196 | _Input 2:_
197 |
198 | **Tips**: For your heuristics, what inputs would be useful for those
199 | heuristics? Focus on inputs that can be obtained from a single system with a
200 | simple pipeline. Start with the minimum possible infrastructure.
201 |
--------------------------------------------------------------------------------
/Models/pytorch/SentenceVAE.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.utils.rnn as rnn_utils
4 | from utils import to_var
5 |
6 | class SentenceVAE(nn.Module):
7 |
8 | def __init__(self, vocab_size, embedding_size, rnn_type, hidden_size, word_dropout, embedding_dropout, latent_size,
9 | sos_idx, eos_idx, pad_idx, unk_idx, max_sequence_length, num_layers=1, bidirectional=False):
10 |
11 | super().__init__()
12 | self.tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
13 |
14 | self.max_sequence_length = max_sequence_length
15 | self.sos_idx = sos_idx
16 | self.eos_idx = eos_idx
17 | self.pad_idx = pad_idx
18 | self.unk_idx = unk_idx
19 |
20 | self.latent_size = latent_size
21 |
22 | self.rnn_type = rnn_type
23 | self.bidirectional = bidirectional
24 | self.num_layers = num_layers
25 | self.hidden_size = hidden_size
26 |
27 | self.embedding = nn.Embedding(vocab_size, embedding_size)
28 | self.word_dropout_rate = word_dropout
29 | self.embedding_dropout = nn.Dropout(p=embedding_dropout)
30 |
31 | if rnn_type == 'rnn':
32 | rnn = nn.RNN
33 | elif rnn_type == 'gru':
34 | rnn = nn.GRU
35 | # elif rnn_type == 'lstm':
36 | # rnn = nn.LSTM
37 | else:
38 |             raise ValueError("Unsupported rnn_type: '{}'".format(rnn_type))
39 |
40 | self.encoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
41 | self.decoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
42 |
43 | self.hidden_factor = (2 if bidirectional else 1) * num_layers
44 |
45 | self.hidden2mean = nn.Linear(hidden_size * self.hidden_factor, latent_size)
46 | self.hidden2logv = nn.Linear(hidden_size * self.hidden_factor, latent_size)
47 | self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor)
48 | self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
49 |
50 | def forward(self, input_sequence, length):
51 |
52 | batch_size = input_sequence.size(0)
53 | sorted_lengths, sorted_idx = torch.sort(length, descending=True)
54 | input_sequence = input_sequence[sorted_idx]
55 |
56 | # ENCODER
57 | input_embedding = self.embedding(input_sequence)
58 |
59 | packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.data.tolist(), batch_first=True)
60 |
61 | _, hidden = self.encoder_rnn(packed_input)
62 |
63 | if self.bidirectional or self.num_layers > 1:
64 | # flatten hidden state
65 | hidden = hidden.view(batch_size, self.hidden_size*self.hidden_factor)
66 | else:
67 | hidden = hidden.squeeze()
68 |
69 | # REPARAMETERIZATION
70 | mean = self.hidden2mean(hidden)
71 | logv = self.hidden2logv(hidden)
72 | std = torch.exp(0.5 * logv)
73 |
74 | z = to_var(torch.randn([batch_size, self.latent_size]))
75 | z = z * std + mean
76 |
77 | # DECODER
78 | hidden = self.latent2hidden(z)
79 |
80 | if self.bidirectional or self.num_layers > 1:
81 | # unflatten hidden state
82 | hidden = hidden.view(self.hidden_factor, batch_size, self.hidden_size)
83 | else:
84 | hidden = hidden.unsqueeze(0)
85 |
86 | # decoder input
87 | if self.word_dropout_rate > 0:
88 |             # randomly replace decoder input with <unk> tokens