9 |
10 | #### #2 Press Command (⌘) + S to save the snippet for future use.
11 |
12 | Sidenote: Right click on the snippet title to rename it to whatever you would like.
13 |
14 | #### #3 To use your new image snippet, run a Google Image Search and scroll down until the page has loaded as many images as you would like to save.
15 |
16 | #### #4 Navigate to Chrome DevTools > Sources > Snippets, then right click your saved snippet and click 'Run'.
17 |
18 | #### #5 A .txt file of all the image URLs will automatically download.
19 |
20 |
21 |
22 |
23 | # Download Images From URLs
24 |
25 | ## Use Brew wget
26 |
27 | #### #1 Install Homebrew
28 |
29 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
30 |
31 | #### #2 Install wget
32 |
33 | brew install wget
34 |
35 | #### #3 Add your urls.txt to a new folder, then navigate to that folder in your terminal (using the 'ls' and 'cd' commands). For a refresher, see https://wsvincent.com/terminal-command-line-for-beginners/
36 |
37 | #### #4 Run 'wget -i urls.txt' in your terminal (substituting whatever name your URLs file has).
38 |
39 | #### #5 All images will be downloaded into the folder.
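If you'd rather not install Homebrew and wget, a rough Python alternative does the same job. This is a minimal sketch, assuming a urls.txt file (one image URL per line) sits in the folder you run it from; the `images/` output folder name is just an example.

```python
# download_images.py - hypothetical stand-in for `wget -i urls.txt`
import os
from urllib.request import urlretrieve

with open("urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

os.makedirs("images", exist_ok=True)
for i, url in enumerate(urls):
    # fall back to a generated name when the URL has no usable filename
    name = os.path.basename(url.split("?")[0]) or "image_{}.jpg".format(i)
    try:
        urlretrieve(url, os.path.join("images", name))
    except Exception as e:
        print("Skipping {}: {}".format(url, e))  # some hosts block scripted downloads
```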
40 |
41 |
42 |
43 |
44 | # WOOOOHOOOOOOOO!!!! Training Data For AAALLLLL
45 |
--------------------------------------------------------------------------------
/NLP/contractions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Aug 01 01:11:02 2016
4 |
5 | @author: DIP
6 | """
7 |
8 | CONTRACTION_MAP = {
9 | "ain't": "is not",
10 | "aren't": "are not",
11 | "can't": "cannot",
12 | "can't've": "cannot have",
13 | "'cause": "because",
14 | "could've": "could have",
15 | "couldn't": "could not",
16 | "couldn't've": "could not have",
17 | "didn't": "did not",
18 | "doesn't": "does not",
19 | "don't": "do not",
20 | "hadn't": "had not",
21 | "hadn't've": "had not have",
22 | "hasn't": "has not",
23 | "haven't": "have not",
24 | "he'd": "he would",
25 | "he'd've": "he would have",
26 | "he'll": "he will",
27 |     "he'll've": "he will have",
28 | "he's": "he is",
29 | "how'd": "how did",
30 | "how'd'y": "how do you",
31 | "how'll": "how will",
32 | "how's": "how is",
33 | "I'd": "I would",
34 | "I'd've": "I would have",
35 | "I'll": "I will",
36 | "I'll've": "I will have",
37 | "I'm": "I am",
38 | "I've": "I have",
39 | "i'd": "i would",
40 | "i'd've": "i would have",
41 | "i'll": "i will",
42 | "i'll've": "i will have",
43 | "i'm": "i am",
44 | "i've": "i have",
45 | "isn't": "is not",
46 | "it'd": "it would",
47 | "it'd've": "it would have",
48 | "it'll": "it will",
49 | "it'll've": "it will have",
50 | "it's": "it is",
51 | "let's": "let us",
52 | "ma'am": "madam",
53 | "mayn't": "may not",
54 | "might've": "might have",
55 | "mightn't": "might not",
56 | "mightn't've": "might not have",
57 | "must've": "must have",
58 | "mustn't": "must not",
59 | "mustn't've": "must not have",
60 | "needn't": "need not",
61 | "needn't've": "need not have",
62 | "o'clock": "of the clock",
63 | "oughtn't": "ought not",
64 | "oughtn't've": "ought not have",
65 | "shan't": "shall not",
66 | "sha'n't": "shall not",
67 | "shan't've": "shall not have",
68 | "she'd": "she would",
69 | "she'd've": "she would have",
70 | "she'll": "she will",
71 | "she'll've": "she will have",
72 | "she's": "she is",
73 | "should've": "should have",
74 | "shouldn't": "should not",
75 | "shouldn't've": "should not have",
76 | "so've": "so have",
77 | "so's": "so as",
78 | "that'd": "that would",
79 | "that'd've": "that would have",
80 | "that's": "that is",
81 | "there'd": "there would",
82 | "there'd've": "there would have",
83 | "there's": "there is",
84 | "they'd": "they would",
85 | "they'd've": "they would have",
86 | "they'll": "they will",
87 | "they'll've": "they will have",
88 | "they're": "they are",
89 | "they've": "they have",
90 | "to've": "to have",
91 | "wasn't": "was not",
92 | "we'd": "we would",
93 | "we'd've": "we would have",
94 | "we'll": "we will",
95 | "we'll've": "we will have",
96 | "we're": "we are",
97 | "we've": "we have",
98 | "weren't": "were not",
99 | "what'll": "what will",
100 | "what'll've": "what will have",
101 | "what're": "what are",
102 | "what's": "what is",
103 | "what've": "what have",
104 | "when's": "when is",
105 | "when've": "when have",
106 | "where'd": "where did",
107 | "where's": "where is",
108 | "where've": "where have",
109 | "who'll": "who will",
110 | "who'll've": "who will have",
111 | "who's": "who is",
112 | "who've": "who have",
113 | "why's": "why is",
114 | "why've": "why have",
115 | "will've": "will have",
116 | "won't": "will not",
117 | "won't've": "will not have",
118 | "would've": "would have",
119 | "wouldn't": "would not",
120 | "wouldn't've": "would not have",
121 | "y'all": "you all",
122 | "y'all'd": "you all would",
123 | "y'all'd've": "you all would have",
124 | "y'all're": "you all are",
125 | "y'all've": "you all have",
126 | "you'd": "you would",
127 | "you'd've": "you would have",
128 | "you'll": "you will",
129 | "you'll've": "you will have",
130 | "you're": "you are",
131 | "you've": "you have"
132 | }
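# --- Hypothetical usage sketch (not part of the original file) ---
# A quick way to apply this map; normalization.py's expand_contractions()
# is the fuller, case-aware version.
#
#   import re
#   text = "I can't go because it's late"
#   pattern = re.compile('|'.join(re.escape(k) for k in CONTRACTION_MAP))
#   expanded = pattern.sub(lambda m: CONTRACTION_MAP[m.group(0)], text)
#   # expanded -> "I cannot go because it is late"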
--------------------------------------------------------------------------------
/AwesomeResources/README.md:
--------------------------------------------------------------------------------
1 | # Awesome Machine Learning Resources for SEO
2 |
3 | A curated list of libraries with application to SEO
4 |
5 |
6 | ## Contents
7 |
8 | - [Machine Learning Frameworks](#machine-learning-frameworks)
9 | - [Deep Learning Frameworks](#deep-learning-frameworks)
10 | - [Deep Learning Projects](#deep-learning-projects)
11 | - [Examples](#examples)
12 | - [Natural Language Processing (NLP)](#nlp)
13 | - [Public Data Sets](#public-data-sets)
14 |
15 | ## Machine Learning Frameworks
16 |
17 | - [scikit-learn](http://scikit-learn.org/stable/) - scikit-learn: machine learning in Python.
18 | - [vowpal_porpoise](https://github.com/josephreisinger/vowpal_porpoise) - Wrapper for vowpal_wabbit.
19 | - [Xgboost](https://xgboost.readthedocs.io/en/latest/) - Scalable, Portable and Distributed Gradient Boosting.
20 |
21 |
22 | ## Deep Learning Frameworks
23 |
24 | - [Pytorch](https://github.com/pytorch/pytorch) - Tensors and Dynamic neural networks in Python with strong GPU acceleration
25 | - [Tensorflow](https://github.com/tensorflow/tensorflow) - Computation using data flow graphs for scalable machine learning.
26 | - [Keras](https://keras.io) - High-level neural networks API.
27 | - [chainer](https://github.com/chainer/chainer) - A flexible framework of neural networks for deep learning.
28 |
29 |
30 | ## Deep Learning Projects
31 |
32 | - [fairseq-py](https://github.com/facebookresearch/fairseq-py) - Sequence-to-Sequence Toolkit.
33 | - [DrQA](https://github.com/facebookresearch/DrQA) - Reading Wikipedia to Answer Open-Domain Questions.
34 | - [tensorflow-wavenet](https://github.com/ibab/tensorflow-wavenet) - DeepMind's WaveNet.
35 |
36 |
37 | ## Examples
38 |
39 | - [Seedbank](https://research.google.com/seedbank/) - Collection of interactive machine learning models.
40 | - [Google CodeLabs](https://codelabs.developers.google.com/?cat=TensorFlow) - Guided Tensorflow tutorials.
41 | - [Tensorflow Workshops](https://github.com/tensorflow/workshops) - Colab Notebook examples.
42 | - [Tensorflow.js](https://js.tensorflow.org/) - Interactive tensorflow.js demos.
43 | - [Tensorflow Wide & Deep](https://github.com/tensorflow/models/tree/master/official/wide_deep) - Predicting income with the census income dataset example.
44 | - [What If Tool](https://pair-code.github.io/what-if-tool/) - Inspect the inner workings of a model, no code required.
45 | - [PAIR](https://ai.google/research/teams/brain/pair) - People + AI Research.
46 | - [Facets](https://pair-code.github.io/facets/) - Interactive data visualization.
47 | - [Beat Blender](https://experiments.withgoogle.com/ai/beat-blender/view/) - Make beats with machine learning.
48 | - [Quick, Draw!](https://quickdraw.withgoogle.com/) - Give Google your drawing training data. :)
49 | - [Breast Cancer Detection](https://colab.research.google.com/drive/1ANmq66IO-nKoYWOTC1eIyqvtNxlR7bkn) - Incredible example of how machine learning can help detect cancer.
50 |
51 | ## NLP
52 |
53 | - [gensim](https://github.com/piskvorky/gensim) - Topic Modeling.
54 | - [nltk](http://www.nltk.org) - Natural Language Toolkit.
55 | - [pattern](https://github.com/clips/pattern) - Web mining module.
56 | - [goose3](https://github.com/goose3/goose3) - A Python 3 compatible version of the Goose web text extractor.
57 | - [SpaCy](https://github.com/explosion/spaCy) - This library is pretty awesome, though it can be hard to install on Windows.
58 | - [jellyfish](https://github.com/jamesturk/jellyfish) - Approximate and phonetic matching of strings.
59 | - [facebook/fastText](https://github.com/facebookresearch/fastText) - Library for fast text representation and classification.
60 | - [google/sentencepiece](https://github.com/google/sentencepiece) - Unsupervised text tokenizer for Neural Network-based text generation.
61 |
62 |
63 | ## Public Data Sets
64 |
65 | - [Awesome Public Datasets](https://github.com/caesar0301/awesome-public-datasets)
66 |
67 |
68 | Started based on [Awesome Python Data Science](https://github.com/thomasjpfan/awesome-python-data-science).
69 |
--------------------------------------------------------------------------------
/NLP/normalization.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Aug 26 20:45:10 2016
4 |
5 | @author: DIP
6 | """
7 |
8 | from contractions import CONTRACTION_MAP
9 | import re
10 | import nltk
11 | import string
12 | from nltk.stem import WordNetLemmatizer
13 | import html
14 | import unicodedata
15 |
16 | stopword_list = nltk.corpus.stopwords.words('english')
17 |
18 | wnl = WordNetLemmatizer()
19 |
20 |
21 | def tokenize_text(text):
22 | tokens = nltk.word_tokenize(text)
23 | tokens = [token.strip() for token in tokens]
24 | return tokens
25 |
26 | def expand_contractions(text, contraction_mapping):
27 |
28 | contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
29 | flags=re.IGNORECASE|re.DOTALL)
30 | def expand_match(contraction):
31 | match = contraction.group(0)
32 | first_char = match[0]
33 | expanded_contraction = contraction_mapping.get(match)\
34 | if contraction_mapping.get(match)\
35 | else contraction_mapping.get(match.lower())
36 | expanded_contraction = first_char+expanded_contraction[1:]
37 | return expanded_contraction
38 |
39 | expanded_text = contractions_pattern.sub(expand_match, text)
40 | expanded_text = re.sub("'", "", expanded_text)
41 | return expanded_text
42 |
43 |
44 | from nltk.corpus import wordnet as wn
45 | import en_core_web_sm
46 | nlp = en_core_web_sm.load()
47 |
48 | # Annotate text tokens with POS tags
49 | def pos_tag_text(text):
50 |
51 | def penn_to_wn_tags(pos_tag):
52 | if pos_tag.startswith('ADJ'):
53 | return wn.ADJ
54 | elif pos_tag.startswith('VERB'):
55 | return wn.VERB
56 | elif pos_tag.startswith('NOUN'):
57 | return wn.NOUN
58 | elif pos_tag.startswith('ADV'):
59 | return wn.ADV
60 | else:
61 | return None
62 |
63 | tagged_text = nlp(text)
64 | tagged_lower_text = [(str(word).lower(), penn_to_wn_tags(word.pos_))
65 | for word in
66 | tagged_text]
67 | return tagged_lower_text
68 |
69 | # lemmatize text based on POS tags
70 | def lemmatize_text(text):
71 |
72 | pos_tagged_text = pos_tag_text(text)
73 | lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
74 | else word
75 | for word, pos_tag in pos_tagged_text]
76 | lemmatized_text = ' '.join(lemmatized_tokens)
77 | return lemmatized_text
78 |
79 |
80 | def remove_special_characters(text):
81 | tokens = tokenize_text(text)
82 | pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
83 | filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
84 | filtered_text = ' '.join(filtered_tokens)
85 | return filtered_text
86 |
87 | def remove_stopwords(text):
88 | tokens = tokenize_text(text)
89 | filtered_tokens = [token for token in tokens if token not in stopword_list]
90 | filtered_text = ' '.join(filtered_tokens)
91 | return filtered_text
92 |
93 | def sort_terms(text):
94 | tokens = tokenize_text(text)
95 | tokens.sort()
96 | filtered_text = ' '.join(tokens)
97 | return filtered_text
98 |
99 | def keep_text_characters(text):
100 | filtered_tokens = []
101 | tokens = tokenize_text(text)
102 | for token in tokens:
103 | if re.search('[a-zA-Z]', token):
104 | filtered_tokens.append(token)
105 | filtered_text = ' '.join(filtered_tokens)
106 | return filtered_text
107 |
108 | def unescape_html(text):
109 |
110 |     return html.unescape(text)
111 |
112 |
113 | def normalize_corpus(corpus, lemmatize=True,
114 | only_text_chars=False,
115 | tokenize=False, sort_text=False):
116 |
117 | normalized_corpus = []
118 | for text in corpus:
119 |         text = html.unescape(text)
120 | text = expand_contractions(text, CONTRACTION_MAP)
121 | if lemmatize:
122 | text = lemmatize_text(text)
123 | else:
124 | text = text.lower()
125 | text = remove_special_characters(text)
126 | text = remove_stopwords(text)
127 | if sort_text:
128 | text = sort_terms(text)
129 | if only_text_chars:
130 | text = keep_text_characters(text)
131 |
132 | if tokenize:
133 | text = tokenize_text(text)
134 | normalized_corpus.append(text)
135 | else:
136 | normalized_corpus.append(text)
137 |
138 | return normalized_corpus
139 |
140 |
141 | def parse_document(document):
142 |     # Python 3: decode bytes input; there is no separate 'unicode' type any more
143 |     if isinstance(document, bytes):
144 |         document = unicodedata.normalize('NFKD', document.decode('utf-8', 'ignore'))
145 |     elif not isinstance(document, str):
146 |         raise ValueError('Document is not a string!')
147 |
148 |     document = re.sub('\n', ' ', document)
149 | document = document.strip()
150 | sentences = nltk.sent_tokenize(document)
151 | sentences = [sentence.strip() for sentence in sentences]
152 | return sentences
153 |
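# --- Hypothetical usage sketch (not part of the original file) ---
# Assumes the NLTK 'punkt', 'stopwords' and 'wordnet' data and the spaCy
# 'en_core_web_sm' model are already downloaded/installed.
#
#   corpus = ["I can't believe they're ranking for that query!",
#             "HTML &amp; entities get unescaped first."]
#   cleaned = normalize_corpus(corpus, lemmatize=True, only_text_chars=True)
#   # cleaned -> list of lowercased, lemmatized strings with punctuation,
#   #            stopwords and non-alphabetic tokens removed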
--------------------------------------------------------------------------------
/api/google_search_console/gsc.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, print_function, unicode_literals
2 |
3 | import sys
4 | import httplib2
5 | import pandas as pd
6 | import time
7 | import re
8 | from tqdm import tqdm
9 | import datetime as dt
10 | import os
11 | from datetime import timedelta, date
12 | from urllib.error import HTTPError
13 | from apiclient import errors
14 | from apiclient.discovery import build
15 |
16 | from config import config
17 | from .errors import *
18 |
19 | class GscClient(object):
20 |
21 |     def __init__(self, *args):
22 |         # Accept an optional pre-authorized httplib2.Http client; otherwise authenticate.
23 |         client = args[0] if len(args) > 0 else None
24 |         if not isinstance(client, httplib2.Http):
25 |             from api.google import authenticate
26 |             try:
27 |                 client = authenticate()
28 |             except Exception:
29 |                 raise GscConfigError('Make sure that CLIENT_ID and CLIENT_SECRET are set in config.py')
30 |
31 | self.DATA_FOLDER = config['DATA_FOLDER']
32 | self.ROW_LIMIT = config['ROW_LIMIT']
33 | self.client = client
34 |
35 | # Call GSC Service
36 | def get_gsc_service(self):
37 |
38 | webmasters_service = build('webmasters', 'v3', http=self.client)
39 |
40 | return webmasters_service
41 |
42 | @staticmethod
43 | def daterange(start_date, end_date):
44 | for n in range(int ((end_date - start_date).days)):
45 | yield start_date + timedelta(n)
46 |
47 | @staticmethod
48 | def execute_request(service, property_uri, request):
49 | """
50 | Executes a searchAnalytics.query request.
51 | Args:
52 | service: The webmasters service to use when executing the query.
53 | property_uri: The site or app URI to request data for.
54 | request: The request to be executed.
55 | Returns:
56 | An array of response rows.
57 | """
58 | return service.searchanalytics().query(
59 | siteUrl=property_uri, body=request).execute()
60 |
61 |
62 |
63 | '''
64 | Parameters:
65 |
66 | Positional:
67 |         clienturl: (str) The domain URL property name in Google Search Console.
68 |         days: (int) How many days of history to pull.
69 |
70 |     Keyword:
71 |         threshold_type: (str) 'click' or 'impression'. Default: 'impression'
72 |         threshold: (int) For each day, keep paging through results until fewer than this many clicks or impressions are returned. Default: 1
73 |         pos_limit: (int) Omit rows whose average position is greater than this limit. Default: None
74 |         country: (str) Three-letter country code to filter on. Default: 'usa'
75 |         output_fn: (str) Name of the output CSV file. If not set, a unique name will be generated.
76 | '''
77 | def get_site_data(self, clienturl, days, **data):
78 |
79 | thresholdtype = data.get('threshold_type', 'impression')
80 | threshold = data.get('threshold', 1)
81 | poslimit = data.get('pos_limit', None)
82 | country = data.get('country', 'usa')
83 | outputFn = data.get('output_fn', "".join([self.DATA_FOLDER, "/", "gsc_", re.sub('[^0-9a-zA-Z]+', '_', clienturl), dt.date.today().strftime("%Y_%m"), ".csv"]))
84 |
85 | if (self.DATA_FOLDER + "/") not in outputFn and os.path.isdir(self.DATA_FOLDER):
86 | outputFn = "".join([self.DATA_FOLDER, "/",outputFn])
87 |
88 | start_date = (dt.date.today()-dt.timedelta(days = (days+3) ))
89 | end_date = (dt.date.today()-dt.timedelta(days = 3))
90 |
91 | row_limit = self.ROW_LIMIT
92 |
93 | if os.path.isfile(outputFn):
94 | print('Reloading Existing: ' + outputFn)
95 | df = pd.read_csv(outputFn, encoding = "utf-8")
96 | if poslimit is not None:
97 | return df[df.position <= poslimit]
98 | return df
99 |
100 | output = []
101 |
102 | print("Building new {} file".format(outputFn));
103 | print('Getting Webmaster Service')
104 | webmasters_service = self.get_gsc_service()
105 | time.sleep(1)
106 |
107 | pbar = tqdm(total=int((end_date - start_date).days), desc='Pulling Google Search Console Data', file=sys.stdout)
108 |
109 | for single_date in self.daterange(start_date, end_date):
110 |
111 | month_date = str(single_date.strftime("%Y-%m"))
112 | single_date = str(single_date)
113 | pbar.update()
114 |
115 | try:
116 | n = 0
117 |                 Count = 11  # seeded above any reasonable threshold so the while loop runs at least once
118 | startRow = 0
119 | while (Count >= threshold):
120 |
121 | #print("-----Executing------- " + str(startRow))
122 | request = {
123 | 'startDate': single_date,
124 | 'endDate': single_date,
125 | 'dimensions': ['query', 'page'],
126 | 'dimensionFilterGroups': [
127 | {
128 | 'filters': [
129 | {
130 | 'dimension': 'country',
131 | 'expression': country
132 | }
133 | ]
134 | }
135 | ],
136 | 'rowLimit': row_limit,
137 | 'startRow': int(startRow)
138 | }
139 | try:
140 | response = self.execute_request(webmasters_service, clienturl, request)
141 | except Exception as e:
142 | print("API Error:", str(e))
143 | time.sleep(30)
144 | continue
145 |
146 | startRow = startRow + (row_limit)
147 | tCount, NewOutput = self.handle_response(response, clienturl, thresholdtype, threshold, month_date)
148 | output = output + NewOutput
149 |
150 | n = n + 1
151 | if (n % 3 == 0):
152 | time.sleep(1)
153 | Count = int(tCount)
154 |
155 |
156 | except Exception as e:
157 | raise GscApiError(str(e))
158 |
159 |
160 | pbar.close()
161 |
162 | df = pd.DataFrame(output)
163 | print("Total rows found: {}. Saving to csv.".format(str( len(df) ) ) );
164 | df.to_csv(outputFn, header=True, index=False, encoding='utf-8')
165 |
166 | if poslimit:
167 | return df[df.position <= poslimit]
168 |
169 | return df
170 |
171 | @staticmethod
172 | def handle_response(response, clienturl, thresholdtype, threshold, month_date):
173 |
174 | output = []
175 | tCount = -1
176 |
177 | if 'rows' not in response:
178 | return int(tCount), output
179 |
180 | rows = response['rows']
181 | row_format = '{:<20}' + '{:>20}' * 4
182 | for row in rows:
183 | keys = ''
184 |
185 | if 'keys' in row:
186 |
187 | if thresholdtype == 'click':
188 | tcheck = int(row['clicks'])
189 | else:
190 | tcheck = int(row['impressions'])
191 |
192 | if tcheck < int(threshold):
193 | continue
194 |
195 | query = str(row['keys'][0])
196 | page = str(row['keys'][1])
197 |                 entry = {'clientID': clienturl, 'query': query, 'page': page,
198 |                          'clicks': row['clicks'], 'impressions': row['impressions'], 'ctr': row['ctr'],
199 |                          'position': int(row['position']), 'month': str(month_date)}
200 |
201 |                 output.append(entry)
202 | tCount = tcheck
203 |
204 | return int(tCount), output
205 |
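# --- Hypothetical usage sketch (not part of the original file) ---
# Assumes config.py defines DATA_FOLDER / ROW_LIMIT and that api.google.authenticate()
# can return an authorized httplib2.Http client for the Search Console account.
#
#   from api.google_search_console.gsc import GscClient
#
#   gsc = GscClient()  # no client passed, so it falls back to authenticate()
#   df = gsc.get_site_data('https://example.com/', 30,
#                          threshold_type='impression',
#                          threshold=1,
#                          pos_limit=20,
#                          country='usa')
#   print(df.head())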
--------------------------------------------------------------------------------
/AwesomeResources/ML Problem Framing Worksheet.md:
--------------------------------------------------------------------------------
1 | # ML Problem Framing Worksheet
2 |
3 | (This worksheet was transcribed into Markdown from the original provided by Kshitij Gautam. Neil Martinsen-Burrell also helped modify the current doc.)
4 |
5 | ## Exercise 1: Start Clearly and Simply
6 |
7 | **Write what you'd like the machine learned model to do.**
8 |
9 | _We want the machine learned model to..._
10 |
11 |
12 | **Example**: We want the machine learned model to predict how popular a video just
13 | uploaded now will become in the future.
14 |
15 | **Tips**: At this point, the statement can be qualitative, but make sure this
16 | captures your real goal, not an indirect goal.
17 |
18 | ## Exercise 2: Your Ideal Outcome
19 |
20 | **Your ML model is intended to produce some desirable outcome. What is this
21 | outcome, independent of the model itself? Note that this outcome may be quite
22 | different from how you assess the model and its quality.**
23 |
24 | _Our ideal outcome is..._
25 |
26 | **Example**: Our ideal outcome is to only transcode popular videos to minimize
27 | server resource utilization.
28 |
29 | **Example**: Our ideal outcome is to suggest videos that people find useful,
30 | entertaining, and worth their time.
31 |
32 | **Tips**: You don't need to limit yourself to metrics for which your product
33 | has already been optimizing. Instead, try to focus on the larger objective of
34 | your product or service.
35 |
36 | ## Exercise 3: Your Success Metrics
37 |
38 | **Write down your metrics for success and failure with the ML system. The
39 | failure metrics are important. Both metrics should be phrased independently of
40 | the evaluation metrics of the model. Talk about the anticipated outcomes
41 | instead.**
42 |
43 | _Our success metrics are..._
44 |
45 | _Our key results for the success metrics are..._
46 |
47 | _Our ML model is deemed a failure if..._
48 |
49 | **Example**: Our success metrics are CPU resource utilization. Our KR for the
50 | success metric is to achieve a 35% reduced cost for transcoding. Our ML model
51 | is a failure if the CPU resource cost reduction is less than the CPU costs for
52 | training and serving the model.
53 |
54 | **Example**: Our success metrics are the number of popular videos properly
55 | predicted. Our KR for the success metric is to properly predict the top 95% of videos 28
56 | days after being uploaded. Our ML model is a failure if the number of videos
57 | properly predicted is no better than current heuristics.
58 |
59 | **Tips**: Are the metrics measurable? How will you measure them? (It's okay if
60 | this is via a live experiment. Some metrics can't be measured offline.) When
61 | are you able to measure them? (How long will it take to know whether your new
62 | system is a success or failure?) Consider long-term engineering and
63 | maintenance costs. Failure may not only be caused by non-achievement of the
64 | success metric.
65 |
66 | ## Exercise 4: Your Output
67 |
68 | **Write the output that you want your ML model to produce.**
69 |
70 | _The output from our ML model will be..._
71 |
72 | _It is defined as..._
73 |
74 | **Example**: The output from our ML model will be one of the 3 classes of
75 | videos (very popular, somewhat popular, not popular) defined as the top 3, 7,
76 | or 90 percentile of watch time 28 days after uploading.
77 |
78 | **Tips**: The output must be quantifiable with a definition that the model can
79 | produce. Are you able to obtain example outputs to use for training data?
80 | (How and from what source?) Your output examples may need to be engineered
81 | (like above where watch time is turned into a percentile). If it is difficult
82 | to obtain example outputs for training, you may need to reformulate your
83 | problem.
84 |
85 | ## Exercise 5: Using the Output
86 |
87 | **Write when your output must be obtained from the ML model and how it is used
88 | in your product.**
89 |
90 | _The output from the ML model will be made..._
91 |
92 | _The output will be used for..._
93 |
94 | **Example**: The prediction of a video's popularity will be made as soon as a
95 | new video is uploaded. The output will be used for determining the transcoding
96 | output for the video.
97 |
98 | **Tips**: Consider how you will use the model output. Will it be presented to
99 | a user in a UI? Consumed by subsequent business logic? Do you have latency
100 | requirements? The latency of data from remote services might make them
101 | infeasible to use. Remember the Oracle Test: if you always had the correct
102 | answer, how would you use that in your product?
103 |
104 |
105 | ## Exercise 6: Your Heuristics
106 |
107 |
108 | **Write how you would solve the problem if you didn't use ML. What heuristics
109 | might you use?**
110 |
111 | _If we didn't use ML, we would..._
112 |
113 | **Example**: If we didn't use ML, we would assume new videos uploaded by
114 | creators who had uploaded popular videos in the past will become popular
115 | again.
116 |
117 | **Tips**: Think about a scenario where you need to deliver the product
118 | tomorrow and you can only hardcode the business logic. What would you do?
119 |
120 | ## Exercise 7a: Formulate Your Problem as an ML Problem
121 |
122 | **Write down what you think is the best technical solution for your problem.**
123 |
124 | _Our problem is best framed as:_
125 | - _Binary classification_
126 | - _Unidimensional Regression_
127 | - _Multi-class, single-label classification_
128 | - _Multi-class, multi-label classification_
129 | - _Multidimensional regression_
130 | - _Clustering (unsupervised)_
131 | - _Other:_
132 |
133 | _which predicts..._
134 |
135 | **Example**: Our problem is best framed as 3-class, single label
136 | classification which predicts whether a video will be in one of three classes
137 | (very popular, somewhat popular, not popular) 28 days after being uploaded.
138 |
139 | ## Exercise 7b: Cast Your Problem as a Simpler Problem
140 |
141 | **Restate your problem as a binary classification or unidimensional
142 | regression.**
143 |
144 | _Our problem is best framed as:_
145 | - _Binary classification_
146 | - _Unidimensional regression_
147 |
148 | **Example**: We will predict whether uploaded videos will become very popular or
149 | not. OR We will predict how popular an uploaded video will be in terms of the
150 | number of views it will receive in a 28 day window.
151 |
152 | ## Exercise 8: Design your Data for the Model
153 |
154 | **Write the data you want the ML model to use to make the predictions.**
155 |
156 | _Input 1:_
157 |
158 | _Input 2:_
159 |
160 | _Input 3:_
161 |
162 | **Example**: Input 1: Title, Input 2: Uploader, Input 3: Upload time, Input 4:
163 | Uploader's recent videos
164 |
165 | **Tips**: Only include information available at the time the prediction is
166 | made. Each input can be a number or a list of numbers or strings. If your
167 | input has a different structure, consider whether that is the best representation for
168 | your data. (Split a list into two separate inputs? Flatten nested structures?)
169 |
170 | ## Exercise 9: Where the Data Comes From
171 |
172 | **Write down where each input comes from. Assess how much work it will be to
173 | develop a data pipeline to construct each column for one row.**
174 |
175 | _Input 1:_
176 |
177 | _Input 2:_
178 |
179 | _Input 3:_
180 |
181 | **Example**: Input 1: Title, part of VideoUploadEvent record, Input 2:
182 | Uploader, same, Input 3: Upload time, same, Input 4: Recent videos, list from
183 | a separate system.
184 |
185 | **Tips**: When does the example output become available for training purposes?
186 | Make sure all your inputs are available at serving time in exactly the format
187 | you specified.
188 |
189 | ## Exercise 10: Easily Obtained Inputs
190 |
191 | **Among the inputs you listed in Exercise 8, pick 1-3 that are easy to obtain
192 | and would produce a reasonable initial outcome.**
193 |
194 | _Input 1:_
195 |
196 | _Input 2:_
197 |
198 | **Tips**: For your heuristics, what inputs would be useful for those
199 | heuristics? Focus on inputs that can be obtained from a single system with a
200 | simple pipeline. Start with the minimum possible infrastructure.
201 |
--------------------------------------------------------------------------------
/Models/pytorch/SentenceVAE.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.utils.rnn as rnn_utils
4 | from utils import to_var
5 |
6 | class SentenceVAE(nn.Module):
7 |
8 | def __init__(self, vocab_size, embedding_size, rnn_type, hidden_size, word_dropout, embedding_dropout, latent_size,
9 | sos_idx, eos_idx, pad_idx, unk_idx, max_sequence_length, num_layers=1, bidirectional=False):
10 |
11 | super().__init__()
12 | self.tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
13 |
14 | self.max_sequence_length = max_sequence_length
15 | self.sos_idx = sos_idx
16 | self.eos_idx = eos_idx
17 | self.pad_idx = pad_idx
18 | self.unk_idx = unk_idx
19 |
20 | self.latent_size = latent_size
21 |
22 | self.rnn_type = rnn_type
23 | self.bidirectional = bidirectional
24 | self.num_layers = num_layers
25 | self.hidden_size = hidden_size
26 |
27 | self.embedding = nn.Embedding(vocab_size, embedding_size)
28 | self.word_dropout_rate = word_dropout
29 | self.embedding_dropout = nn.Dropout(p=embedding_dropout)
30 |
31 | if rnn_type == 'rnn':
32 | rnn = nn.RNN
33 | elif rnn_type == 'gru':
34 | rnn = nn.GRU
35 | # elif rnn_type == 'lstm':
36 | # rnn = nn.LSTM
37 | else:
38 |             raise ValueError("Unsupported rnn_type: '{}'".format(rnn_type))
39 |
40 | self.encoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
41 | self.decoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True)
42 |
43 | self.hidden_factor = (2 if bidirectional else 1) * num_layers
44 |
45 | self.hidden2mean = nn.Linear(hidden_size * self.hidden_factor, latent_size)
46 | self.hidden2logv = nn.Linear(hidden_size * self.hidden_factor, latent_size)
47 | self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor)
48 | self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
49 |
50 | def forward(self, input_sequence, length):
51 |
52 | batch_size = input_sequence.size(0)
53 | sorted_lengths, sorted_idx = torch.sort(length, descending=True)
54 | input_sequence = input_sequence[sorted_idx]
55 |
56 | # ENCODER
57 | input_embedding = self.embedding(input_sequence)
58 |
59 | packed_input = rnn_utils.pack_padded_sequence(input_embedding, sorted_lengths.data.tolist(), batch_first=True)
60 |
61 | _, hidden = self.encoder_rnn(packed_input)
62 |
63 | if self.bidirectional or self.num_layers > 1:
64 | # flatten hidden state
65 | hidden = hidden.view(batch_size, self.hidden_size*self.hidden_factor)
66 | else:
67 | hidden = hidden.squeeze()
68 |
69 | # REPARAMETERIZATION
70 | mean = self.hidden2mean(hidden)
71 | logv = self.hidden2logv(hidden)
72 | std = torch.exp(0.5 * logv)
73 |
74 | z = to_var(torch.randn([batch_size, self.latent_size]))
75 | z = z * std + mean
76 |
77 | # DECODER
78 | hidden = self.latent2hidden(z)
79 |
80 | if self.bidirectional or self.num_layers > 1:
81 | # unflatten hidden state
82 | hidden = hidden.view(self.hidden_factor, batch_size, self.hidden_size)
83 | else:
84 | hidden = hidden.unsqueeze(0)
85 |
86 | # decoder input
87 | if self.word_dropout_rate > 0:
88 |             # randomly replace decoder input with <unk> tokens