├── data_musk
│   └── snapshot
│       └── snapshot_musk_100.h5
├── data_harvey
│   └── snapshot
│       └── snapshot_harvey_250.h5
├── .gitignore
├── modules
│   ├── ng_functions.py
│   ├── nlp_functions.py
│   └── tpqdna.py
├── README.md
├── setup_dna_nlp.sh
├── code
│   ├── 04_harvey
│   │   ├── 04_harvey_01_data.ipynb
│   │   ├── 04_harvey_04_ng.ipynb
│   │   └── 04_harvey_03_oie.ipynb
│   ├── 03_musk
│   │   ├── 03_musk_04_ng.ipynb
│   │   ├── 03_musk_03_oie.ipynb
│   │   └── 03_musk_01_data.ipynb
│   └── 02_nlp
│       └── 02_nlp_openie.ipynb
└── data
    └── eikon_eod_tsla_data.csv

/data_musk/snapshot/snapshot_musk_100.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/dnanlp/master/data_musk/snapshot/snapshot_musk_100.h5 -------------------------------------------------------------------------------- /data_harvey/snapshot/snapshot_harvey_250.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/dnanlp/master/data_harvey/snapshot/snapshot_harvey_250.h5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Specifics 2 | *.swp 3 | _build/ 4 | *.pkl 5 | *.txt 6 | dna_api_key.h5 7 | data_harvey/results/ 8 | data_harvey/tokens/ 9 | data_musk/results/ 10 | data_musk/tokens/ 11 | *.avro 12 | help/ 13 | *.ipynb~ 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | -------------------------------------------------------------------------------- /modules/ng_functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Network Graph Helper Functions 3 | # 4 | # The Python Quants GmbH 5 | # 6 | import pandas as pd 7 | import networkx as nx 8 | from pyvis.network import Network 9 | 10 | def create_graph(data, labels=False): 11 | '''Create a NetworkX graph object from a pandas DataFrame. 12 | ''' 13 | G = nx.DiGraph() 14 | 15 | vals = data[['Node1', 'Relation', 'Node2']].values 16 | G.add_edges_from([(v[0], v[2], {'relation': v[1]}) for v in vals]) 17 | 18 | if labels: 19 | vals = data[['Label1', 'Node1', 'Label2', 'Node2']].values 20 | for v in vals: 21 | G.node[v[1]]['type'] = v[0] 22 | G.node[v[3]]['type'] = v[2] 23 | return G 24 | 25 | def plot_graph(graph, background_color='white', 26 | font_color='grey', with_edge_label=True, 27 | central_gravity=2.0, solver='', 28 | height='750px', width='100%', filter_=['']): 29 | ''' Creates a pyvis interactive Network Graph from a 30 | NetworkX graph object. 
31 | ''' 32 | G = Network(notebook=True, height=height, width=width, 33 | bgcolor=background_color, font_color=font_color) 34 | 35 | color = {0:'#fb217f', 1:'#fb217f', 2:'#88b1fb', 3:'#88b1fb', 4:'#88b1fb'} 36 | deg = dict(graph.in_degree()) 37 | 38 | for node in graph: 39 | md = max(deg.values()) 40 | color_id = min(deg[node], 4) 41 | G.add_node(node, title=node, label=node, 42 | size=(md - deg[node] + 1) * 4, 43 | color=color[color_id]) 44 | 45 | for edge in graph.edges(): 46 | if with_edge_label: 47 | label = graph.get_edge_data(edge[0], edge[1])['relation'] 48 | else: 49 | label='' 50 | G.add_edge(edge[0], edge[1], label=label) 51 | if solver == 'barnes_hut': 52 | G.barnes_hut(central_gravity=central_gravity) 53 | else: 54 | G.force_atlas_2based(central_gravity=central_gravity) 55 | G.show_buttons(filter_=filter_) 56 | return G 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unlocking the Hidden Potential of Unstructured News Data with NLP 2 | 3 | This repository provides Python codes and Jupyter Notebooks for the Dow Jones applied research paper "Unlocking the Hidden Potential of Unstructured News Data with NLP — Understanding Advanced Analytics through Real-World Case Studies". 4 | 5 | 6 | 7 | ## Applied Research Paper Download 8 | 9 | To **download** the PDF version of the paper, visit http://go.dowjones.com/dna-research-paper. 10 | 11 | ## Setup and Installation 12 | 13 | The instructions that follow assume that you run a **Docker container** or a **cloud instance** with the latest version of Ubuntu (18.10 at the time of this writing). 14 | 15 | The execution of (parts of) the codes and Jupyter Notebooks requires enough **compute and memory resources**. Overall, it is recommended to have at least **four CPU cores and 16GB of RAM** available. The introductory examples can be executed with fewer resources. 16 | 17 | ### Cloud Instance 18 | 19 | The following assumes that you have set up a **cloud instance** (e.g. on DigitalOcean) and have used `ssh` to log in as `root`. You can then execute on the shell: 20 | 21 | cd /root 22 | wget http://hilpisch.com/nlp/setup_dna_nlp.sh 23 | bash setup_dna_nlp.sh 24 | 25 | Follow the **instructions** of the script and e.g. provide a password for the Jupyter Notebook server. 26 | 27 | After the installation, you can access the **Jupyter Notebook server** under 28 | 29 | http://CLOUD_IP_ADDRESS:9999 30 | 31 | with your chosen password. Navigate via Jupyter to the code folder and open a notebook to get started. 32 | 33 | ### Docker Container 34 | 35 | Alternatively, you can start a **Docker container** locally (with enough resources allocated). To do so e.g. execute on the shell: 36 | 37 | docker run -ti -h dnanlp -p 9999:9999 ubuntu:latest /bin/bash 38 | 39 | Make sure that the container has **enough resources** allocated (e.g. via editing your Docker preferences). Then on the shell of the Docker container execute the following: 40 | 41 | cd root 42 | apt-get update 43 | apt-get upgrade -y 44 | apt-get install -y wget 45 | wget http://hilpisch.com/nlp/setup_dna_nlp.sh 46 | bash setup_dna_nlp.sh 47 | 48 | Then follow the **instructions** of the script to e.g. provide a password for the Jupyter Notebook server. 49 | 50 | After the installation, you can access the **Jupyter Notebook server** under 51 | 52 | http://localhost:9999 53 | 54 | with your chosen password.
Navigate via Jupyter to the code folder and open a notebook to get started. 55 | 56 | ## Security Risks and Disclaimer 57 | 58 | The approach chosen to run the Jupyter Notebook server is for **illustration purposes** only. There are no security measures configured beyond password protection. For example, there is no SSL encryption configured. In addition, the Jupyter Notebook server is run as `root`. As a consequence, a number of **security risks** result from the approach chosen. 59 | 60 | All codes and Jupyter notebooks come with **no representations or warranties**, to the extent permitted by applicable law. 61 | 62 | This repository with all its scripts, codes and Jupyter notebooks is for **illustration purposes** only. 63 | 64 | ## Company Information 65 | 66 | © Dr. Yves J. Hilpisch \| The Python Quants GmbH 67 | 68 | http://tpq.io \| team@tpq.io \| 69 | http://twitter.com/dyjh \| http://pqp.io 70 | 71 | **Python for Finance & Algorithmic Trading online trainings** \| http://training.tpq.io 72 | 73 | **University Certificate Program in Python for Algorithmic Trading** \| http://certificate.tpq.io 74 | 75 | 76 | -------------------------------------------------------------------------------- /modules/nlp_functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # NLP Helper Functions 3 | # 4 | # The Python Quants GmbH 5 | # 6 | import re 7 | import nltk 8 | import string 9 | import pandas as pd 10 | from pylab import plt 11 | from wordcloud import WordCloud 12 | from nltk.corpus import stopwords 13 | from nltk.corpus import wordnet as wn 14 | from lxml.html.clean import Cleaner 15 | from sklearn.feature_extraction.text import TfidfVectorizer 16 | plt.style.use('seaborn') 17 | 18 | cleaner = Cleaner(style=True, links=True, allow_tags=[''], 19 | remove_unknown_tags=False) 20 | 21 | stop_words = stopwords.words('english') 22 | stop_words.extend(['new', 'old', 'pro', 'open', 'menu', 'close']) 23 | 24 | 25 | def remove_non_ascii(s): 26 | ''' Removes all non-ascii characters. 27 | ''' 28 | return ''.join(i for i in s if ord(i) < 128) 29 | 30 | def clean_up_html(t): 31 | t = cleaner.clean_html(t) 32 | t = re.sub('[\n\t\r]', ' ', t) 33 | t = re.sub(' +', ' ', t) 34 | t = re.sub('<.*?>', '', t) 35 | t = remove_non_ascii(t) 36 | return t 37 | 38 | def clean_up_text(t, numbers=False, punctuation=False): 39 | ''' Cleans up a text, e.g. HTML document, 40 | from HTML tags and also cleans up the 41 | text body. 42 | ''' 43 | try: 44 | t = clean_up_html(t) 45 | except: 46 | pass 47 | t = t.lower() 48 | t = re.sub(r"what's", "what is ", t) 49 | t = t.replace('(ap)', '') 50 | t = re.sub(r"\'ve", " have ", t) 51 | t = re.sub(r"can't", "cannot ", t) 52 | t = re.sub(r"n't", " not ", t) 53 | t = re.sub(r"i'm", "i am ", t) 54 | t = re.sub(r"\'s", "", t) 55 | t = re.sub(r"\'re", " are ", t) 56 | t = re.sub(r"\'d", " would ", t) 57 | t = re.sub(r"\'ll", " will ", t) 58 | t = re.sub(r'\s+', ' ', t) 59 | t = re.sub(r"\\", "", t) 60 | t = re.sub(r"\'", "", t) 61 | t = re.sub(r"\"", "", t) 62 | if numbers: 63 | t = re.sub('[^a-zA-Z ?!]+', '', t) 64 | if punctuation: 65 | t = re.sub(r'\W+', ' ', t) 66 | t = remove_non_ascii(t) 67 | t = t.strip() 68 | return t 69 | 70 | def nltk_lemma(word): 71 | ''' If one exists, returns the lemma of a word. 72 | I.e. the base or dictionary version of it. 
73 | ''' 74 | lemma = wn.morphy(word) 75 | if lemma is None: 76 | return word 77 | else: 78 | return lemma 79 | 80 | def tokenize(text, min_char=3, lemma=True, stop=True, 81 | numbers=False): 82 | ''' Tokenizes a text and implements some 83 | transformations. 84 | ''' 85 | tokens = nltk.word_tokenize(text) 86 | tokens = [t for t in tokens if len(t) >= min_char] 87 | if numbers: 88 | tokens = [t for t in tokens if t[0].lower() 89 | in string.ascii_lowercase] 90 | if stop: 91 | tokens = [t for t in tokens if t not in stop_words] 92 | if lemma: 93 | tokens = [nltk_lemma(t) for t in tokens] 94 | return tokens 95 | 96 | def generate_word_cloud(text, no, name=None): 97 | ''' Generates a word cloud bitmap given a 98 | text document (string). 99 | It uses the Term Frequency (TF) and 100 | Inverse Document Frequency (IDF) 101 | vectorization approach to derive the 102 | importance of a word -- represented 103 | by the size of the word in the word cloud. 104 | 105 | Parameters 106 | ========== 107 | text: str 108 | text as the basis 109 | no: int 110 | number of words to be included 111 | ''' 112 | tokens = tokenize(text) 113 | vec = TfidfVectorizer(min_df=2, 114 | analyzer='word', 115 | ngram_range=(1, 2), 116 | stop_words='english' 117 | ) 118 | vec.fit_transform(tokens) 119 | wc = pd.DataFrame({'words': vec.get_feature_names(), 120 | 'tfidf': vec.idf_}) 121 | words = ' '.join(wc.sort_values('tfidf', ascending=True)['words'].head(no)) 122 | wordcloud = WordCloud(max_font_size=110, 123 | background_color='white', 124 | width=1024, height=768, 125 | margin=10, max_words=150).generate(words) 126 | plt.figure(figsize=(10, 10)) 127 | plt.imshow(wordcloud, interpolation='bilinear') 128 | plt.axis('off') 129 | plt.show() 130 | if name is not None: 131 | plt.imsave(name, wordcloud) 132 | -------------------------------------------------------------------------------- /setup_dna_nlp.sh: -------------------------------------------------------------------------------- 1 | # Script to Install 2 | # Linux System Tools and 3 | # Basic Python Components 4 | # as well as to 5 | # Start Jupyter Notebook Server 6 | # 7 | # Python for Algorithmic Trading 8 | # (c) Dr. Yves J. Hilpisch 9 | # The Python Quants GmbH 10 | # 11 | # GENERAL LINUX 12 | printf "Installing system tools.\n\n" 13 | apt-get update # updates the package index cache 14 | apt-get upgrade -y # updates packages 15 | # installs system tools 16 | apt-get install -y git screen htop wget vim bzip2 17 | apt-get install -y build-essential gcc zip default-jre 18 | apt-get install -y poppler-utils # pdf file conversion 19 | apt-get upgrade -y bash # upgrades bash if necessary 20 | 21 | printf "Cleaning up package index cache.\n\n" 22 | apt-get clean # cleans up the package index cache 23 | 24 | # INSTALLING MINICONDA 25 | printf "Installing Miniconda.\n\n" 26 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O \ 27 | Miniconda.sh 28 | bash Miniconda.sh -b # installs Miniconda 29 | rm Miniconda.sh # removes the installer 30 | # prepends the new path for current session 31 | export PATH="/root/miniconda3/bin:$PATH" 32 | # prepends the new path in the shell configuration 33 | echo ". 
/root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc 34 | echo "conda activate" >> ~/.bashrc 35 | 36 | printf "Updating miniconda\n\n" 37 | conda update -y conda 38 | 39 | # INSTALLING PYTHON PACKAGES 40 | printf "Installing Python packages.\n\n" 41 | conda install -y jupyter # Python coding in the browser 42 | conda install -y pytables # HDF5 database wrapper 43 | conda install -y pandas # data analysis package 44 | conda install -y matplotlib # plotting package 45 | conda install -y scikit-learn # machine learning package 46 | conda install -y nltk=3.2.5 # nlp package 47 | conda install -y gensim # nlp package 48 | conda install -y networkx # network graph 49 | conda install -y lxml # xml/html parsing 50 | 51 | pip install --upgrade pip 52 | pip install Cython 53 | pip install cufflinks # combining plotly with pandas 54 | pip install wordcloud 55 | pip install pyvis 56 | 57 | # NLTK PACKAGES 58 | python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')" 59 | python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('wordnet')" 60 | 61 | # INSTALLING APACHE'S AVRO PACKAGE 62 | printf "Installing avro package.\n" 63 | wget http://mirror.synyx.de/apache/avro/stable/py3/avro-python3-1.8.2.tar.gz 64 | tar xvf avro-python3-1.8.2.tar.gz 65 | cd avro-python3-1.8.2 66 | python setup.py install 67 | cd .. 68 | rm avro-python3-1.8.2.tar.gz 69 | rm -rf avro-python3-1.8.2 70 | 71 | # COPYING FILES AND CREATING DIRECTORIES 72 | mkdir /root/.jupyter 73 | mkdir /root/.jupyter/custom 74 | 75 | cd /root/.jupyter 76 | wget -q http://hilpisch.com/nlp/jupyter_setup.py 77 | printf "Please provide a new password for your Jupyter server.\n" 78 | printf "New password [ENTER]: " 79 | read -s password 80 | printf "\n" 81 | 82 | printf "Repeat password [ENTER]: " 83 | read -s rep_password 84 | printf "\n" 85 | 86 | while [ "$password" = "" -o "$password" != "$rep_password" ] 87 | do 88 | printf "The passwords are empty or not equal, please try again!\n" 89 | printf "New password [ENTER]: " 90 | read -s password 91 | printf "\n" 92 | 93 | printf "Repeat password [ENTER]: " 94 | read -s rep_password 95 | printf "\n" 96 | done 97 | 98 | JUPYTER_URL=$(python jupyter_setup.py $password) 99 | 100 | mkdir /root/notebook 101 | cd /root/notebook 102 | 103 | # CLONING THE REPO 104 | printf "Cloning the DNA NLP Git repository.\n" 105 | git clone --depth=1 http://github.com/yhilpisch/dnanlp 106 | 107 | printf "Downloading additional files.\n" 108 | cd /root/notebook/dnanlp/modules 109 | wget -q http://hilpisch.com/nlp/soiepy.zip 110 | unzip soiepy.zip 111 | rm soiepy.zip 112 | 113 | cd /root/notebook/ 114 | printf "Success.\n" 115 | 116 | # CREATE A SWAP PARTITION 117 | # comment out these lines if not required 118 | wget -q http://hilpisch.com/nlp/create_swap.sh 119 | /bin/bash /root/notebook/create_swap.sh 120 | rm /root/notebook/create_swap.sh 121 | 122 | # STARTING JUPYTER NOTEBOOK 123 | wget -q http://hilpisch.com/nlp/custom.css 124 | mv custom.css /root/.jupyter/custom/custom.css 125 | mkdir logs 126 | touch logs/jupyter.log 127 | nohup jupyter notebook --allow-root > logs/jupyter.log & 128 | 129 | printf "\n\n" 130 | printf "Your Jupyter Server is running. 
To access it, please visit:\n\n" 131 | printf "$JUPYTER_URL\n\n" 132 | -------------------------------------------------------------------------------- /modules/tpqdna.py: -------------------------------------------------------------------------------- 1 | # 2 | # Wrapper Functions for the 3 | # Dow Jones DNA Snapshot API 4 | # 5 | # The Python Quants GmbH 6 | # 7 | import os 8 | import json 9 | import time 10 | import requests 11 | import avro.schema 12 | import pandas as pd 13 | from avro.io import DatumReader 14 | from avro.datafile import DataFileReader 15 | 16 | # snapshot planning 17 | explain_url = 'https://api.dowjones.com/alpha/extractions/documents/_explain' 18 | 19 | # analytics end point 20 | analytics_url = 'https://api.dowjones.com/alpha/analytics' 21 | 22 | # snapshot creation 23 | snapshot_create_url = 'https://api.dowjones.com/alpha/extractions/documents/' 24 | 25 | # snapshot extraction & download list 26 | snapshot_extraction_list_url = 'https://api.dowjones.com/alpha/extractions/' 27 | 28 | djdna_avro_schema = { 29 | "type": "record", 30 | "name": "Delivery", 31 | "namespace": "com.dowjones.dna.avro", 32 | "doc": 33 | "Avro schema for extraction content used by Dow Jones' SyndicationHub", 34 | "fields": [ 35 | {"name": "an", "type": ["string", "null"]}, 36 | {"name": "modification_datetime", "type": ["long", "null"]}, 37 | {"name": "ingestion_datetime", "type": ["long", "null"]}, 38 | {"name": "publication_date", "type": ["long", "null"]}, 39 | {"name": "publication_datetime", "type": ["long", "null"]}, 40 | {"name": "snippet", "type": ["string", "null"]}, 41 | {"name": "body", "type": ["string", "null"]}, 42 | {"name": "art", "type": ["string", "null"]}, 43 | {"name": "action", "type": ["string", "null"]}, 44 | {"name": "credit", "type": ["string", "null"]}, 45 | {"name": "byline", "type": ["string", "null"]}, 46 | {"name": "document_type", "type": ["string", "null"]}, 47 | {"name": "language_code", "type": ["string", "null"]}, 48 | {"name": "title", "type": ["string", "null"]}, 49 | {"name": "copyright", "type": ["string", "null"]}, 50 | {"name": "dateline", "type": ["string", "null"]}, 51 | {"name": "source_code", "type": ["string", "null"]}, 52 | {"name": "modification_date", "type": ["long", "null"]}, 53 | {"name": "section", "type": ["string", "null"]}, 54 | {"name": "company_codes", "type": ["string", "null"]}, 55 | {"name": "publisher_name", "type": ["string", "null"]}, 56 | {"name": "region_of_origin", "type": ["string", "null"]}, 57 | {"name": "word_count", "type": ["int", "null"]}, 58 | {"name": "subject_codes", "type": ["string", "null"]}, 59 | {"name": "region_codes", "type": ["string", "null"]}, 60 | {"name": "industry_codes", "type": ["string", "null"]}, 61 | {"name": "person_codes", "type": ["string", "null"]}, 62 | {"name": "currency_codes", "type": ["string", "null"]}, 63 | {"name": "market_index_codes", "type": ["string", "null"]}, 64 | {"name": "company_codes_about", "type": ["string", "null"]}, 65 | {"name": "company_codes_association", "type": ["string", "null"]}, 66 | {"name": "company_codes_lineage", "type": ["string", "null"]}, 67 | {"name": "company_codes_occur", "type": ["string", "null"]}, 68 | {"name": "company_codes_relevance", "type": ["string", "null"]}, 69 | {"name": "source_name", "type": ["string", "null"]} 70 | ] 71 | } 72 | 73 | 74 | def create_snapshot(query, headers): 75 | ''' Specifies a DNA snapshot. 
76 | ''' 77 | response = requests.request( 78 | 'POST', snapshot_create_url, data=query, headers=headers) 79 | response = response.json() 80 | print(response) 81 | snapshot_create_job_url = response['links']['self'] 82 | # job_status = response['data']['attributes']['current_state'] 83 | return snapshot_create_job_url 84 | 85 | 86 | def run_snapshot(snapshot_url, headers): 87 | ''' Runs the specified DNA snapshot process. 88 | ''' 89 | old_status = '' 90 | job_status = '' 91 | response = '' 92 | while job_status != 'JOB_STATE_DONE': 93 | if job_status != old_status: 94 | print('Job status changed:') 95 | print(job_status) 96 | if job_status == 'JOB_STATE_FAILED': 97 | print('Job failed') 98 | print(response) 99 | break 100 | old_status = job_status 101 | 102 | time.sleep(60) 103 | response = requests.request('GET', snapshot_url, headers=headers) 104 | response = response.json() 105 | job_status = response['data']['attributes']['current_state'] 106 | 107 | snapshot_files_list = list(response['data']['attributes']['files']) 108 | return snapshot_files_list 109 | 110 | 111 | def download_snapshots(snapshot_files, path, headers, verbose=True): 112 | ''' Downloads DNA snapshot data file-by-file given the files list. 113 | ''' 114 | for download_file in snapshot_files: 115 | url = download_file['uri'] 116 | if url[-5:] != '.avro': 117 | continue 118 | filename = url.split('/')[-1] 119 | if verbose: 120 | print('Downloading file {} \r'.format(filename), end='') 121 | download = requests.get(url, headers=headers, 122 | allow_redirects=True, stream=True) 123 | filename = os.path.join(path, filename) 124 | with open(filename, 'wb') as fd: 125 | for chunk in download.iter_content(chunk_size=128): 126 | fd.write(chunk) 127 | 128 | 129 | def avro2dataframe(path, verbose=False): 130 | ''' Transforms DNA snapshot data in a pandas DataFrame object. 
131 | ''' 132 | read_schema = avro.schema.Parse(json.dumps(djdna_avro_schema)) 133 | file_content = list() 134 | files = sorted(os.listdir(path)) 135 | for avro_file in files: 136 | if (os.path.isfile(os.path.join(path, avro_file)) and 137 | avro_file.split('.')[-1] == 'avro'): 138 | if verbose: 139 | print('Reading file {} \r'.format(avro_file), end='') 140 | file_path = os.path.join(path, avro_file) 141 | reader = DataFileReader( 142 | open(file_path, 'rb'), DatumReader(read_schema)) 143 | # new_schema = reader.GetMeta('avro.schema') 144 | users = [] 145 | for user in reader: 146 | users.append(user) 147 | file_content.append(users) 148 | reader.close() 149 | data = [pd.DataFrame(content) for content in file_content] 150 | data = pd.concat(data, ignore_index=True) 151 | return data 152 | -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_01_data.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Hurricane Harvey._\n", "\n", "**Data Retrieval**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import json\n", "import nltk\n", "import pickle\n", "import tpqdna\n", "import warnings\n", "warnings.simplefilter('ignore')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Creation"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Authentication"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["api_key = pickle.load(open('../dna_api_key.pkl', 'rb'))\n", "headers = {\n", " 'user-key': api_key,\n", " 'content-type': 'application/json',\n", " 'cache-control': 'no-cache'\n", "}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Specification"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["where = '(body like \"%Hurricane Harvey%\") AND language_code=\"en\" '\n", "where += 'AND language_code=\"en\" '\n", "where += 'AND publication_date >= \"2017-08-01 00:00:00\" '\n", "where += 'AND publication_date <= \"2017-12-31 00:00:00\" '"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["includes = {} \n", "excludes = {}\n", "limit = 250"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["query = {'query': \n", " {'where': where,\n", " 'includes': includes,\n", " 'exludes': excludes,\n", " 'limit': limit\n", " }}"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["query = json.dumps(query)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'data': {'attributes': {'current_state': 'JOB_QUEUED', 'extraction_type': 'documents'}, 'id': 'dj-synhub-extraction-feccd780582a0af8b40e86439b3ee921-hdlz7k82ki', 'type': 'snapshot'}, 'links': {'self': 
'https://api.dowjones.com/alpha/extractions/documents/dj-synhub-extraction-feccd780582a0af8b40e86439b3ee921-hdlz7k82ki'}}\n", "CPU times: user 40 ms, sys: 4 ms, total: 44 ms\n", "Wall time: 16.1 s\n"]}], "source": ["%time qurl = tpqdna.create_snapshot(query, headers)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Job status changed:\n", "JOB_QUEUED\n", "Job status changed:\n", "JOB_VALIDATING\n", "Job status changed:\n", "JOB_STATE_RUNNING\n", "CPU times: user 2.8 s, sys: 224 ms, total: 3.02 s\n", "Wall time: 1h 45min 15s\n"]}], "source": ["%time fl = tpqdna.run_snapshot(qurl, headers)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Paths"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["project = 'harvey_{}'.format(limit)"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["base_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(base_path, 'data_harvey')\n", "if not os.path.isdir(data_path):\n", " os.mkdir(data_path)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["meta_path = os.path.join(data_path, 'meta')\n", "if not os.path.isdir(meta_path):\n", " os.mkdir(meta_path)\n", "fn = os.path.join(meta_path, 'file_list_{}.pkl'.format(project))"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["# with open(fn, 'wb') as f:\n", "# pickle.dump(fl, f)"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["with open(fn, 'rb') as f:\n", " fl = pickle.load(f)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Retrieval"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot')\n", "if not os.path.isdir(snapshot_path):\n", " os.mkdir(snapshot_path)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 1.1 s, sys: 68 ms, total: 1.17 s\n", "Wall time: 31.6 s\n"]}], "source": ["%time tpqdna.download_snapshots(fl, snapshot_path, headers)"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 208 ms, sys: 4 ms, total: 212 ms\n", "Wall time: 216 ms\n"]}], "source": ["%time data = tpqdna.avro2dataframe(snapshot_path)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "RangeIndex: 250 entries, 0 to 249\n", "Data columns (total 35 columns):\n", "action 250 non-null object\n", "an 250 non-null object\n", "art 250 non-null object\n", "body 250 non-null object\n", "byline 250 non-null object\n", "company_codes 250 non-null object\n", "company_codes_about 250 non-null object\n", "company_codes_association 250 non-null object\n", "company_codes_lineage 250 non-null object\n", "company_codes_occur 250 non-null object\n", "company_codes_relevance 250 non-null object\n", "copyright 250 non-null object\n", "credit 250 non-null object\n", "currency_codes 250 non-null object\n", "dateline 10 non-null object\n", "document_type 250 non-null object\n", "industry_codes 250 non-null object\n", "ingestion_datetime 250 non-null int64\n", "language_code 250 non-null 
object\n", "market_index_codes 250 non-null object\n", "modification_date 0 non-null object\n", "modification_datetime 250 non-null int64\n", "person_codes 250 non-null object\n", "publication_date 250 non-null int64\n", "publication_datetime 250 non-null int64\n", "publisher_name 250 non-null object\n", "region_codes 250 non-null object\n", "region_of_origin 250 non-null object\n", "section 250 non-null object\n", "snippet 250 non-null object\n", "source_code 250 non-null object\n", "source_name 250 non-null object\n", "subject_codes 250 non-null object\n", "title 250 non-null object\n", "word_count 250 non-null int64\n", "dtypes: int64(5), object(30)\n", "memory usage: 68.4+ KB\n"]}], "source": ["data.info()"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fn, 'data', complevel=5, complib='blosc')"]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/03_musk/03_musk_04_ng.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Network Graph Analysis**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import pandas as pd\n", "import ng_functions as ng\n", "import nlp_functions as nlp"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Reading the Data"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["project = 'musk_100'"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["abs_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(abs_path, 'data_musk')"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["results_path = os.path.join(data_path, 'results')"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["fnr = os.path.join(results_path, 'relations_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["data = pd.read_hdf(fnr, 'data')"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Node1RelationNode2
11muskwrote inoct. 4 tweet
13entrepreneurhatesellers
21optionbet onstock decline
24waysbet onfall
25tesla short interestapproach40 million shares
\n", "
"], "text/plain": [" Node1 Relation Node2\n", "11 musk wrote in oct. 4 tweet\n", "13 entrepreneur hate sellers\n", "21 option bet on stock decline\n", "24 ways bet on fall\n", "25 tesla short interest approach 40 million shares"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["data.head()"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["1887"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Network Graph"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Full Graph"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(data)"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["# G.show('ng_musk_01.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Focused Graph"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["entities = ['musk', 'sec', 'tesla', 'tweet']"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["sel = data[data['Node1'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(sel)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", " \n", " "], "text/plain": [""]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["G.show('ng_musk_02.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Focused Graph"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["entities = ['sec', 'settlement']"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["sel_1 = data[data['Node1'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": ["sel_2 = data[data['Node2'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": ["sel = pd.concat((sel_1, sel_2), ignore_index=True)"]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": ["sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": ["sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()"]}, {"cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(sel)"]}, {"cell_type": "code", "execution_count": 28, 
"metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 29, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", " \n", " "], "text/plain": [""]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["G.show('ng_musk_03.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/03_musk/03_musk_03_oie.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Information Extraction**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import nltk\n", "import pandas as pd\n", "import soiepy.main as ie\n", "import nlp_functions as nlp"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Data"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["project = 'musk_100'"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["abs_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(abs_path, 'data_musk')"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot')"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["raw = pd.read_hdf(fn, 'data')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Preprocessing"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 246 ms, sys: 7.59 ms, total: 253 ms\n", "Wall time: 252 ms\n"]}], "source": ["%time raw['body'] = raw['body'].apply(nlp.clean_up_text) "]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["data = raw['body'].values.tolist() "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 200 ms, sys: 0 ns, total: 200 ms\n", "Wall time: 199 ms\n"]}], "source": ["%%time\n", "s = [nltk.sent_tokenize(a) for a in data] \n", "s = [_ for sl in s for _ in sl] "]}, {"cell_type": "code", 
"execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["['after six years of reflection, he returned to the subject.',\n", " 'the last several years have taught me that they are indeed reasonably maligned, musk wrote in an oct. 4 tweet.']"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["s[:2]"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["token_path = os.path.join(data_path, 'tokens') \n", "if not os.path.isdir(token_path):\n", " os.mkdir(token_path)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(token_path, 'tokens_{}_{}.txt') "]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["steps = 250\n", "for c, i in enumerate(range(0, len(s), steps)):\n", " with open(fn.format(project, c), 'w') as f:\n", " f.writelines([_ + '\\n' for _ in s[i:i + steps - 1]]) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Relations"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["results_path = os.path.join(data_path, 'results') \n", "if not os.path.isdir(results_path):\n", " os.mkdir(results_path)\n", "fnr = os.path.join(results_path, 'relations_{}.h5'.format(project)) "]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["fl = sorted(os.listdir(token_path))\n", "d = pd.DataFrame()\n", "fno = len(fl)"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 18.7 ms, sys: 292 \u00b5s, total: 19 ms\n", "Wall time: 17.2 ms\n"]}], "source": ["%%time\n", "try:\n", " d = pd.read_hdf(fnr, 'raw') \n", "except:\n", " for i, fn in enumerate(fl):\n", " filename = os.path.join(token_path, fn)\n", " msg = 'Processing file {} of {} \\r'\n", " print(msg.format(i + 1, fno), end='')\n", " r = ie.stanford_ie(filename, verbose=False) \n", " dt = pd.DataFrame(r)\n", " if len(d) == 0:\n", " d = dt\n", " else:\n", " d = pd.concat((d, dt), ignore_index=True)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": ["d = d.iloc[:, :3]"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": ["d.columns = ['Node1', 'Relation', 'Node2']"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [{"data": {"text/plain": ["11385"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["len(d)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Post Processing"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["data = d.copy()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Basic Processing"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": ["data = data.applymap(lambda s: s.strip()) "]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": ["data = data[data.applymap(lambda s: not s in nlp.stop_words)].dropna() "]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": ["data = data[data.applymap(lambda s: not s.startswith('http'))].dropna() "]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": ["data = data.applymap(lambda s: nlp.nltk_lemma(s)) "]}, {"cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [{"data": 
{"text/plain": ["7935"]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Removing Duplicates"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": ["def join_columns(row):\n", " return ' '.join([row['Node1'], row['Relation'], row['Node2']]) "]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": ["vec = nlp.TfidfVectorizer(stop_words='english')"]}, {"cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": ["data['Join'] = data.apply(lambda row: join_columns(row), axis=1) "]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": ["mat = vec.fit_transform(data['Join'].values.tolist())"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 113 ms, sys: 374 ms, total: 487 ms\n", "Wall time: 485 ms\n"]}], "source": ["%time sim = (mat * mat.T).A "]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": ["data['Keep'] = True"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 1min 2s, sys: 103 ms, total: 1min 2s\n", "Wall time: 1min 2s\n"]}], "source": ["%%time\n", "for i, ind_i in enumerate(data.index):\n", " for j, ind_j in enumerate(data.index):\n", " if j > i:\n", " simsc = sim[i, j]\n", " if simsc > 0.5:\n", " data.loc[ind_j, 'Keep'] = False "]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": ["data = data.iloc[:, :3][data['Keep'] == True] "]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [{"data": {"text/plain": ["1887"]}, "execution_count": 36, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Node1RelationNode2
11muskwrote inoct. 4 tweet
13entrepreneurhatesellers
21optionbet onstock decline
24waysbet onfall
25tesla short interestapproach40 million shares
\n", "
"], "text/plain": [" Node1 Relation Node2\n", "11 musk wrote in oct. 4 tweet\n", "13 entrepreneur hate sellers\n", "21 option bet on stock decline\n", "24 ways bet on fall\n", "25 tesla short interest approach 40 million shares"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["data.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Storing Results"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["d.to_hdf(fnr, 'raw', complevel=5, complib='blosc') "]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fnr, 'data', complevel=5, complib='blosc') "]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /data/eikon_eod_tsla_data.csv: -------------------------------------------------------------------------------- 1 | Date,CLOSE 2 | 2017-01-03,216.99 3 | 2017-01-04,226.99 4 | 2017-01-05,226.75 5 | 2017-01-06,229.01 6 | 2017-01-09,231.28 7 | 2017-01-10,229.87 8 | 2017-01-11,229.73 9 | 2017-01-12,229.59 10 | 2017-01-13,237.75 11 | 2017-01-17,235.58 12 | 2017-01-18,238.36 13 | 2017-01-19,243.76 14 | 2017-01-20,244.73 15 | 2017-01-23,248.92 16 | 2017-01-24,254.61 17 | 2017-01-25,254.47 18 | 2017-01-26,252.51 19 | 2017-01-27,252.95 20 | 2017-01-30,250.63 21 | 2017-01-31,251.93 22 | 2017-02-01,249.24 23 | 2017-02-02,251.55 24 | 2017-02-03,251.33 25 | 2017-02-06,257.77 26 | 2017-02-07,257.48 27 | 2017-02-08,262.08 28 | 2017-02-09,269.2 29 | 2017-02-10,269.23 30 | 2017-02-13,280.6 31 | 2017-02-14,280.98 32 | 2017-02-15,279.76 33 | 2017-02-16,268.95 34 | 2017-02-17,272.23 35 | 2017-02-21,277.39 36 | 2017-02-22,273.51 37 | 2017-02-23,255.99 38 | 2017-02-24,257.0 39 | 2017-02-27,246.23 40 | 2017-02-28,249.99 41 | 2017-03-01,250.02 42 | 2017-03-02,250.48 43 | 2017-03-03,251.57 44 | 2017-03-06,251.21 45 | 2017-03-07,248.59 46 | 2017-03-08,246.87 47 | 2017-03-09,244.9 48 | 2017-03-10,243.69 49 | 2017-03-13,246.17 50 | 2017-03-14,258.0 51 | 2017-03-15,255.73 52 | 2017-03-16,262.05 53 | 2017-03-17,261.5 54 | 2017-03-20,261.92 55 | 2017-03-21,250.68 56 | 2017-03-22,255.01 57 | 2017-03-23,254.78 58 | 2017-03-24,263.16 59 | 2017-03-27,270.22 60 | 2017-03-28,277.45 61 | 2017-03-29,277.38 62 | 2017-03-30,277.92 63 | 2017-03-31,278.3 64 | 2017-04-03,298.52 65 | 2017-04-04,303.7 66 | 2017-04-05,295.0 67 | 2017-04-06,298.7 68 | 2017-04-07,302.54 69 | 2017-04-10,312.39 70 | 2017-04-11,308.71 71 | 2017-04-12,296.84 72 | 2017-04-13,304.0 73 | 2017-04-17,301.44 74 | 2017-04-18,300.25 75 | 2017-04-19,305.52 76 | 2017-04-20,302.51 77 | 2017-04-21,305.6 78 | 2017-04-24,308.03 79 | 2017-04-25,313.79 80 | 2017-04-26,310.17 81 | 2017-04-27,308.63 82 | 2017-04-28,314.07 83 | 2017-05-01,322.83 84 | 2017-05-02,318.89 85 | 2017-05-03,311.02 86 | 2017-05-04,295.46 87 | 2017-05-05,308.35 88 | 2017-05-08,307.19 89 | 2017-05-09,321.26 90 | 2017-05-10,325.22 91 | 2017-05-11,323.1 92 | 2017-05-12,324.81 93 | 2017-05-15,315.88 94 | 2017-05-16,317.01 95 | 2017-05-17,306.11 96 | 2017-05-18,313.06 97 | 2017-05-19,310.83 98 | 2017-05-22,310.35 99 | 
2017-05-23,303.86 100 | 2017-05-24,310.22 101 | 2017-05-25,316.83 102 | 2017-05-26,325.14 103 | 2017-05-30,335.1 104 | 2017-05-31,341.01 105 | 2017-06-01,340.37 106 | 2017-06-02,339.85 107 | 2017-06-05,347.32 108 | 2017-06-06,352.85 109 | 2017-06-07,359.65 110 | 2017-06-08,370.0 111 | 2017-06-09,357.32 112 | 2017-06-12,359.01 113 | 2017-06-13,375.95 114 | 2017-06-14,380.66 115 | 2017-06-15,375.34 116 | 2017-06-16,371.4 117 | 2017-06-19,369.8 118 | 2017-06-20,372.24 119 | 2017-06-21,376.4 120 | 2017-06-22,382.61 121 | 2017-06-23,383.45 122 | 2017-06-26,377.49 123 | 2017-06-27,362.37 124 | 2017-06-28,371.24 125 | 2017-06-29,360.75 126 | 2017-06-30,361.61 127 | 2017-07-03,352.62 128 | 2017-07-05,327.09 129 | 2017-07-06,308.83 130 | 2017-07-07,313.22 131 | 2017-07-10,316.05 132 | 2017-07-11,327.22 133 | 2017-07-12,329.52 134 | 2017-07-13,323.41 135 | 2017-07-14,327.78 136 | 2017-07-17,319.57 137 | 2017-07-18,328.24 138 | 2017-07-19,325.26 139 | 2017-07-20,329.92 140 | 2017-07-21,328.4 141 | 2017-07-24,342.52 142 | 2017-07-25,339.6 143 | 2017-07-26,343.85 144 | 2017-07-27,334.46 145 | 2017-07-28,335.07 146 | 2017-07-31,323.47 147 | 2017-08-01,319.57 148 | 2017-08-02,325.89 149 | 2017-08-03,347.09 150 | 2017-08-04,356.91 151 | 2017-08-07,355.17 152 | 2017-08-08,365.22 153 | 2017-08-09,363.53 154 | 2017-08-10,355.4 155 | 2017-08-11,357.87 156 | 2017-08-14,363.8 157 | 2017-08-15,362.33 158 | 2017-08-16,362.91 159 | 2017-08-17,351.92 160 | 2017-08-18,347.46 161 | 2017-08-21,337.86 162 | 2017-08-22,341.35 163 | 2017-08-23,352.77 164 | 2017-08-24,352.93 165 | 2017-08-25,348.05 166 | 2017-08-28,345.66 167 | 2017-08-29,347.36 168 | 2017-08-30,353.18 169 | 2017-08-31,355.9 170 | 2017-09-01,355.4 171 | 2017-09-05,349.59 172 | 2017-09-06,344.53 173 | 2017-09-07,350.61 174 | 2017-09-08,343.4 175 | 2017-09-11,363.69 176 | 2017-09-12,362.75 177 | 2017-09-13,366.23 178 | 2017-09-14,377.64 179 | 2017-09-15,379.81 180 | 2017-09-18,385.0 181 | 2017-09-19,375.1 182 | 2017-09-20,373.91 183 | 2017-09-21,366.48 184 | 2017-09-22,351.09 185 | 2017-09-25,344.99 186 | 2017-09-26,345.25 187 | 2017-09-27,340.97 188 | 2017-09-28,339.6 189 | 2017-09-29,341.1 190 | 2017-10-02,341.53 191 | 2017-10-03,348.14 192 | 2017-10-04,355.01 193 | 2017-10-05,355.33 194 | 2017-10-06,356.88 195 | 2017-10-09,342.94 196 | 2017-10-10,355.59 197 | 2017-10-11,354.6 198 | 2017-10-12,355.68 199 | 2017-10-13,355.57 200 | 2017-10-16,350.6 201 | 2017-10-17,355.75 202 | 2017-10-18,359.65 203 | 2017-10-19,351.81 204 | 2017-10-20,345.1 205 | 2017-10-23,337.02 206 | 2017-10-24,337.34 207 | 2017-10-25,325.84 208 | 2017-10-26,326.17 209 | 2017-10-27,320.87 210 | 2017-10-30,320.08 211 | 2017-10-31,331.53 212 | 2017-11-01,321.08 213 | 2017-11-02,299.26 214 | 2017-11-03,306.09 215 | 2017-11-06,302.78 216 | 2017-11-07,306.05 217 | 2017-11-08,304.39 218 | 2017-11-09,302.99 219 | 2017-11-10,302.99 220 | 2017-11-13,315.4 221 | 2017-11-14,308.7 222 | 2017-11-15,311.3 223 | 2017-11-16,312.5 224 | 2017-11-17,315.05 225 | 2017-11-20,308.74 226 | 2017-11-21,317.81 227 | 2017-11-22,312.6 228 | 2017-11-24,315.55 229 | 2017-11-27,316.81 230 | 2017-11-28,317.55 231 | 2017-11-29,307.54 232 | 2017-11-30,308.85 233 | 2017-12-01,306.53 234 | 2017-12-04,305.2 235 | 2017-12-05,303.7 236 | 2017-12-06,313.26 237 | 2017-12-07,311.24 238 | 2017-12-08,315.13 239 | 2017-12-11,328.91 240 | 2017-12-12,341.03 241 | 2017-12-13,339.03 242 | 2017-12-14,337.89 243 | 2017-12-15,343.45 244 | 2017-12-18,338.87 245 | 2017-12-19,331.1 246 | 2017-12-20,328.98 247 | 2017-12-21,331.66 248 | 
2017-12-22,325.2 249 | 2017-12-26,317.29 250 | 2017-12-27,311.64 251 | 2017-12-28,315.36 252 | 2017-12-29,311.35 253 | 2018-01-02,320.53 254 | 2018-01-03,317.25 255 | 2018-01-04,314.62 256 | 2018-01-05,316.58 257 | 2018-01-08,336.41 258 | 2018-01-09,333.69 259 | 2018-01-10,334.8 260 | 2018-01-11,337.95 261 | 2018-01-12,336.22 262 | 2018-01-16,340.06 263 | 2018-01-17,347.16 264 | 2018-01-18,344.57 265 | 2018-01-19,350.02 266 | 2018-01-22,351.56 267 | 2018-01-23,352.79 268 | 2018-01-24,345.89 269 | 2018-01-25,337.64 270 | 2018-01-26,342.85 271 | 2018-01-29,349.53 272 | 2018-01-30,345.82 273 | 2018-01-31,354.31 274 | 2018-02-01,349.25 275 | 2018-02-02,343.75 276 | 2018-02-05,333.13 277 | 2018-02-06,333.97 278 | 2018-02-07,345.0 279 | 2018-02-08,315.23 280 | 2018-02-09,310.42 281 | 2018-02-12,315.73 282 | 2018-02-13,323.66 283 | 2018-02-14,322.31 284 | 2018-02-15,334.065 285 | 2018-02-16,335.49 286 | 2018-02-20,334.77 287 | 2018-02-21,333.3 288 | 2018-02-22,346.17 289 | 2018-02-23,352.05 290 | 2018-02-26,357.42 291 | 2018-02-27,350.99 292 | 2018-02-28,343.06 293 | 2018-03-01,330.93 294 | 2018-03-02,335.12 295 | 2018-03-05,333.35 296 | 2018-03-06,328.2 297 | 2018-03-07,332.3 298 | 2018-03-08,329.1 299 | 2018-03-09,327.17 300 | 2018-03-12,345.51 301 | 2018-03-13,341.84 302 | 2018-03-14,326.63 303 | 2018-03-15,325.6 304 | 2018-03-16,321.35 305 | 2018-03-19,313.56 306 | 2018-03-20,310.55 307 | 2018-03-21,316.53 308 | 2018-03-22,309.1 309 | 2018-03-23,301.54 310 | 2018-03-26,304.18 311 | 2018-03-27,279.18 312 | 2018-03-28,257.78 313 | 2018-03-29,266.13 314 | 2018-04-02,252.48 315 | 2018-04-03,267.53 316 | 2018-04-04,286.94 317 | 2018-04-05,305.72 318 | 2018-04-06,299.3 319 | 2018-04-09,289.66 320 | 2018-04-10,304.7 321 | 2018-04-11,300.93 322 | 2018-04-12,294.08 323 | 2018-04-13,300.34 324 | 2018-04-16,291.21 325 | 2018-04-17,287.69 326 | 2018-04-18,293.35 327 | 2018-04-19,300.08 328 | 2018-04-20,290.24 329 | 2018-04-23,283.37 330 | 2018-04-24,283.46 331 | 2018-04-25,280.69 332 | 2018-04-26,285.48 333 | 2018-04-27,294.075 334 | 2018-04-30,293.9 335 | 2018-05-01,299.92 336 | 2018-05-02,301.15 337 | 2018-05-03,284.45 338 | 2018-05-04,294.09 339 | 2018-05-07,302.77 340 | 2018-05-08,301.97 341 | 2018-05-09,306.85 342 | 2018-05-10,305.02 343 | 2018-05-11,301.06 344 | 2018-05-14,291.97 345 | 2018-05-15,284.18 346 | 2018-05-16,286.48 347 | 2018-05-17,284.54 348 | 2018-05-18,276.82 349 | 2018-05-21,284.49 350 | 2018-05-22,275.01 351 | 2018-05-23,279.07 352 | 2018-05-24,277.85 353 | 2018-05-25,278.85 354 | 2018-05-29,283.76 355 | 2018-05-30,291.72 356 | 2018-05-31,284.73 357 | 2018-06-01,291.82 358 | 2018-06-04,296.74 359 | 2018-06-05,291.13 360 | 2018-06-06,319.5 361 | 2018-06-07,316.09 362 | 2018-06-08,317.66 363 | 2018-06-11,332.1 364 | 2018-06-12,342.77 365 | 2018-06-13,344.78 366 | 2018-06-14,357.72 367 | 2018-06-15,358.17 368 | 2018-06-18,370.83 369 | 2018-06-19,352.55 370 | 2018-06-20,362.22 371 | 2018-06-21,347.51 372 | 2018-06-22,333.63 373 | 2018-06-25,333.01 374 | 2018-06-26,342.0 375 | 2018-06-27,344.5 376 | 2018-06-28,349.93 377 | 2018-06-29,342.95 378 | 2018-07-02,335.07 379 | 2018-07-03,310.86 380 | 2018-07-05,309.16 381 | 2018-07-06,308.9 382 | 2018-07-09,318.51 383 | 2018-07-10,322.47 384 | 2018-07-11,318.96 385 | 2018-07-12,316.71 386 | 2018-07-13,318.87 387 | 2018-07-16,310.1 388 | 2018-07-17,322.69 389 | 2018-07-18,323.85 390 | 2018-07-19,320.23 391 | 2018-07-20,313.58 392 | 2018-07-23,303.2 393 | 2018-07-24,297.43 394 | 2018-07-25,308.74 395 | 2018-07-26,306.65 396 | 2018-07-27,297.18 
397 | 2018-07-30,290.17 398 | 2018-07-31,298.14 399 | 2018-08-01,300.84 400 | 2018-08-02,349.54 401 | 2018-08-03,348.17 402 | 2018-08-06,341.99 403 | 2018-08-07,379.57 404 | 2018-08-08,370.34 405 | 2018-08-09,352.45 406 | 2018-08-10,355.49 407 | 2018-08-13,356.41 408 | 2018-08-14,347.64 409 | 2018-08-15,338.69 410 | 2018-08-16,335.45 411 | 2018-08-17,305.5 412 | 2018-08-20,308.44 413 | 2018-08-21,321.9 414 | 2018-08-22,321.64 415 | 2018-08-23,320.1 416 | 2018-08-24,322.82 417 | 2018-08-27,319.27 418 | 2018-08-28,311.86 419 | 2018-08-29,305.01 420 | 2018-08-30,303.15 421 | 2018-08-31,301.66 422 | 2018-09-04,288.95 423 | 2018-09-05,280.74 424 | 2018-09-06,280.95 425 | 2018-09-07,263.24 426 | 2018-09-10,285.5 427 | 2018-09-11,279.44 428 | 2018-09-12,290.54 429 | 2018-09-13,289.46 430 | 2018-09-14,295.2 431 | 2018-09-17,294.84 432 | 2018-09-18,284.96 433 | 2018-09-19,299.02 434 | 2018-09-20,298.33 435 | 2018-09-21,299.1 436 | 2018-09-24,299.68 437 | 2018-09-25,300.99 438 | 2018-09-26,309.58 439 | 2018-09-27,307.52 440 | 2018-09-28,264.77 441 | 2018-10-01,310.7 442 | 2018-10-02,301.02 443 | 2018-10-03,294.8 444 | 2018-10-04,281.83 445 | 2018-10-05,261.95 446 | 2018-10-08,250.56 447 | 2018-10-09,262.8 448 | 2018-10-10,256.88 449 | 2018-10-11,252.23 450 | 2018-10-12,258.78 451 | 2018-10-15,259.59 452 | 2018-10-16,276.59 453 | 2018-10-17,271.78 454 | 2018-10-18,263.91 455 | 2018-10-19,260.0 456 | 2018-10-22,260.95 457 | 2018-10-23,294.14 458 | 2018-10-24,288.5 459 | 2018-10-25,314.86 460 | 2018-10-26,330.9 461 | 2018-10-29,334.85 462 | 2018-10-30,329.9 463 | 2018-10-31,337.32 464 | -------------------------------------------------------------------------------- /code/03_musk/03_musk_01_data.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Data Retrieval**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules/')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import json\n", "import nltk\n", "import pickle\n", "import tpqdna\n", "import warnings\n", "warnings.simplefilter('ignore')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Creation"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Authentication"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["# expects the DNA API key to be stored in plain text as a Python pickle file\n", "api_key = pickle.load(open('../dna_api_key.pkl', 'rb')) \n", "headers = {\n", " 'user-key': api_key,\n", " 'content-type': 'application/json',\n", " 'cache-control': 'no-cache'\n", "} "]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Specification"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["where = '(body like \"%Musk.%\" OR body like \"%Musk,%\" '\n", "where += 'OR body like \"%Musk %\" '\n", "where += 'OR body like \"%Tesla.%\" OR body like \"%Tesla,%\" '\n", "where += 'OR body like \"%Tesla %\") '\n", "where += 'AND (body like \"%tweet.%\" OR body like \"%tweet,%\" 
'\n", "where += 'OR body like \"%tweet %\" '\n", "where += 'OR body like \"%Twitter.%\" OR body like \"%Twitter,%\" '\n", "where += 'OR body like \"%Twitter %\" ) ' \n", "where += 'AND language_code=\"en\" '\n", "where += 'AND publication_date >= \"2018-07-23 00:00:00\" '\n", "where += 'AND publication_date <= \"2018-10-29 23:59:59\" ' "]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["includes = {} \n", "excludes = {}\n", "limit = 100"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["query = {'query': \n", " {'where': where,\n", " 'includes': includes,\n", " 'exludes': excludes,\n", " 'limit': limit\n", " }}"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["query = json.dumps(query)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["# %time qurl = tpqdna.create_snapshot(query, headers) "]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# %time fl = tpqdna.run_snapshot(qurl, headers) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Paths"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["project = 'musk_{}'.format(limit) "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["base_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(base_path, 'data_musk') \n", "if not os.path.isdir(data_path):\n", " os.mkdir(data_path)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["meta_path = os.path.join(data_path, 'meta') \n", "if not os.path.isdir(meta_path):\n", " os.mkdir(meta_path)\n", "fn = os.path.join(meta_path, 'file_list_{}.pkl'.format(project))"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["# with open(fn, 'wb') as f:\n", "# pickle.dump(fl, f) "]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["with open(fn, 'rb') as f:\n", " fl = pickle.load(f)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Retrieval"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot') \n", "if not os.path.isdir(snapshot_path):\n", " os.mkdir(snapshot_path)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["# %time tpqdna.download_snapshots(fl, snapshot_path, headers) "]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 143 ms, sys: 7.56 ms, total: 151 ms\n", "Wall time: 155 ms\n"]}], "source": ["%time data = tpqdna.avro2dataframe(snapshot_path) "]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "RangeIndex: 100 entries, 0 to 99\n", "Data columns (total 35 columns):\n", "action 100 non-null object\n", "an 100 non-null object\n", "art 100 non-null object\n", "body 100 non-null object\n", "byline 100 non-null object\n", "company_codes 100 non-null object\n", "company_codes_about 100 non-null object\n", "company_codes_association 100 non-null object\n", "company_codes_lineage 100 non-null object\n", "company_codes_occur 100 non-null object\n", "company_codes_relevance 100 non-null 
object\n", "copyright 100 non-null object\n", "credit 100 non-null object\n", "currency_codes 100 non-null object\n", "dateline 5 non-null object\n", "document_type 100 non-null object\n", "industry_codes 100 non-null object\n", "ingestion_datetime 100 non-null int64\n", "language_code 100 non-null object\n", "market_index_codes 100 non-null object\n", "modification_date 0 non-null object\n", "modification_datetime 100 non-null int64\n", "person_codes 100 non-null object\n", "publication_date 100 non-null int64\n", "publication_datetime 100 non-null int64\n", "publisher_name 100 non-null object\n", "region_codes 100 non-null object\n", "region_of_origin 100 non-null object\n", "section 100 non-null object\n", "snippet 100 non-null object\n", "source_code 100 non-null object\n", "source_name 100 non-null object\n", "subject_codes 100 non-null object\n", "title 100 non-null object\n", "word_count 100 non-null int64\n", "dtypes: int64(5), object(30)\n", "memory usage: 27.4+ KB\n"]}], "source": ["data.info() "]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
source_nametitleword_count
0Barron'sWhy Musk Is Wrong About Shorts1031
1San Francisco Chronicle: Web EditionTesla owners fume about delays for service1016
2ArabianBusiness.comSaudi fund PIF said to mull investment in Tesl...240
3DigitSpaceX to announce the name of its first touri...417
4U-WireWe\u2019ve All Been Pronouncing Chrissy Teigen\u2019s La...292
5Dow Jones Institutional NewsPublic Bravado, Private Doubts: How Elon Musk'...1786
6Benzinga.comTesla Zaps Go-Private Plans; Wall Street Reacts581
7The Canadian PressTesla stock drops closer to pre-Musk tweet level308
\n", "
"], "text/plain": [" source_name \\\n", "0 Barron's \n", "1 San Francisco Chronicle: Web Edition \n", "2 ArabianBusiness.com \n", "3 Digit \n", "4 U-Wire \n", "5 Dow Jones Institutional News \n", "6 Benzinga.com \n", "7 The Canadian Press \n", "\n", " title word_count \n", "0 Why Musk Is Wrong About Shorts 1031 \n", "1 Tesla owners fume about delays for service 1016 \n", "2 Saudi fund PIF said to mull investment in Tesl... 240 \n", "3 SpaceX to announce the name of its first touri... 417 \n", "4 We\u2019ve All Been Pronouncing Chrissy Teigen\u2019s La... 292 \n", "5 Public Bravado, Private Doubts: How Elon Musk'... 1786 \n", "6 Tesla Zaps Go-Private Plans; Wall Street Reacts 581 \n", "7 Tesla stock drops closer to pre-Musk tweet level 308 "]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["data[['source_name', 'title', 'word_count']].head(8)"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project)) "]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fn, 'data', complevel=5, complib='blosc') "]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_04_ng.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dow Jones DNA NLP Case Study\n", 15 | "\n", 16 | "_Based on news articles related to Hurricane Harvey._\n", 17 | "\n", 18 | "**Network Graph Analysis**\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## The Imports" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import sys\n", 40 | "sys.path.append('../../modules')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "import ng_functions as ng\n", 51 | "import nlp_functions as nlp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "project = 'harvey_250'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "abs_path = os.path.abspath('../../')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "data_path = os.path.join(abs_path, 'data_harvey')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "results_path = os.path.join(data_path, 
'results')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Reading the Data" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "relations_harvey_250.h5\r\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "!ls ../../data_harvey/results" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "data = pd.read_hdf(fn, 'data')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "
\n", 141 | "\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
Node1RelationNode2
0hurricane irmahas strengthened tocategory
1peopleis inleeward islands of caribbean
2homeis inaffected areas
6significant eventdirector ofcaribbean disaster emergency management agency
14forecasterexpectstorm
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " Node1 Relation \\\n", 200 | "0 hurricane irma has strengthened to \n", 201 | "1 people is in \n", 202 | "2 home is in \n", 203 | "6 significant event director of \n", 204 | "14 forecaster expect \n", 205 | "\n", 206 | " Node2 \n", 207 | "0 category \n", 208 | "1 leeward islands of caribbean \n", 209 | "2 affected areas \n", 210 | "6 caribbean disaster emergency management agency \n", 211 | "14 storm " 212 | ] 213 | }, 214 | "execution_count": 10, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Network Graph" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Full Graph" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 11, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "g = ng.create_graph(data.iloc[:1000])" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 12, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "G = ng.plot_graph(g, central_gravity=0.01,\n", 253 | " with_edge_label=True,\n", 254 | " height='600px', width='80%',\n", 255 | " filter_=['physics'])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 13, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# G.show('ng_harvey_01.html')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### Focused Graph" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 14, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "entities = ['hurricane', 'houston', 'government','trump']" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 15, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "sel_1 = data[data['Node1'].apply(lambda s: s in entities)].copy()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 16, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "sel_2 = data[data['Node2'].apply(lambda s: s in entities)].copy()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 17, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "sel = pd.concat((sel_1, sel_2), ignore_index=True)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 18, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 19, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 20, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "g = ng.create_graph(sel)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 21, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "G = ng.plot_graph(g, central_gravity=0.01,\n", 344 | " with_edge_label=True,\n", 345 | " height='600px', width='80%',\n", 346 | " filter_=['physics'])" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/html": [ 357 | "\n", 
358 | " \n", 365 | " " 366 | ], 367 | "text/plain": [ 368 | "" 369 | ] 370 | }, 371 | "execution_count": 22, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "G.show('ng_harvey_02.html')" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "" 385 | ] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.6.7" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_03_oie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dow Jones DNA NLP Case Study\n", 15 | "\n", 16 | "_Based on news articles related to Hurricane Harvey._\n", 17 | "\n", 18 | "**Information Extraction**\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## The Imports" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import sys\n", 40 | "sys.path.append('../../modules')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import nltk\n", 50 | "import pandas as pd\n", 51 | "import soiepy.main as ie\n", 52 | "import nlp_functions as nlp" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Snapshot Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "project = 'harvey_250'" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "abs_path = os.path.abspath('../../')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "data_path = os.path.join(abs_path, 'data_harvey')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "snapshot_path = os.path.join(data_path, 'snapshot')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 8, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "raw = pd.read_hdf(fn, 'data')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Preprocessing" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | 
"metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "CPU times: user 954 ms, sys: 31 ms, total: 985 ms\n", 133 | "Wall time: 984 ms\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "%time raw['body'] = raw['body'].apply(nlp.clean_up_text)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 10, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "data = raw['body'].values.tolist()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 11, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "CPU times: user 576 ms, sys: 11.7 ms, total: 587 ms\n", 160 | "Wall time: 588 ms\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "%%time\n", 166 | "s = [nltk.sent_tokenize(a) for a in data]\n", 167 | "s = [_ for sl in s for _ in sl]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 12, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "['hurricane irma has strengthened to a category 5 storm - the most severe designation for a hurricane, the national hurricane center said tuesday.',\n", 179 | " 'people in the leeward islands of the caribbean are preparing for the irma arrival late tuesday or early wednesday, as areas to the northwest, from puerto rico to cuba to the coastal united states, wait to see the track the storm will take as the week progresses.']" 180 | ] 181 | }, 182 | "execution_count": 12, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "s[:2]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 13, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "token_path = os.path.join(data_path, 'tokens')\n", 198 | "if not os.path.isdir(token_path):\n", 199 | " os.mkdir(token_path)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 14, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "fn = os.path.join(token_path, 'tokens_{}_{}.txt')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 15, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "steps = 250\n", 218 | "for c, i in enumerate(range(0, len(s), steps)):\n", 219 | " with open(fn.format(project, c), 'w') as f:\n", 220 | " f.writelines([_ + '\\n' for _ in s[i:i + steps - 1]])" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Relations" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 16, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "results_path = os.path.join(data_path, 'results')\n", 237 | "if not os.path.isdir(results_path):\n", 238 | " os.mkdir(results_path)\n", 239 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "fl = sorted(os.listdir(token_path))\n", 249 | "d = pd.DataFrame()\n", 250 | "fno = len(fl)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 18, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "CPU times: user 3.03 s, sys: 943 ms, total: 3.97 s\n", 263 | "Wall time: 28min 25s\n" 264 | ] 265 | } 266 | ], 
267 | "source": [ 268 | "%%time\n", 269 | "try:\n", 270 | " d = pd.read_hdf(fn, 'raw')\n", 271 | "except:\n", 272 | " for i, fn in enumerate(fl):\n", 273 | " filename = os.path.join(token_path, fn)\n", 274 | " msg = 'Processing file {} of {} \\r'\n", 275 | " print(msg.format(i + 1, fno), end='')\n", 276 | " r = ie.stanford_ie(filename, verbose=False)\n", 277 | " dt = pd.DataFrame(r)\n", 278 | " if len(d) == 0:\n", 279 | " d = dt\n", 280 | " else:\n", 281 | " d = pd.concat((d, dt), ignore_index=True)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 19, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "d = d.iloc[:, :3]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 20, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "d.columns = ['Node1', 'Relation', 'Node2']" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 21, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "50660" 311 | ] 312 | }, 313 | "execution_count": 21, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "len(d)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Post Processing" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 22, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "data = d.copy()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Basic Processing" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 23, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "data = data.applymap(lambda s: s.strip())" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 24, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "data = data[data.applymap(lambda s: not s in nlp.stop_words)].dropna()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 25, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "data = data[data.applymap(lambda s: not s.startswith('http'))].dropna()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 26, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "data = data.applymap(lambda s: nlp.nltk_lemma(s))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 27, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/plain": [ 389 | "31921" 390 | ] 391 | }, 392 | "execution_count": 27, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "len(data)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "### Removing Duplicates" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 28, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "def join_columns(row):\n", 415 | " return ' '.join([row['Node1'], row['Relation'], row['Node2']])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 29, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "vec = nlp.TfidfVectorizer(stop_words='english')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 30, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "data['Join'] = data.apply(lambda 
row: join_columns(row), axis=1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 31, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "mat = vec.fit_transform(data['Join'].values.tolist())" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 32, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "CPU times: user 2.17 s, sys: 6.89 s, total: 9.06 s\n", 455 | "Wall time: 8.74 s\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "%time sim = (mat * mat.T).A" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 33, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "data['Keep'] = True" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 34, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "CPU times: user 19min 48s, sys: 6.08 s, total: 19min 54s\n", 482 | "Wall time: 19min 54s\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "%%time\n", 488 | "for i, ind_i in enumerate(data.index):\n", 489 | " for j, ind_j in enumerate(data.index):\n", 490 | " if j > i:\n", 491 | " simsc = sim[i, j]\n", 492 | " if simsc > 0.5:\n", 493 | " data.loc[ind_j, 'Keep'] = False" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 35, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "data = data.iloc[:, :3][data['Keep'] == True]" 503 | ] 504 | }, 505 | { 506 | "cell_type": "raw", 507 | "metadata": {}, 508 | "source": [ 509 | "# tag::HARVEY_01[]" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 36, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "6520" 521 | ] 522 | }, 523 | "execution_count": 36, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "len(data)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 37, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "data": { 539 | "text/html": [ 540 | "
\n", 541 | "\n", 554 | "\n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | "
Node1RelationNode2
50569flood watersare contaminated withbacteria
50571jack gillisspokesman forconsumer federation of america of car book
50574your mechanicexaminevehicle
50580checkis inwheel wells
50583used car dealeris licensed bystate
\n", 596 | "
" 597 | ], 598 | "text/plain": [ 599 | " Node1 Relation \\\n", 600 | "50569 flood waters are contaminated with \n", 601 | "50571 jack gillis spokesman for \n", 602 | "50574 your mechanic examine \n", 603 | "50580 check is in \n", 604 | "50583 used car dealer is licensed by \n", 605 | "\n", 606 | " Node2 \n", 607 | "50569 bacteria \n", 608 | "50571 consumer federation of america of car book \n", 609 | "50574 vehicle \n", 610 | "50580 wheel wells \n", 611 | "50583 state " 612 | ] 613 | }, 614 | "execution_count": 37, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "data.tail()" 621 | ] 622 | }, 623 | { 624 | "cell_type": "raw", 625 | "metadata": {}, 626 | "source": [ 627 | "# end::HARVEY_01[]" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "## Storing Results" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 38, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 39, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "d.to_hdf(fn, 'raw', complevel=5, complib='blosc')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 40, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "data.to_hdf(fn, 'data', complevel=5, complib='blosc')" 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "" 669 | ] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "Python 3", 675 | "language": "python", 676 | "name": "python3" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.6.7" 689 | } 690 | }, 691 | "nbformat": 4, 692 | "nbformat_minor": 2 693 | } 694 | -------------------------------------------------------------------------------- /code/02_nlp/02_nlp_openie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Natural Language Processing\n", 15 | "\n", 16 | "**Open Information Extraction**\n", 17 | "\n", 18 | "_Illustrated based on a simple example and the texts from three Apple press releases._\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Simple Example" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import nltk\n", 40 | "import requests\n", 41 | "import pandas as pd" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import sys\n", 51 | "sys.path.append('../../modules/')\n", 52 | "import soiepy.main as ie \n", 53 | "import ng_functions as ng \n", 54 | "import nlp_functions as nlp" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | 
"metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "t = '''\n", 64 | "Peter studies data science.\n", 65 | "Peter knows Java.\n", 66 | "Peter prefers Python.\n", 67 | "Peter works as a data scientist.\n", 68 | "Peter applies machine learning.\n", 69 | "A data scientist uses Python.\n", 70 | "Python revolutionized data science.\n", 71 | "Python is preferred for NLP.\n", 72 | "Python is used for machine learning.\n", 73 | "'''" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "s = nltk.sent_tokenize(t) " 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "['\\nPeter studies data science.', 'Peter knows Java.', 'Peter prefers Python.']" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "s[:3] " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "s = [nlp.clean_up_text(_) for _ in s] \n", 112 | "s = [' '.join(nlp.tokenize(_)) + '.' for _ in s] " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "['peter study data science.', 'peter know java.', 'peter prefer python.']" 124 | ] 125 | }, 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "s[:3] " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "abs_path = os.path.abspath('../../')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "data_path = os.path.join(abs_path, 'data')\n", 151 | "tokens_path = os.path.join(data_path, 'tokens')\n", 152 | "if not os.path.isdir(tokens_path):\n", 153 | " os.mkdir(tokens_path)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "fn = os.path.join(tokens_path, 'tokens_example.txt') " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 11, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with open(fn, 'w') as f:\n", 172 | " f.writelines([_ + '\\n' for _ in s]) " 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Executing command = cd /root/notebook/dnanlp/modules/soiepy/;cd stanford-openie; java -mx4g -cp \"stanford-openie.jar:stanford-openie-models.jar:lib/*\" edu.stanford.nlp.naturalli.OpenIE /root/notebook/dnanlp/data/tokens/tokens_example.txt -format ollie > /tmp/openie/out.txt\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "r = ie.stanford_ie(fn, verbose=True) " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 13, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "[['peter', ' know', ' java'],\n", 201 | " ['peter', ' prefer', ' python'],\n", 202 | " ['peter', ' works', ' data scientist']]" 203 | ] 204 | }, 205 | "execution_count": 13, 206 | "metadata": {}, 207 | 
"output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "r[:3] " 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 14, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2']) " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "d = d.applymap(lambda _: _.strip()) " 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 16, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/html": [ 240 | "
\n", 241 | "\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | "
Node1RelationNode2
0peterknowjava
1peterpreferpython
2peterworksdata scientist
\n", 284 | "
" 285 | ], 286 | "text/plain": [ 287 | " Node1 Relation Node2\n", 288 | "0 peter know java\n", 289 | "1 peter prefer python\n", 290 | "2 peter works data scientist" 291 | ] 292 | }, 293 | "execution_count": 16, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "d.iloc[:3]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 17, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "g = ng.create_graph(d) " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 18, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "G = ng.plot_graph(g, central_gravity=0.01) " 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 19, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/html": [ 328 | "\n", 329 | " \n", 336 | " " 337 | ], 338 | "text/plain": [ 339 | "" 340 | ] 341 | }, 342 | "execution_count": 19, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "G.show('ng_example.html') " 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "## Apple Press Releases" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 20, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "import requests" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 21, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "sources = [\n", 374 | " 'https://nr.apple.com/dE0b1T5G3u', # iPad Pro\n", 375 | " 'https://nr.apple.com/dE4c7T6g1K', # MacBook Air\n", 376 | " 'https://nr.apple.com/dE4q4r8A2A', # Mac Mini\n", 377 | "]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 22, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "html = [requests.get(url).text for url in sources]" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 23, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "sents = [nltk.sent_tokenize(h) for h in html]" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 24, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "s = []\n", 405 | "for sent in sents:\n", 406 | " s.extend(sent)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 25, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "200" 418 | ] 419 | }, 420 | "execution_count": 25, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "len(s)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 26, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "s = [nlp.clean_up_text(se) for se in s]" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 27, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "s = [' '.join(nlp.tokenize(se)) + '.' 
for se in s]" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 28, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "s = [se for se in s if len(se) > 5]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 29, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "fn = os.path.join(tokens_path, 'tokens_apple.txt')\n", 463 | "with open(fn, 'w') as f:\n", 464 | " f.writelines([_ + '\\n' for _ in s])" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 30, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "CPU times: user 5.85 ms, sys: 16.3 ms, total: 22.1 ms\n", 477 | "Wall time: 21.2 s\n" 478 | ] 479 | } 480 | ], 481 | "source": [ 482 | "%time r = ie.stanford_ie(fn, verbose=False)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 31, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/plain": [ 493 | "[['apple mac',\n", 494 | " ' ipad',\n", 495 | " ' iphone watch music support shopping bag newsroom archive press release'],\n", 496 | " ['apple mac',\n", 497 | " ' ipad',\n", 498 | " ' watch music support shopping bag newsroom archive press release'],\n", 499 | " ['today', ' introduce', ' design performance']]" 500 | ] 501 | }, 502 | "execution_count": 31, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "r[:3]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 32, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 33, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "d = d.applymap(lambda x: x.strip())" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 34, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/html": [ 537 | "
\n", 538 | "\n", 551 | "\n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | "
Node1RelationNode2
0apple macipadiphone watch music support shopping bag newsro...
1apple macipadwatch music support shopping bag newsroom arch...
2todayintroducedesign performance
3todayintroduceipad design performance
4todayintroducedesign next-generation performance
5todayintroduceall-screen design performance
6todayintroduceipad design next-generation performance
7todayintroduceipad all-screen design next-generation perform...
8todayintroduceipad all-screen design performance
9todayintroduceall-screen design next-generation performance
\n", 623 | "
" 624 | ], 625 | "text/plain": [ 626 | " Node1 Relation Node2\n", 627 | "0 apple mac ipad iphone watch music support shopping bag newsro...\n", 628 | "1 apple mac ipad watch music support shopping bag newsroom arch...\n", 629 | "2 today introduce design performance\n", 630 | "3 today introduce ipad design performance\n", 631 | "4 today introduce design next-generation performance\n", 632 | "5 today introduce all-screen design performance\n", 633 | "6 today introduce ipad design next-generation performance\n", 634 | "7 today introduce ipad all-screen design next-generation perform...\n", 635 | "8 today introduce ipad all-screen design performance\n", 636 | "9 today introduce all-screen design next-generation performance" 637 | ] 638 | }, 639 | "execution_count": 34, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "d.iloc[:10]" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 35, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "d = d[d.applymap(lambda x: len(x) < 25)].dropna()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 36, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "data": { 664 | "text/html": [ 665 | "
\n", 666 | "\n", 679 | "\n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
Node1RelationNode2
2todayintroducedesign performance
3todayintroduceipad design performance
19workflows .2 apps designtakeadvantage display
45workflows .2 apps designtakeadvantage large display
47photoshop ipadcoming2019 push user computer
\n", 721 | "
" 722 | ], 723 | "text/plain": [ 724 | " Node1 Relation Node2\n", 725 | "2 today introduce design performance\n", 726 | "3 today introduce ipad design performance\n", 727 | "19 workflows .2 apps design take advantage display\n", 728 | "45 workflows .2 apps design take advantage large display\n", 729 | "47 photoshop ipad coming 2019 push user computer" 730 | ] 731 | }, 732 | "execution_count": 36, 733 | "metadata": {}, 734 | "output_type": "execute_result" 735 | } 736 | ], 737 | "source": [ 738 | "d.iloc[:5]" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 37, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "g = ng.create_graph(d)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 38, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "G = ng.plot_graph(g, with_edge_label=False,\n", 757 | " font_color='grey', central_gravity=0.01)" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 39, 763 | "metadata": { 764 | "scrolled": false 765 | }, 766 | "outputs": [ 767 | { 768 | "data": { 769 | "text/html": [ 770 | "\n", 771 | " \n", 778 | " " 779 | ], 780 | "text/plain": [ 781 | "" 782 | ] 783 | }, 784 | "execution_count": 39, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "G.show('ng_apple.html')" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": {}, 796 | "source": [ 797 | "" 798 | ] 799 | } 800 | ], 801 | "metadata": { 802 | "kernelspec": { 803 | "display_name": "Python 3", 804 | "language": "python", 805 | "name": "python3" 806 | }, 807 | "language_info": { 808 | "codemirror_mode": { 809 | "name": "ipython", 810 | "version": 3 811 | }, 812 | "file_extension": ".py", 813 | "mimetype": "text/x-python", 814 | "name": "python", 815 | "nbconvert_exporter": "python", 816 | "pygments_lexer": "ipython3", 817 | "version": "3.6.7" 818 | } 819 | }, 820 | "nbformat": 4, 821 | "nbformat_minor": 2 822 | } 823 | --------------------------------------------------------------------------------