├── data_musk
│   └── snapshot
│       └── snapshot_musk_100.h5
├── data_harvey
│   └── snapshot
│       └── snapshot_harvey_250.h5
├── .gitignore
├── modules
│   ├── ng_functions.py
│   ├── nlp_functions.py
│   └── tpqdna.py
├── README.md
├── setup_dna_nlp.sh
├── code
│   ├── 04_harvey
│   │   ├── 04_harvey_01_data.ipynb
│   │   ├── 04_harvey_04_ng.ipynb
│   │   └── 04_harvey_03_oie.ipynb
│   ├── 03_musk
│   │   ├── 03_musk_04_ng.ipynb
│   │   ├── 03_musk_03_oie.ipynb
│   │   └── 03_musk_01_data.ipynb
│   └── 02_nlp
│       └── 02_nlp_openie.ipynb
└── data
    └── eikon_eod_tsla_data.csv

/data_musk/snapshot/snapshot_musk_100.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/dnanlp/master/data_musk/snapshot/snapshot_musk_100.h5 -------------------------------------------------------------------------------- /data_harvey/snapshot/snapshot_harvey_250.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilpisch/dnanlp/master/data_harvey/snapshot/snapshot_harvey_250.h5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Specifics 2 | *.swp 3 | _build/ 4 | *.pkl 5 | *.txt 6 | dna_api_key.h5 7 | data_harvey/results/ 8 | data_harvey/tokens/ 9 | data_musk/results/ 10 | data_musk/tokens/ 11 | *.avro 12 | help/ 13 | *.ipynb~ 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | -------------------------------------------------------------------------------- /modules/ng_functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Network Graph Helper Functions 3 | # 4 | # The Python Quants GmbH 5 | # 6 | import pandas as pd 7 | import networkx as nx 8 | from pyvis.network import Network 9 | 10 | def create_graph(data, labels=False): 11 | '''Create a NetworkX graph object from a pandas DataFrame. 12 | ''' 13 | G = nx.DiGraph() 14 | 15 | vals = data[['Node1', 'Relation', 'Node2']].values 16 | G.add_edges_from([(v[0], v[2], {'relation': v[1]}) for v in vals]) 17 | 18 | if labels: 19 | vals = data[['Label1', 'Node1', 'Label2', 'Node2']].values 20 | for v in vals: 21 | G.node[v[1]]['type'] = v[0] 22 | G.node[v[3]]['type'] = v[2] 23 | return G 24 | 25 | def plot_graph(graph, background_color='white', 26 | font_color='grey', with_edge_label=True, 27 | central_gravity=2.0, solver='', 28 | height='750px', width='100%', filter_=['']): 29 | ''' Creates a pyvis interactive Network Graph from a 30 | NetworkX graph object. 
31 | ''' 32 | G = Network(notebook=True, height=height, width=width, 33 | bgcolor=background_color, font_color=font_color) 34 | 35 | color = {0:'#fb217f', 1:'#fb217f', 2:'#88b1fb', 3:'#88b1fb', 4:'#88b1fb'} 36 | deg = dict(graph.in_degree()) 37 | 38 | for node in graph: 39 | md = max(deg.values()) 40 | color_id = min(deg[node], 4) 41 | G.add_node(node, title=node, label=node, 42 | size=(md - deg[node] + 1) * 4, 43 | color=color[color_id]) 44 | 45 | for edge in graph.edges(): 46 | if with_edge_label: 47 | label = graph.get_edge_data(edge[0], edge[1])['relation'] 48 | else: 49 | label='' 50 | G.add_edge(edge[0], edge[1], label=label) 51 | if solver == 'barnes_hut': 52 | G.barnes_hut(central_gravity=central_gravity) 53 | else: 54 | G.force_atlas_2based(central_gravity=central_gravity) 55 | G.show_buttons(filter_=filter_) 56 | return G 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unlocking the Hidden Potential of Unstructured News Data with NLP 2 | 3 | This repository provides Python codes and Jupyter Notebooks for the Dow Jones applied research paper "Unlocking the Hidden Potential of Unstructured News Data with NLP — Understanding Advanced Analytics through Real-World Case Studies". 4 | 5 | 6 | 7 | ## Applied Research Paper Download 8 | 9 | To **download** the PDF version of the paper, visit http://go.dowjones.com/dna-research-paper. 10 | 11 | ## Setup and Installation 12 | 13 | The instructions that follow assume that you run a **Docker container** or a **cloud instance** with the latest version of Ubuntu (18.10 at the time of this writing). 14 | 15 | The execution of (parts of) the codes and Jupyter Notebooks requires enough **compute and memory resources**. Overall, it is recommended to have at least **four CPU cores and 16GB of RAM** available. The introductory examples can be executed with fewer resources. 16 | 17 | ### Cloud Instance 18 | 19 | The following assumes that you have set up a **cloud instance** (e.g. on DigitalOcean) and have used `ssh` to log in as `root`. You can then execute on the shell: 20 | 21 | cd /root 22 | wget http://hilpisch.com/nlp/setup_dna_nlp.sh 23 | bash setup_dna_nlp.sh 24 | 25 | Follow the **instructions** of the script and e.g. provide a password for the Jupyter Notebook server. 26 | 27 | After the installation, you can access the **Jupyter Notebook server** under 28 | 29 | http://CLOUD_IP_ADDRESS:9999 30 | 31 | with your chosen password. Navigate via Jupyter to the code folder and open a notebook to get started. 32 | 33 | ### Docker Container 34 | 35 | Alternatively, you can start a **Docker container** locally (with enough resources allocated). To do so e.g. execute on the shell: 36 | 37 | docker run -ti -h dnanlp -p 9999:9999 ubuntu:latest /bin/bash 38 | 39 | Make sure that the container has **enough resources** allocated (e.g. via editing your Docker preferences). Then on the shell of the Docker container execute the following: 40 | 41 | cd root 42 | apt-get update 43 | apt-get upgrade -y 44 | apt-get install -y wget 45 | wget http://hilpisch.com/nlp/setup_dna_nlp.sh 46 | bash setup_dna_nlp.sh 47 | 48 | Then follow the **instructions** of the script to e.g. provide a password for the Jupyter Notebook server. 49 | 50 | After the installation, you can access the **Jupyter Notebook server** under 51 | 52 | http://localhost:9999 53 | 54 | with your chosen password.
Navigate via Jupyter to the code folder and open a notebook to get started. 55 | 56 | ## Security Risks and Disclaimer 57 | 58 | The approach chosen to run the Jupyter Notebook server is for **illustration purposes** only. There are no security measures configured beyond password protection. For example, there is no SSL encryption configured. In addition, the Jupyter Notebook server is run as `root`. As a consequence, a number of **security risks** result from the approach chosen. 59 | 60 | All codes and Jupyter notebooks come with **no representations or warranties**, to the extent permitted by applicable law. 61 | 62 | This repository with all its scripts, codes and Jupyter notebooks is for **illustration purposes** only. 63 | 64 | ## Company Information 65 | 66 | © Dr. Yves J. Hilpisch \| The Python Quants GmbH 67 | 68 | http://tpq.io \| team@tpq.io \| 69 | http://twitter.com/dyjh \| http://pqp.io 70 | 71 | **Python for Finance & Algorithmic Trading online trainings** \| http://training.tpq.io 72 | 73 | **University Certificate Program in Python for Algorithmic Trading** \| http://certificate.tpq.io 74 | 75 | 76 | -------------------------------------------------------------------------------- /modules/nlp_functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # NLP Helper Functions 3 | # 4 | # The Python Quants GmbH 5 | # 6 | import re 7 | import nltk 8 | import string 9 | import pandas as pd 10 | from pylab import plt 11 | from wordcloud import WordCloud 12 | from nltk.corpus import stopwords 13 | from nltk.corpus import wordnet as wn 14 | from lxml.html.clean import Cleaner 15 | from sklearn.feature_extraction.text import TfidfVectorizer 16 | plt.style.use('seaborn') 17 | 18 | cleaner = Cleaner(style=True, links=True, allow_tags=[''], 19 | remove_unknown_tags=False) 20 | 21 | stop_words = stopwords.words('english') 22 | stop_words.extend(['new', 'old', 'pro', 'open', 'menu', 'close']) 23 | 24 | 25 | def remove_non_ascii(s): 26 | ''' Removes all non-ascii characters. 27 | ''' 28 | return ''.join(i for i in s if ord(i) < 128) 29 | 30 | def clean_up_html(t): 31 | t = cleaner.clean_html(t) 32 | t = re.sub('[\n\t\r]', ' ', t) 33 | t = re.sub(' +', ' ', t) 34 | t = re.sub('<.*?>', '', t) 35 | t = remove_non_ascii(t) 36 | return t 37 | 38 | def clean_up_text(t, numbers=False, punctuation=False): 39 | ''' Cleans up a text, e.g. HTML document, 40 | from HTML tags and also cleans up the 41 | text body. 42 | ''' 43 | try: 44 | t = clean_up_html(t) 45 | except: 46 | pass 47 | t = t.lower() 48 | t = re.sub(r"what's", "what is ", t) 49 | t = t.replace('(ap)', '') 50 | t = re.sub(r"\'ve", " have ", t) 51 | t = re.sub(r"can't", "cannot ", t) 52 | t = re.sub(r"n't", " not ", t) 53 | t = re.sub(r"i'm", "i am ", t) 54 | t = re.sub(r"\'s", "", t) 55 | t = re.sub(r"\'re", " are ", t) 56 | t = re.sub(r"\'d", " would ", t) 57 | t = re.sub(r"\'ll", " will ", t) 58 | t = re.sub(r'\s+', ' ', t) 59 | t = re.sub(r"\\", "", t) 60 | t = re.sub(r"\'", "", t) 61 | t = re.sub(r"\"", "", t) 62 | if numbers: 63 | t = re.sub('[^a-zA-Z ?!]+', '', t) 64 | if punctuation: 65 | t = re.sub(r'\W+', ' ', t) 66 | t = remove_non_ascii(t) 67 | t = t.strip() 68 | return t 69 | 70 | def nltk_lemma(word): 71 | ''' If one exists, returns the lemma of a word. 72 | I.e. the base or dictionary version of it. 
73 | ''' 74 | lemma = wn.morphy(word) 75 | if lemma is None: 76 | return word 77 | else: 78 | return lemma 79 | 80 | def tokenize(text, min_char=3, lemma=True, stop=True, 81 | numbers=False): 82 | ''' Tokenizes a text and implements some 83 | transformations. 84 | ''' 85 | tokens = nltk.word_tokenize(text) 86 | tokens = [t for t in tokens if len(t) >= min_char] 87 | if numbers: 88 | tokens = [t for t in tokens if t[0].lower() 89 | in string.ascii_lowercase] 90 | if stop: 91 | tokens = [t for t in tokens if t not in stop_words] 92 | if lemma: 93 | tokens = [nltk_lemma(t) for t in tokens] 94 | return tokens 95 | 96 | def generate_word_cloud(text, no, name=None): 97 | ''' Generates a word cloud bitmap given a 98 | text document (string). 99 | It uses the Term Frequency (TF) and 100 | Inverse Document Frequency (IDF) 101 | vectorization approach to derive the 102 | importance of a word -- represented 103 | by the size of the word in the word cloud. 104 | 105 | Parameters 106 | ========== 107 | text: str 108 | text as the basis 109 | no: int 110 | number of words to be included 111 | ''' 112 | tokens = tokenize(text) 113 | vec = TfidfVectorizer(min_df=2, 114 | analyzer='word', 115 | ngram_range=(1, 2), 116 | stop_words='english' 117 | ) 118 | vec.fit_transform(tokens) 119 | wc = pd.DataFrame({'words': vec.get_feature_names(), 120 | 'tfidf': vec.idf_}) 121 | words = ' '.join(wc.sort_values('tfidf', ascending=True)['words'].head(no)) 122 | wordcloud = WordCloud(max_font_size=110, 123 | background_color='white', 124 | width=1024, height=768, 125 | margin=10, max_words=150).generate(words) 126 | plt.figure(figsize=(10, 10)) 127 | plt.imshow(wordcloud, interpolation='bilinear') 128 | plt.axis('off') 129 | plt.show() 130 | if name is not None: 131 | plt.imsave(name, wordcloud) 132 | -------------------------------------------------------------------------------- /setup_dna_nlp.sh: -------------------------------------------------------------------------------- 1 | # Script to Install 2 | # Linux System Tools and 3 | # Basic Python Components 4 | # as well as to 5 | # Start Jupyter Notebook Server 6 | # 7 | # Python for Algorithmic Trading 8 | # (c) Dr. Yves J. Hilpisch 9 | # The Python Quants GmbH 10 | # 11 | # GENERAL LINUX 12 | printf "Installing system tools.\n\n" 13 | apt-get update # updates the package index cache 14 | apt-get upgrade -y # updates packages 15 | # installs system tools 16 | apt-get install -y git screen htop wget vim bzip2 17 | apt-get install -y build-essential gcc zip default-jre 18 | apt-get install -y poppler-utils # pdf file conversion 19 | apt-get upgrade -y bash # upgrades bash if necessary 20 | 21 | printf "Cleaning up package index cache.\n\n" 22 | apt-get clean # cleans up the package index cache 23 | 24 | # INSTALLING MINICONDA 25 | printf "Installing Miniconda.\n\n" 26 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O \ 27 | Miniconda.sh 28 | bash Miniconda.sh -b # installs Miniconda 29 | rm Miniconda.sh # removes the installer 30 | # prepends the new path for current session 31 | export PATH="/root/miniconda3/bin:$PATH" 32 | # prepends the new path in the shell configuration 33 | echo ". 
/root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc 34 | echo "conda activate" >> ~/.bashrc 35 | 36 | printf "Updating miniconda\n\n" 37 | conda update -y conda 38 | 39 | # INSTALLING PYTHON PACKAGES 40 | printf "Installing Python packages.\n\n" 41 | conda install -y jupyter # Python coding in the browser 42 | conda install -y pytables # HDF5 database wrapper 43 | conda install -y pandas # data analysis package 44 | conda install -y matplotlib # plotting package 45 | conda install -y scikit-learn # machine learning package 46 | conda install -y nltk=3.2.5 # nlp package 47 | conda install -y gensim # nlp package 48 | conda install -y networkx # network graph 49 | conda install -y lxml # xml/html parsing 50 | 51 | pip install --upgrade pip 52 | pip install Cython 53 | pip install cufflinks # combining plotly with pandas 54 | pip install wordcloud 55 | pip install pyvis 56 | 57 | # NLTK PACKAGES 58 | python -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')" 59 | python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('wordnet')" 60 | 61 | # INSTALLING APACHE'S AVRO PACKAGE 62 | printf "Installing avro package.\n" 63 | wget http://mirror.synyx.de/apache/avro/stable/py3/avro-python3-1.8.2.tar.gz 64 | tar xvf avro-python3-1.8.2.tar.gz 65 | cd avro-python3-1.8.2 66 | python setup.py install 67 | cd .. 68 | rm avro-python3-1.8.2.tar.gz 69 | rm -rf avro-python3-1.8.2 70 | 71 | # COPYING FILES AND CREATING DIRECTORIES 72 | mkdir /root/.jupyter 73 | mkdir /root/.jupyter/custom 74 | 75 | cd /root/.jupyter 76 | wget -q http://hilpisch.com/nlp/jupyter_setup.py 77 | printf "Please provide a new password for your Jupyter server.\n" 78 | printf "New password [ENTER]: " 79 | read -s password 80 | printf "\n" 81 | 82 | printf "Repeat password [ENTER]: " 83 | read -s rep_password 84 | printf "\n" 85 | 86 | while [ "$password" = "" -o "$password" != "$rep_password" ] 87 | do 88 | printf "The passwords are empty or not equal, please try again!\n" 89 | printf "New password [ENTER]: " 90 | read -s password 91 | printf "\n" 92 | 93 | printf "Repeat password [ENTER]: " 94 | read -s rep_password 95 | printf "\n" 96 | done 97 | 98 | JUPYTER_URL=$(python jupyter_setup.py $password) 99 | 100 | mkdir /root/notebook 101 | cd /root/notebook 102 | 103 | # CLONING THE REPO 104 | printf "Cloning the DNA NLP Git repository.\n" 105 | git clone --depth=1 http://github.com/yhilpisch/dnanlp 106 | 107 | printf "Downloading additional files.\n" 108 | cd /root/notebook/dnanlp/modules 109 | wget -q http://hilpisch.com/nlp/soiepy.zip 110 | unzip soiepy.zip 111 | rm soiepy.zip 112 | 113 | cd /root/notebook/ 114 | printf "Success.\n" 115 | 116 | # CREATE A SWAP PARTITION 117 | # comment out these lines if not required 118 | wget -q http://hilpisch.com/nlp/create_swap.sh 119 | /bin/bash /root/notebook/create_swap.sh 120 | rm /root/notebook/create_swap.sh 121 | 122 | # STARTING JUPYTER NOTEBOOK 123 | wget -q http://hilpisch.com/nlp/custom.css 124 | mv custom.css /root/.jupyter/custom/custom.css 125 | mkdir logs 126 | touch logs/jupyter.log 127 | nohup jupyter notebook --allow-root > logs/jupyter.log & 128 | 129 | printf "\n\n" 130 | printf "Your Jupyter Server is running. 
To access it, please visit:\n\n" 131 | printf "$JUPYTER_URL\n\n" 132 | -------------------------------------------------------------------------------- /modules/tpqdna.py: -------------------------------------------------------------------------------- 1 | # 2 | # Wrapper Functions for the 3 | # Dow Jones DNA Snapshot API 4 | # 5 | # The Python Quants GmbH 6 | # 7 | import os 8 | import json 9 | import time 10 | import requests 11 | import avro.schema 12 | import pandas as pd 13 | from avro.io import DatumReader 14 | from avro.datafile import DataFileReader 15 | 16 | # snapshot planning 17 | explain_url = 'https://api.dowjones.com/alpha/extractions/documents/_explain' 18 | 19 | # analytics end point 20 | analytics_url = 'https://api.dowjones.com/alpha/analytics' 21 | 22 | # snapshot creation 23 | snapshot_create_url = 'https://api.dowjones.com/alpha/extractions/documents/' 24 | 25 | # snapshot extraction & download list 26 | snapshot_extraction_list_url = 'https://api.dowjones.com/alpha/extractions/' 27 | 28 | djdna_avro_schema = { 29 | "type": "record", 30 | "name": "Delivery", 31 | "namespace": "com.dowjones.dna.avro", 32 | "doc": 33 | "Avro schema for extraction content used by Dow Jones' SyndicationHub", 34 | "fields": [ 35 | {"name": "an", "type": ["string", "null"]}, 36 | {"name": "modification_datetime", "type": ["long", "null"]}, 37 | {"name": "ingestion_datetime", "type": ["long", "null"]}, 38 | {"name": "publication_date", "type": ["long", "null"]}, 39 | {"name": "publication_datetime", "type": ["long", "null"]}, 40 | {"name": "snippet", "type": ["string", "null"]}, 41 | {"name": "body", "type": ["string", "null"]}, 42 | {"name": "art", "type": ["string", "null"]}, 43 | {"name": "action", "type": ["string", "null"]}, 44 | {"name": "credit", "type": ["string", "null"]}, 45 | {"name": "byline", "type": ["string", "null"]}, 46 | {"name": "document_type", "type": ["string", "null"]}, 47 | {"name": "language_code", "type": ["string", "null"]}, 48 | {"name": "title", "type": ["string", "null"]}, 49 | {"name": "copyright", "type": ["string", "null"]}, 50 | {"name": "dateline", "type": ["string", "null"]}, 51 | {"name": "source_code", "type": ["string", "null"]}, 52 | {"name": "modification_date", "type": ["long", "null"]}, 53 | {"name": "section", "type": ["string", "null"]}, 54 | {"name": "company_codes", "type": ["string", "null"]}, 55 | {"name": "publisher_name", "type": ["string", "null"]}, 56 | {"name": "region_of_origin", "type": ["string", "null"]}, 57 | {"name": "word_count", "type": ["int", "null"]}, 58 | {"name": "subject_codes", "type": ["string", "null"]}, 59 | {"name": "region_codes", "type": ["string", "null"]}, 60 | {"name": "industry_codes", "type": ["string", "null"]}, 61 | {"name": "person_codes", "type": ["string", "null"]}, 62 | {"name": "currency_codes", "type": ["string", "null"]}, 63 | {"name": "market_index_codes", "type": ["string", "null"]}, 64 | {"name": "company_codes_about", "type": ["string", "null"]}, 65 | {"name": "company_codes_association", "type": ["string", "null"]}, 66 | {"name": "company_codes_lineage", "type": ["string", "null"]}, 67 | {"name": "company_codes_occur", "type": ["string", "null"]}, 68 | {"name": "company_codes_relevance", "type": ["string", "null"]}, 69 | {"name": "source_name", "type": ["string", "null"]} 70 | ] 71 | } 72 | 73 | 74 | def create_snapshot(query, headers): 75 | ''' Specifies a DNA snapshot. 
76 | ''' 77 | response = requests.request( 78 | 'POST', snapshot_create_url, data=query, headers=headers) 79 | response = response.json() 80 | print(response) 81 | snapshot_create_job_url = response['links']['self'] 82 | # job_status = response['data']['attributes']['current_state'] 83 | return snapshot_create_job_url 84 | 85 | 86 | def run_snapshot(snapshot_url, headers): 87 | ''' Runs the specified DNA snapshot process. 88 | ''' 89 | old_status = '' 90 | job_status = '' 91 | response = '' 92 | while job_status != 'JOB_STATE_DONE': 93 | if job_status != old_status: 94 | print('Job status changed:') 95 | print(job_status) 96 | if job_status == 'JOB_STATE_FAILED': 97 | print('Job failed') 98 | print(response) 99 | break 100 | old_status = job_status 101 | 102 | time.sleep(60) 103 | response = requests.request('GET', snapshot_url, headers=headers) 104 | response = response.json() 105 | job_status = response['data']['attributes']['current_state'] 106 | 107 | snapshot_files_list = list(response['data']['attributes']['files']) 108 | return snapshot_files_list 109 | 110 | 111 | def download_snapshots(snapshot_files, path, headers, verbose=True): 112 | ''' Downloads DNA snapshot data file-by-file given the files list. 113 | ''' 114 | for download_file in snapshot_files: 115 | url = download_file['uri'] 116 | if url[-5:] != '.avro': 117 | continue 118 | filename = url.split('/')[-1] 119 | if verbose: 120 | print('Downloading file {} \r'.format(filename), end='') 121 | download = requests.get(url, headers=headers, 122 | allow_redirects=True, stream=True) 123 | filename = os.path.join(path, filename) 124 | with open(filename, 'wb') as fd: 125 | for chunk in download.iter_content(chunk_size=128): 126 | fd.write(chunk) 127 | 128 | 129 | def avro2dataframe(path, verbose=False): 130 | ''' Transforms DNA snapshot data in a pandas DataFrame object. 
131 | ''' 132 | read_schema = avro.schema.Parse(json.dumps(djdna_avro_schema)) 133 | file_content = list() 134 | files = sorted(os.listdir(path)) 135 | for avro_file in files: 136 | if (os.path.isfile(os.path.join(path, avro_file)) and 137 | avro_file.split('.')[-1] == 'avro'): 138 | if verbose: 139 | print('Reading file {} \r'.format(avro_file), end='') 140 | file_path = os.path.join(path, avro_file) 141 | reader = DataFileReader( 142 | open(file_path, 'rb'), DatumReader(read_schema)) 143 | # new_schema = reader.GetMeta('avro.schema') 144 | users = [] 145 | for user in reader: 146 | users.append(user) 147 | file_content.append(users) 148 | reader.close() 149 | data = [pd.DataFrame(content) for content in file_content] 150 | data = pd.concat(data, ignore_index=True) 151 | return data 152 | -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_01_data.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Hurricane Harvey._\n", "\n", "**Data Retrieval**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import json\n", "import nltk\n", "import pickle\n", "import tpqdna\n", "import warnings\n", "warnings.simplefilter('ignore')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Creation"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Authentication"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["api_key = pickle.load(open('../dna_api_key.pkl', 'rb'))\n", "headers = {\n", " 'user-key': api_key,\n", " 'content-type': 'application/json',\n", " 'cache-control': 'no-cache'\n", "}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Specification"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["where = '(body like \"%Hurricane Harvey%\") AND language_code=\"en\" '\n", "where += 'AND language_code=\"en\" '\n", "where += 'AND publication_date >= \"2017-08-01 00:00:00\" '\n", "where += 'AND publication_date <= \"2017-12-31 00:00:00\" '"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["includes = {} \n", "excludes = {}\n", "limit = 250"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["query = {'query': \n", " {'where': where,\n", " 'includes': includes,\n", " 'exludes': excludes,\n", " 'limit': limit\n", " }}"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["query = json.dumps(query)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'data': {'attributes': {'current_state': 'JOB_QUEUED', 'extraction_type': 'documents'}, 'id': 'dj-synhub-extraction-feccd780582a0af8b40e86439b3ee921-hdlz7k82ki', 'type': 'snapshot'}, 'links': {'self': 
'https://api.dowjones.com/alpha/extractions/documents/dj-synhub-extraction-feccd780582a0af8b40e86439b3ee921-hdlz7k82ki'}}\n", "CPU times: user 40 ms, sys: 4 ms, total: 44 ms\n", "Wall time: 16.1 s\n"]}], "source": ["%time qurl = tpqdna.create_snapshot(query, headers)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Job status changed:\n", "JOB_QUEUED\n", "Job status changed:\n", "JOB_VALIDATING\n", "Job status changed:\n", "JOB_STATE_RUNNING\n", "CPU times: user 2.8 s, sys: 224 ms, total: 3.02 s\n", "Wall time: 1h 45min 15s\n"]}], "source": ["%time fl = tpqdna.run_snapshot(qurl, headers)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Paths"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["project = 'harvey_{}'.format(limit)"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["base_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(base_path, 'data_harvey')\n", "if not os.path.isdir(data_path):\n", " os.mkdir(data_path)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["meta_path = os.path.join(data_path, 'meta')\n", "if not os.path.isdir(meta_path):\n", " os.mkdir(meta_path)\n", "fn = os.path.join(meta_path, 'file_list_{}.pkl'.format(project))"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["# with open(fn, 'wb') as f:\n", "# pickle.dump(fl, f)"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["with open(fn, 'rb') as f:\n", " fl = pickle.load(f)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Retrieval"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot')\n", "if not os.path.isdir(snapshot_path):\n", " os.mkdir(snapshot_path)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 1.1 s, sys: 68 ms, total: 1.17 s\n", "Wall time: 31.6 s\n"]}], "source": ["%time tpqdna.download_snapshots(fl, snapshot_path, headers)"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 208 ms, sys: 4 ms, total: 212 ms\n", "Wall time: 216 ms\n"]}], "source": ["%time data = tpqdna.avro2dataframe(snapshot_path)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "RangeIndex: 250 entries, 0 to 249\n", "Data columns (total 35 columns):\n", "action 250 non-null object\n", "an 250 non-null object\n", "art 250 non-null object\n", "body 250 non-null object\n", "byline 250 non-null object\n", "company_codes 250 non-null object\n", "company_codes_about 250 non-null object\n", "company_codes_association 250 non-null object\n", "company_codes_lineage 250 non-null object\n", "company_codes_occur 250 non-null object\n", "company_codes_relevance 250 non-null object\n", "copyright 250 non-null object\n", "credit 250 non-null object\n", "currency_codes 250 non-null object\n", "dateline 10 non-null object\n", "document_type 250 non-null object\n", "industry_codes 250 non-null object\n", "ingestion_datetime 250 non-null int64\n", "language_code 250 non-null 
object\n", "market_index_codes 250 non-null object\n", "modification_date 0 non-null object\n", "modification_datetime 250 non-null int64\n", "person_codes 250 non-null object\n", "publication_date 250 non-null int64\n", "publication_datetime 250 non-null int64\n", "publisher_name 250 non-null object\n", "region_codes 250 non-null object\n", "region_of_origin 250 non-null object\n", "section 250 non-null object\n", "snippet 250 non-null object\n", "source_code 250 non-null object\n", "source_name 250 non-null object\n", "subject_codes 250 non-null object\n", "title 250 non-null object\n", "word_count 250 non-null int64\n", "dtypes: int64(5), object(30)\n", "memory usage: 68.4+ KB\n"]}], "source": ["data.info()"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fn, 'data', complevel=5, complib='blosc')"]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/03_musk/03_musk_04_ng.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Network Graph Analysis**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import pandas as pd\n", "import ng_functions as ng\n", "import nlp_functions as nlp"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Reading the Data"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["project = 'musk_100'"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["abs_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(abs_path, 'data_musk')"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["results_path = os.path.join(data_path, 'results')"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["fnr = os.path.join(results_path, 'relations_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["data = pd.read_hdf(fnr, 'data')"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Node1RelationNode2
11muskwrote inoct. 4 tweet
13entrepreneurhatesellers
21optionbet onstock decline
24waysbet onfall
25tesla short interestapproach40 million shares
\n", "
"], "text/plain": [" Node1 Relation Node2\n", "11 musk wrote in oct. 4 tweet\n", "13 entrepreneur hate sellers\n", "21 option bet on stock decline\n", "24 ways bet on fall\n", "25 tesla short interest approach 40 million shares"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["data.head()"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["1887"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Network Graph"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Full Graph"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(data)"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["# G.show('ng_musk_01.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Focused Graph"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["entities = ['musk', 'sec', 'tesla', 'tweet']"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["sel = data[data['Node1'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(sel)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", " \n", " "], "text/plain": [""]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["G.show('ng_musk_02.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Focused Graph"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["entities = ['sec', 'settlement']"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["sel_1 = data[data['Node1'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": ["sel_2 = data[data['Node2'].apply(lambda s: s in entities)].copy()"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": ["sel = pd.concat((sel_1, sel_2), ignore_index=True)"]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": ["sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": ["sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()"]}, {"cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": ["g = ng.create_graph(sel)"]}, {"cell_type": "code", "execution_count": 28, 
"metadata": {}, "outputs": [], "source": ["G = ng.plot_graph(g, central_gravity=0.01,\n", " with_edge_label=True,\n", " height='600px', width='80%',\n", " filter_=['physics'])"]}, {"cell_type": "code", "execution_count": 29, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", " \n", " "], "text/plain": [""]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["G.show('ng_musk_03.html')"]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/03_musk/03_musk_03_oie.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Information Extraction**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import nltk\n", "import pandas as pd\n", "import soiepy.main as ie\n", "import nlp_functions as nlp"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Data"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["project = 'musk_100'"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["abs_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(abs_path, 'data_musk')"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot')"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["raw = pd.read_hdf(fn, 'data')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Preprocessing"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 246 ms, sys: 7.59 ms, total: 253 ms\n", "Wall time: 252 ms\n"]}], "source": ["%time raw['body'] = raw['body'].apply(nlp.clean_up_text) "]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["data = raw['body'].values.tolist() "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 200 ms, sys: 0 ns, total: 200 ms\n", "Wall time: 199 ms\n"]}], "source": ["%%time\n", "s = [nltk.sent_tokenize(a) for a in data] \n", "s = [_ for sl in s for _ in sl] "]}, {"cell_type": "code", 
"execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["['after six years of reflection, he returned to the subject.',\n", " 'the last several years have taught me that they are indeed reasonably maligned, musk wrote in an oct. 4 tweet.']"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["s[:2]"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["token_path = os.path.join(data_path, 'tokens') \n", "if not os.path.isdir(token_path):\n", " os.mkdir(token_path)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(token_path, 'tokens_{}_{}.txt') "]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["steps = 250\n", "for c, i in enumerate(range(0, len(s), steps)):\n", " with open(fn.format(project, c), 'w') as f:\n", " f.writelines([_ + '\\n' for _ in s[i:i + steps - 1]]) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Relations"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["results_path = os.path.join(data_path, 'results') \n", "if not os.path.isdir(results_path):\n", " os.mkdir(results_path)\n", "fnr = os.path.join(results_path, 'relations_{}.h5'.format(project)) "]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["fl = sorted(os.listdir(token_path))\n", "d = pd.DataFrame()\n", "fno = len(fl)"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 18.7 ms, sys: 292 \u00b5s, total: 19 ms\n", "Wall time: 17.2 ms\n"]}], "source": ["%%time\n", "try:\n", " d = pd.read_hdf(fnr, 'raw') \n", "except:\n", " for i, fn in enumerate(fl):\n", " filename = os.path.join(token_path, fn)\n", " msg = 'Processing file {} of {} \\r'\n", " print(msg.format(i + 1, fno), end='')\n", " r = ie.stanford_ie(filename, verbose=False) \n", " dt = pd.DataFrame(r)\n", " if len(d) == 0:\n", " d = dt\n", " else:\n", " d = pd.concat((d, dt), ignore_index=True)"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": ["d = d.iloc[:, :3]"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": ["d.columns = ['Node1', 'Relation', 'Node2']"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [{"data": {"text/plain": ["11385"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["len(d)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Post Processing"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["data = d.copy()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Basic Processing"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": ["data = data.applymap(lambda s: s.strip()) "]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": ["data = data[data.applymap(lambda s: not s in nlp.stop_words)].dropna() "]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": ["data = data[data.applymap(lambda s: not s.startswith('http'))].dropna() "]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": ["data = data.applymap(lambda s: nlp.nltk_lemma(s)) "]}, {"cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [{"data": 
{"text/plain": ["7935"]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Removing Duplicates"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": ["def join_columns(row):\n", " return ' '.join([row['Node1'], row['Relation'], row['Node2']]) "]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": ["vec = nlp.TfidfVectorizer(stop_words='english')"]}, {"cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": ["data['Join'] = data.apply(lambda row: join_columns(row), axis=1) "]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": ["mat = vec.fit_transform(data['Join'].values.tolist())"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 113 ms, sys: 374 ms, total: 487 ms\n", "Wall time: 485 ms\n"]}], "source": ["%time sim = (mat * mat.T).A "]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": ["data['Keep'] = True"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 1min 2s, sys: 103 ms, total: 1min 2s\n", "Wall time: 1min 2s\n"]}], "source": ["%%time\n", "for i, ind_i in enumerate(data.index):\n", " for j, ind_j in enumerate(data.index):\n", " if j > i:\n", " simsc = sim[i, j]\n", " if simsc > 0.5:\n", " data.loc[ind_j, 'Keep'] = False "]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": ["data = data.iloc[:, :3][data['Keep'] == True] "]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [{"data": {"text/plain": ["1887"]}, "execution_count": 36, "metadata": {}, "output_type": "execute_result"}], "source": ["len(data)"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Node1RelationNode2
11muskwrote inoct. 4 tweet
13entrepreneurhatesellers
21optionbet onstock decline
24waysbet onfall
25tesla short interestapproach40 million shares
\n", "
"], "text/plain": [" Node1 Relation Node2\n", "11 musk wrote in oct. 4 tweet\n", "13 entrepreneur hate sellers\n", "21 option bet on stock decline\n", "24 ways bet on fall\n", "25 tesla short interest approach 40 million shares"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["data.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Storing Results"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["d.to_hdf(fnr, 'raw', complevel=5, complib='blosc') "]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fnr, 'data', complevel=5, complib='blosc') "]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /data/eikon_eod_tsla_data.csv: -------------------------------------------------------------------------------- 1 | Date,CLOSE 2 | 2017-01-03,216.99 3 | 2017-01-04,226.99 4 | 2017-01-05,226.75 5 | 2017-01-06,229.01 6 | 2017-01-09,231.28 7 | 2017-01-10,229.87 8 | 2017-01-11,229.73 9 | 2017-01-12,229.59 10 | 2017-01-13,237.75 11 | 2017-01-17,235.58 12 | 2017-01-18,238.36 13 | 2017-01-19,243.76 14 | 2017-01-20,244.73 15 | 2017-01-23,248.92 16 | 2017-01-24,254.61 17 | 2017-01-25,254.47 18 | 2017-01-26,252.51 19 | 2017-01-27,252.95 20 | 2017-01-30,250.63 21 | 2017-01-31,251.93 22 | 2017-02-01,249.24 23 | 2017-02-02,251.55 24 | 2017-02-03,251.33 25 | 2017-02-06,257.77 26 | 2017-02-07,257.48 27 | 2017-02-08,262.08 28 | 2017-02-09,269.2 29 | 2017-02-10,269.23 30 | 2017-02-13,280.6 31 | 2017-02-14,280.98 32 | 2017-02-15,279.76 33 | 2017-02-16,268.95 34 | 2017-02-17,272.23 35 | 2017-02-21,277.39 36 | 2017-02-22,273.51 37 | 2017-02-23,255.99 38 | 2017-02-24,257.0 39 | 2017-02-27,246.23 40 | 2017-02-28,249.99 41 | 2017-03-01,250.02 42 | 2017-03-02,250.48 43 | 2017-03-03,251.57 44 | 2017-03-06,251.21 45 | 2017-03-07,248.59 46 | 2017-03-08,246.87 47 | 2017-03-09,244.9 48 | 2017-03-10,243.69 49 | 2017-03-13,246.17 50 | 2017-03-14,258.0 51 | 2017-03-15,255.73 52 | 2017-03-16,262.05 53 | 2017-03-17,261.5 54 | 2017-03-20,261.92 55 | 2017-03-21,250.68 56 | 2017-03-22,255.01 57 | 2017-03-23,254.78 58 | 2017-03-24,263.16 59 | 2017-03-27,270.22 60 | 2017-03-28,277.45 61 | 2017-03-29,277.38 62 | 2017-03-30,277.92 63 | 2017-03-31,278.3 64 | 2017-04-03,298.52 65 | 2017-04-04,303.7 66 | 2017-04-05,295.0 67 | 2017-04-06,298.7 68 | 2017-04-07,302.54 69 | 2017-04-10,312.39 70 | 2017-04-11,308.71 71 | 2017-04-12,296.84 72 | 2017-04-13,304.0 73 | 2017-04-17,301.44 74 | 2017-04-18,300.25 75 | 2017-04-19,305.52 76 | 2017-04-20,302.51 77 | 2017-04-21,305.6 78 | 2017-04-24,308.03 79 | 2017-04-25,313.79 80 | 2017-04-26,310.17 81 | 2017-04-27,308.63 82 | 2017-04-28,314.07 83 | 2017-05-01,322.83 84 | 2017-05-02,318.89 85 | 2017-05-03,311.02 86 | 2017-05-04,295.46 87 | 2017-05-05,308.35 88 | 2017-05-08,307.19 89 | 2017-05-09,321.26 90 | 2017-05-10,325.22 91 | 2017-05-11,323.1 92 | 2017-05-12,324.81 93 | 2017-05-15,315.88 94 | 2017-05-16,317.01 95 | 2017-05-17,306.11 96 | 2017-05-18,313.06 97 | 2017-05-19,310.83 98 | 2017-05-22,310.35 99 | 
2017-05-23,303.86 100 | 2017-05-24,310.22 101 | 2017-05-25,316.83 102 | 2017-05-26,325.14 103 | 2017-05-30,335.1 104 | 2017-05-31,341.01 105 | 2017-06-01,340.37 106 | 2017-06-02,339.85 107 | 2017-06-05,347.32 108 | 2017-06-06,352.85 109 | 2017-06-07,359.65 110 | 2017-06-08,370.0 111 | 2017-06-09,357.32 112 | 2017-06-12,359.01 113 | 2017-06-13,375.95 114 | 2017-06-14,380.66 115 | 2017-06-15,375.34 116 | 2017-06-16,371.4 117 | 2017-06-19,369.8 118 | 2017-06-20,372.24 119 | 2017-06-21,376.4 120 | 2017-06-22,382.61 121 | 2017-06-23,383.45 122 | 2017-06-26,377.49 123 | 2017-06-27,362.37 124 | 2017-06-28,371.24 125 | 2017-06-29,360.75 126 | 2017-06-30,361.61 127 | 2017-07-03,352.62 128 | 2017-07-05,327.09 129 | 2017-07-06,308.83 130 | 2017-07-07,313.22 131 | 2017-07-10,316.05 132 | 2017-07-11,327.22 133 | 2017-07-12,329.52 134 | 2017-07-13,323.41 135 | 2017-07-14,327.78 136 | 2017-07-17,319.57 137 | 2017-07-18,328.24 138 | 2017-07-19,325.26 139 | 2017-07-20,329.92 140 | 2017-07-21,328.4 141 | 2017-07-24,342.52 142 | 2017-07-25,339.6 143 | 2017-07-26,343.85 144 | 2017-07-27,334.46 145 | 2017-07-28,335.07 146 | 2017-07-31,323.47 147 | 2017-08-01,319.57 148 | 2017-08-02,325.89 149 | 2017-08-03,347.09 150 | 2017-08-04,356.91 151 | 2017-08-07,355.17 152 | 2017-08-08,365.22 153 | 2017-08-09,363.53 154 | 2017-08-10,355.4 155 | 2017-08-11,357.87 156 | 2017-08-14,363.8 157 | 2017-08-15,362.33 158 | 2017-08-16,362.91 159 | 2017-08-17,351.92 160 | 2017-08-18,347.46 161 | 2017-08-21,337.86 162 | 2017-08-22,341.35 163 | 2017-08-23,352.77 164 | 2017-08-24,352.93 165 | 2017-08-25,348.05 166 | 2017-08-28,345.66 167 | 2017-08-29,347.36 168 | 2017-08-30,353.18 169 | 2017-08-31,355.9 170 | 2017-09-01,355.4 171 | 2017-09-05,349.59 172 | 2017-09-06,344.53 173 | 2017-09-07,350.61 174 | 2017-09-08,343.4 175 | 2017-09-11,363.69 176 | 2017-09-12,362.75 177 | 2017-09-13,366.23 178 | 2017-09-14,377.64 179 | 2017-09-15,379.81 180 | 2017-09-18,385.0 181 | 2017-09-19,375.1 182 | 2017-09-20,373.91 183 | 2017-09-21,366.48 184 | 2017-09-22,351.09 185 | 2017-09-25,344.99 186 | 2017-09-26,345.25 187 | 2017-09-27,340.97 188 | 2017-09-28,339.6 189 | 2017-09-29,341.1 190 | 2017-10-02,341.53 191 | 2017-10-03,348.14 192 | 2017-10-04,355.01 193 | 2017-10-05,355.33 194 | 2017-10-06,356.88 195 | 2017-10-09,342.94 196 | 2017-10-10,355.59 197 | 2017-10-11,354.6 198 | 2017-10-12,355.68 199 | 2017-10-13,355.57 200 | 2017-10-16,350.6 201 | 2017-10-17,355.75 202 | 2017-10-18,359.65 203 | 2017-10-19,351.81 204 | 2017-10-20,345.1 205 | 2017-10-23,337.02 206 | 2017-10-24,337.34 207 | 2017-10-25,325.84 208 | 2017-10-26,326.17 209 | 2017-10-27,320.87 210 | 2017-10-30,320.08 211 | 2017-10-31,331.53 212 | 2017-11-01,321.08 213 | 2017-11-02,299.26 214 | 2017-11-03,306.09 215 | 2017-11-06,302.78 216 | 2017-11-07,306.05 217 | 2017-11-08,304.39 218 | 2017-11-09,302.99 219 | 2017-11-10,302.99 220 | 2017-11-13,315.4 221 | 2017-11-14,308.7 222 | 2017-11-15,311.3 223 | 2017-11-16,312.5 224 | 2017-11-17,315.05 225 | 2017-11-20,308.74 226 | 2017-11-21,317.81 227 | 2017-11-22,312.6 228 | 2017-11-24,315.55 229 | 2017-11-27,316.81 230 | 2017-11-28,317.55 231 | 2017-11-29,307.54 232 | 2017-11-30,308.85 233 | 2017-12-01,306.53 234 | 2017-12-04,305.2 235 | 2017-12-05,303.7 236 | 2017-12-06,313.26 237 | 2017-12-07,311.24 238 | 2017-12-08,315.13 239 | 2017-12-11,328.91 240 | 2017-12-12,341.03 241 | 2017-12-13,339.03 242 | 2017-12-14,337.89 243 | 2017-12-15,343.45 244 | 2017-12-18,338.87 245 | 2017-12-19,331.1 246 | 2017-12-20,328.98 247 | 2017-12-21,331.66 248 | 
2017-12-22,325.2 249 | 2017-12-26,317.29 250 | 2017-12-27,311.64 251 | 2017-12-28,315.36 252 | 2017-12-29,311.35 253 | 2018-01-02,320.53 254 | 2018-01-03,317.25 255 | 2018-01-04,314.62 256 | 2018-01-05,316.58 257 | 2018-01-08,336.41 258 | 2018-01-09,333.69 259 | 2018-01-10,334.8 260 | 2018-01-11,337.95 261 | 2018-01-12,336.22 262 | 2018-01-16,340.06 263 | 2018-01-17,347.16 264 | 2018-01-18,344.57 265 | 2018-01-19,350.02 266 | 2018-01-22,351.56 267 | 2018-01-23,352.79 268 | 2018-01-24,345.89 269 | 2018-01-25,337.64 270 | 2018-01-26,342.85 271 | 2018-01-29,349.53 272 | 2018-01-30,345.82 273 | 2018-01-31,354.31 274 | 2018-02-01,349.25 275 | 2018-02-02,343.75 276 | 2018-02-05,333.13 277 | 2018-02-06,333.97 278 | 2018-02-07,345.0 279 | 2018-02-08,315.23 280 | 2018-02-09,310.42 281 | 2018-02-12,315.73 282 | 2018-02-13,323.66 283 | 2018-02-14,322.31 284 | 2018-02-15,334.065 285 | 2018-02-16,335.49 286 | 2018-02-20,334.77 287 | 2018-02-21,333.3 288 | 2018-02-22,346.17 289 | 2018-02-23,352.05 290 | 2018-02-26,357.42 291 | 2018-02-27,350.99 292 | 2018-02-28,343.06 293 | 2018-03-01,330.93 294 | 2018-03-02,335.12 295 | 2018-03-05,333.35 296 | 2018-03-06,328.2 297 | 2018-03-07,332.3 298 | 2018-03-08,329.1 299 | 2018-03-09,327.17 300 | 2018-03-12,345.51 301 | 2018-03-13,341.84 302 | 2018-03-14,326.63 303 | 2018-03-15,325.6 304 | 2018-03-16,321.35 305 | 2018-03-19,313.56 306 | 2018-03-20,310.55 307 | 2018-03-21,316.53 308 | 2018-03-22,309.1 309 | 2018-03-23,301.54 310 | 2018-03-26,304.18 311 | 2018-03-27,279.18 312 | 2018-03-28,257.78 313 | 2018-03-29,266.13 314 | 2018-04-02,252.48 315 | 2018-04-03,267.53 316 | 2018-04-04,286.94 317 | 2018-04-05,305.72 318 | 2018-04-06,299.3 319 | 2018-04-09,289.66 320 | 2018-04-10,304.7 321 | 2018-04-11,300.93 322 | 2018-04-12,294.08 323 | 2018-04-13,300.34 324 | 2018-04-16,291.21 325 | 2018-04-17,287.69 326 | 2018-04-18,293.35 327 | 2018-04-19,300.08 328 | 2018-04-20,290.24 329 | 2018-04-23,283.37 330 | 2018-04-24,283.46 331 | 2018-04-25,280.69 332 | 2018-04-26,285.48 333 | 2018-04-27,294.075 334 | 2018-04-30,293.9 335 | 2018-05-01,299.92 336 | 2018-05-02,301.15 337 | 2018-05-03,284.45 338 | 2018-05-04,294.09 339 | 2018-05-07,302.77 340 | 2018-05-08,301.97 341 | 2018-05-09,306.85 342 | 2018-05-10,305.02 343 | 2018-05-11,301.06 344 | 2018-05-14,291.97 345 | 2018-05-15,284.18 346 | 2018-05-16,286.48 347 | 2018-05-17,284.54 348 | 2018-05-18,276.82 349 | 2018-05-21,284.49 350 | 2018-05-22,275.01 351 | 2018-05-23,279.07 352 | 2018-05-24,277.85 353 | 2018-05-25,278.85 354 | 2018-05-29,283.76 355 | 2018-05-30,291.72 356 | 2018-05-31,284.73 357 | 2018-06-01,291.82 358 | 2018-06-04,296.74 359 | 2018-06-05,291.13 360 | 2018-06-06,319.5 361 | 2018-06-07,316.09 362 | 2018-06-08,317.66 363 | 2018-06-11,332.1 364 | 2018-06-12,342.77 365 | 2018-06-13,344.78 366 | 2018-06-14,357.72 367 | 2018-06-15,358.17 368 | 2018-06-18,370.83 369 | 2018-06-19,352.55 370 | 2018-06-20,362.22 371 | 2018-06-21,347.51 372 | 2018-06-22,333.63 373 | 2018-06-25,333.01 374 | 2018-06-26,342.0 375 | 2018-06-27,344.5 376 | 2018-06-28,349.93 377 | 2018-06-29,342.95 378 | 2018-07-02,335.07 379 | 2018-07-03,310.86 380 | 2018-07-05,309.16 381 | 2018-07-06,308.9 382 | 2018-07-09,318.51 383 | 2018-07-10,322.47 384 | 2018-07-11,318.96 385 | 2018-07-12,316.71 386 | 2018-07-13,318.87 387 | 2018-07-16,310.1 388 | 2018-07-17,322.69 389 | 2018-07-18,323.85 390 | 2018-07-19,320.23 391 | 2018-07-20,313.58 392 | 2018-07-23,303.2 393 | 2018-07-24,297.43 394 | 2018-07-25,308.74 395 | 2018-07-26,306.65 396 | 2018-07-27,297.18 
397 | 2018-07-30,290.17 398 | 2018-07-31,298.14 399 | 2018-08-01,300.84 400 | 2018-08-02,349.54 401 | 2018-08-03,348.17 402 | 2018-08-06,341.99 403 | 2018-08-07,379.57 404 | 2018-08-08,370.34 405 | 2018-08-09,352.45 406 | 2018-08-10,355.49 407 | 2018-08-13,356.41 408 | 2018-08-14,347.64 409 | 2018-08-15,338.69 410 | 2018-08-16,335.45 411 | 2018-08-17,305.5 412 | 2018-08-20,308.44 413 | 2018-08-21,321.9 414 | 2018-08-22,321.64 415 | 2018-08-23,320.1 416 | 2018-08-24,322.82 417 | 2018-08-27,319.27 418 | 2018-08-28,311.86 419 | 2018-08-29,305.01 420 | 2018-08-30,303.15 421 | 2018-08-31,301.66 422 | 2018-09-04,288.95 423 | 2018-09-05,280.74 424 | 2018-09-06,280.95 425 | 2018-09-07,263.24 426 | 2018-09-10,285.5 427 | 2018-09-11,279.44 428 | 2018-09-12,290.54 429 | 2018-09-13,289.46 430 | 2018-09-14,295.2 431 | 2018-09-17,294.84 432 | 2018-09-18,284.96 433 | 2018-09-19,299.02 434 | 2018-09-20,298.33 435 | 2018-09-21,299.1 436 | 2018-09-24,299.68 437 | 2018-09-25,300.99 438 | 2018-09-26,309.58 439 | 2018-09-27,307.52 440 | 2018-09-28,264.77 441 | 2018-10-01,310.7 442 | 2018-10-02,301.02 443 | 2018-10-03,294.8 444 | 2018-10-04,281.83 445 | 2018-10-05,261.95 446 | 2018-10-08,250.56 447 | 2018-10-09,262.8 448 | 2018-10-10,256.88 449 | 2018-10-11,252.23 450 | 2018-10-12,258.78 451 | 2018-10-15,259.59 452 | 2018-10-16,276.59 453 | 2018-10-17,271.78 454 | 2018-10-18,263.91 455 | 2018-10-19,260.0 456 | 2018-10-22,260.95 457 | 2018-10-23,294.14 458 | 2018-10-24,288.5 459 | 2018-10-25,314.86 460 | 2018-10-26,330.9 461 | 2018-10-29,334.85 462 | 2018-10-30,329.9 463 | 2018-10-31,337.32 464 | -------------------------------------------------------------------------------- /code/03_musk/03_musk_01_data.ipynb: -------------------------------------------------------------------------------- 1 | {"cells": [{"cell_type": "markdown", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "metadata": {}, "source": ["# Dow Jones DNA NLP Case Study\n", "\n", "_Based on news articles related to Elon Musk, Twitter & Tesla._\n", "\n", "**Data Retrieval**\n", "\n", "Dr Yves J Hilpisch | Michael Schwed\n", "\n", "The Python Quants GmbH"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## The Imports"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": ["import os\n", "import sys\n", "sys.path.append('../../modules/')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import json\n", "import nltk\n", "import pickle\n", "import tpqdna\n", "import warnings\n", "warnings.simplefilter('ignore')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Snapshot Creation"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Authentication"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["# expects the DNA API key to be stored in plain text as a Python pickle file\n", "api_key = pickle.load(open('../dna_api_key.pkl', 'rb')) \n", "headers = {\n", " 'user-key': api_key,\n", " 'content-type': 'application/json',\n", " 'cache-control': 'no-cache'\n", "} "]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Specification"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["where = '(body like \"%Musk.%\" OR body like \"%Musk,%\" '\n", "where += 'OR body like \"%Musk %\" '\n", "where += 'OR body like \"%Tesla.%\" OR body like \"%Tesla,%\" '\n", "where += 'OR body like \"%Tesla %\") '\n", "where += 'AND (body like \"%tweet.%\" OR body like \"%tweet,%\" 
'\n", "where += 'OR body like \"%tweet %\" '\n", "where += 'OR body like \"%Twitter.%\" OR body like \"%Twitter,%\" '\n", "where += 'OR body like \"%Twitter %\" ) ' \n", "where += 'AND language_code=\"en\" '\n", "where += 'AND publication_date >= \"2018-07-23 00:00:00\" '\n", "where += 'AND publication_date <= \"2018-10-29 23:59:59\" ' "]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["includes = {} \n", "excludes = {}\n", "limit = 100"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": ["query = {'query': \n", " {'where': where,\n", " 'includes': includes,\n", " 'exludes': excludes,\n", " 'limit': limit\n", " }}"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": ["query = json.dumps(query)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["# %time qurl = tpqdna.create_snapshot(query, headers) "]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# %time fl = tpqdna.run_snapshot(qurl, headers) "]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Paths"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["project = 'musk_{}'.format(limit) "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["base_path = os.path.abspath('../../')"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["data_path = os.path.join(base_path, 'data_musk') \n", "if not os.path.isdir(data_path):\n", " os.mkdir(data_path)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["meta_path = os.path.join(data_path, 'meta') \n", "if not os.path.isdir(meta_path):\n", " os.mkdir(meta_path)\n", "fn = os.path.join(meta_path, 'file_list_{}.pkl'.format(project))"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["# with open(fn, 'wb') as f:\n", "# pickle.dump(fl, f) "]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["with open(fn, 'rb') as f:\n", " fl = pickle.load(f)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Data Retrieval"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["snapshot_path = os.path.join(data_path, 'snapshot') \n", "if not os.path.isdir(snapshot_path):\n", " os.mkdir(snapshot_path)"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["# %time tpqdna.download_snapshots(fl, snapshot_path, headers) "]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["CPU times: user 143 ms, sys: 7.56 ms, total: 151 ms\n", "Wall time: 155 ms\n"]}], "source": ["%time data = tpqdna.avro2dataframe(snapshot_path) "]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "RangeIndex: 100 entries, 0 to 99\n", "Data columns (total 35 columns):\n", "action 100 non-null object\n", "an 100 non-null object\n", "art 100 non-null object\n", "body 100 non-null object\n", "byline 100 non-null object\n", "company_codes 100 non-null object\n", "company_codes_about 100 non-null object\n", "company_codes_association 100 non-null object\n", "company_codes_lineage 100 non-null object\n", "company_codes_occur 100 non-null object\n", "company_codes_relevance 100 non-null 
object\n", "copyright 100 non-null object\n", "credit 100 non-null object\n", "currency_codes 100 non-null object\n", "dateline 5 non-null object\n", "document_type 100 non-null object\n", "industry_codes 100 non-null object\n", "ingestion_datetime 100 non-null int64\n", "language_code 100 non-null object\n", "market_index_codes 100 non-null object\n", "modification_date 0 non-null object\n", "modification_datetime 100 non-null int64\n", "person_codes 100 non-null object\n", "publication_date 100 non-null int64\n", "publication_datetime 100 non-null int64\n", "publisher_name 100 non-null object\n", "region_codes 100 non-null object\n", "region_of_origin 100 non-null object\n", "section 100 non-null object\n", "snippet 100 non-null object\n", "source_code 100 non-null object\n", "source_name 100 non-null object\n", "subject_codes 100 non-null object\n", "title 100 non-null object\n", "word_count 100 non-null int64\n", "dtypes: int64(5), object(30)\n", "memory usage: 27.4+ KB\n"]}], "source": ["data.info() "]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
source_nametitleword_count
0Barron'sWhy Musk Is Wrong About Shorts1031
1San Francisco Chronicle: Web EditionTesla owners fume about delays for service1016
2ArabianBusiness.comSaudi fund PIF said to mull investment in Tesl...240
3DigitSpaceX to announce the name of its first touri...417
4U-WireWe\u2019ve All Been Pronouncing Chrissy Teigen\u2019s La...292
5Dow Jones Institutional NewsPublic Bravado, Private Doubts: How Elon Musk'...1786
6Benzinga.comTesla Zaps Go-Private Plans; Wall Street Reacts581
7The Canadian PressTesla stock drops closer to pre-Musk tweet level308
\n", "
"], "text/plain": [" source_name \\\n", "0 Barron's \n", "1 San Francisco Chronicle: Web Edition \n", "2 ArabianBusiness.com \n", "3 Digit \n", "4 U-Wire \n", "5 Dow Jones Institutional News \n", "6 Benzinga.com \n", "7 The Canadian Press \n", "\n", " title word_count \n", "0 Why Musk Is Wrong About Shorts 1031 \n", "1 Tesla owners fume about delays for service 1016 \n", "2 Saudi fund PIF said to mull investment in Tesl... 240 \n", "3 SpaceX to announce the name of its first touri... 417 \n", "4 We\u2019ve All Been Pronouncing Chrissy Teigen\u2019s La... 292 \n", "5 Public Bravado, Private Doubts: How Elon Musk'... 1786 \n", "6 Tesla Zaps Go-Private Plans; Wall Street Reacts 581 \n", "7 Tesla stock drops closer to pre-Musk tweet level 308 "]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["data[['source_name', 'title', 'word_count']].head(8)"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": ["fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project)) "]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["data.to_hdf(fn, 'data', complevel=5, complib='blosc') "]}, {"cell_type": "markdown", "metadata": {}, "source": [""]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7"}}, "nbformat": 4, "nbformat_minor": 2} -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_04_ng.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dow Jones DNA NLP Case Study\n", 15 | "\n", 16 | "_Based on news articles related to Hurricane Harvey._\n", 17 | "\n", 18 | "**Network Graph Analysis**\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## The Imports" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import sys\n", 40 | "sys.path.append('../../modules')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "import ng_functions as ng\n", 51 | "import nlp_functions as nlp" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "project = 'harvey_250'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "abs_path = os.path.abspath('../../')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "data_path = os.path.join(abs_path, 'data_harvey')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "results_path = os.path.join(data_path, 
'results')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Reading the Data" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 8, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "relations_harvey_250.h5\r\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "!ls ../../data_harvey/results" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "data = pd.read_hdf(fn, 'data')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "
\n", 141 | "\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
Node1RelationNode2
0hurricane irmahas strengthened tocategory
1peopleis inleeward islands of caribbean
2homeis inaffected areas
6significant eventdirector ofcaribbean disaster emergency management agency
14forecasterexpectstorm
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " Node1 Relation \\\n", 200 | "0 hurricane irma has strengthened to \n", 201 | "1 people is in \n", 202 | "2 home is in \n", 203 | "6 significant event director of \n", 204 | "14 forecaster expect \n", 205 | "\n", 206 | " Node2 \n", 207 | "0 category \n", 208 | "1 leeward islands of caribbean \n", 209 | "2 affected areas \n", 210 | "6 caribbean disaster emergency management agency \n", 211 | "14 storm " 212 | ] 213 | }, 214 | "execution_count": 10, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Network Graph" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Full Graph" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 11, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "g = ng.create_graph(data.iloc[:1000])" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 12, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "G = ng.plot_graph(g, central_gravity=0.01,\n", 253 | " with_edge_label=True,\n", 254 | " height='600px', width='80%',\n", 255 | " filter_=['physics'])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 13, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# G.show('ng_harvey_01.html')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "### Focused Graph" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 14, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "entities = ['hurricane', 'houston', 'government','trump']" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 15, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "sel_1 = data[data['Node1'].apply(lambda s: s in entities)].copy()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 16, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "sel_2 = data[data['Node2'].apply(lambda s: s in entities)].copy()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 17, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "sel = pd.concat((sel_1, sel_2), ignore_index=True)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 18, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "sel = sel.applymap(lambda s: ' '.join(nlp.tokenize(s)))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 19, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "sel = sel[sel.applymap(lambda s: len(s.split()) <= 1)].dropna()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 20, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "g = ng.create_graph(sel)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 21, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "G = ng.plot_graph(g, central_gravity=0.01,\n", 344 | " with_edge_label=True,\n", 345 | " height='600px', width='80%',\n", 346 | " filter_=['physics'])" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/html": [ 357 | "\n", 
358 | " \n", 365 | " " 366 | ], 367 | "text/plain": [ 368 | "" 369 | ] 370 | }, 371 | "execution_count": 22, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "G.show('ng_harvey_02.html')" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "" 385 | ] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.6.7" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /code/04_harvey/04_harvey_03_oie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Dow Jones DNA NLP Case Study\n", 15 | "\n", 16 | "_Based on news articles related to Hurricane Harvey._\n", 17 | "\n", 18 | "**Information Extraction**\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## The Imports" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import sys\n", 40 | "sys.path.append('../../modules')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import nltk\n", 50 | "import pandas as pd\n", 51 | "import soiepy.main as ie\n", 52 | "import nlp_functions as nlp" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Snapshot Data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "project = 'harvey_250'" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "abs_path = os.path.abspath('../../')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "data_path = os.path.join(abs_path, 'data_harvey')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "snapshot_path = os.path.join(data_path, 'snapshot')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "fn = os.path.join(snapshot_path, 'snapshot_{}.h5'.format(project))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 8, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "raw = pd.read_hdf(fn, 'data')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Preprocessing" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | 
"metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "CPU times: user 954 ms, sys: 31 ms, total: 985 ms\n", 133 | "Wall time: 984 ms\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "%time raw['body'] = raw['body'].apply(nlp.clean_up_text)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 10, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "data = raw['body'].values.tolist()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 11, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "CPU times: user 576 ms, sys: 11.7 ms, total: 587 ms\n", 160 | "Wall time: 588 ms\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "%%time\n", 166 | "s = [nltk.sent_tokenize(a) for a in data]\n", 167 | "s = [_ for sl in s for _ in sl]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 12, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "['hurricane irma has strengthened to a category 5 storm - the most severe designation for a hurricane, the national hurricane center said tuesday.',\n", 179 | " 'people in the leeward islands of the caribbean are preparing for the irma arrival late tuesday or early wednesday, as areas to the northwest, from puerto rico to cuba to the coastal united states, wait to see the track the storm will take as the week progresses.']" 180 | ] 181 | }, 182 | "execution_count": 12, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "s[:2]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 13, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "token_path = os.path.join(data_path, 'tokens')\n", 198 | "if not os.path.isdir(token_path):\n", 199 | " os.mkdir(token_path)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 14, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "fn = os.path.join(token_path, 'tokens_{}_{}.txt')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 15, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "steps = 250\n", 218 | "for c, i in enumerate(range(0, len(s), steps)):\n", 219 | " with open(fn.format(project, c), 'w') as f:\n", 220 | " f.writelines([_ + '\\n' for _ in s[i:i + steps - 1]])" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Relations" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 16, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "results_path = os.path.join(data_path, 'results')\n", 237 | "if not os.path.isdir(results_path):\n", 238 | " os.mkdir(results_path)\n", 239 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 17, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "fl = sorted(os.listdir(token_path))\n", 249 | "d = pd.DataFrame()\n", 250 | "fno = len(fl)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 18, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "CPU times: user 3.03 s, sys: 943 ms, total: 3.97 s\n", 263 | "Wall time: 28min 25s\n" 264 | ] 265 | } 266 | ], 
267 | "source": [ 268 | "%%time\n", 269 | "try:\n", 270 | " d = pd.read_hdf(fn, 'raw')\n", 271 | "except:\n", 272 | " for i, fn in enumerate(fl):\n", 273 | " filename = os.path.join(token_path, fn)\n", 274 | " msg = 'Processing file {} of {} \\r'\n", 275 | " print(msg.format(i + 1, fno), end='')\n", 276 | " r = ie.stanford_ie(filename, verbose=False)\n", 277 | " dt = pd.DataFrame(r)\n", 278 | " if len(d) == 0:\n", 279 | " d = dt\n", 280 | " else:\n", 281 | " d = pd.concat((d, dt), ignore_index=True)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 19, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "d = d.iloc[:, :3]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 20, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "d.columns = ['Node1', 'Relation', 'Node2']" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 21, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "50660" 311 | ] 312 | }, 313 | "execution_count": 21, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "len(d)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Post Processing" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 22, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "data = d.copy()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Basic Processing" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 23, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "data = data.applymap(lambda s: s.strip())" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 24, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "data = data[data.applymap(lambda s: not s in nlp.stop_words)].dropna()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 25, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "data = data[data.applymap(lambda s: not s.startswith('http'))].dropna()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 26, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "data = data.applymap(lambda s: nlp.nltk_lemma(s))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 27, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/plain": [ 389 | "31921" 390 | ] 391 | }, 392 | "execution_count": 27, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "len(data)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "### Removing Duplicates" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 28, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "def join_columns(row):\n", 415 | " return ' '.join([row['Node1'], row['Relation'], row['Node2']])" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 29, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "vec = nlp.TfidfVectorizer(stop_words='english')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 30, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "data['Join'] = data.apply(lambda 
row: join_columns(row), axis=1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 31, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "mat = vec.fit_transform(data['Join'].values.tolist())" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 32, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "CPU times: user 2.17 s, sys: 6.89 s, total: 9.06 s\n", 455 | "Wall time: 8.74 s\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "%time sim = (mat * mat.T).A" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 33, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "data['Keep'] = True" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 34, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "CPU times: user 19min 48s, sys: 6.08 s, total: 19min 54s\n", 482 | "Wall time: 19min 54s\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "%%time\n", 488 | "for i, ind_i in enumerate(data.index):\n", 489 | " for j, ind_j in enumerate(data.index):\n", 490 | " if j > i:\n", 491 | " simsc = sim[i, j]\n", 492 | " if simsc > 0.5:\n", 493 | " data.loc[ind_j, 'Keep'] = False" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 35, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "data = data.iloc[:, :3][data['Keep'] == True]" 503 | ] 504 | }, 505 | { 506 | "cell_type": "raw", 507 | "metadata": {}, 508 | "source": [ 509 | "# tag::HARVEY_01[]" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 36, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "6520" 521 | ] 522 | }, 523 | "execution_count": 36, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "len(data)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 37, 535 | "metadata": {}, 536 | "outputs": [ 537 | { 538 | "data": { 539 | "text/html": [ 540 | "
\n", 541 | "\n", 554 | "\n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | "
Node1RelationNode2
50569flood watersare contaminated withbacteria
50571jack gillisspokesman forconsumer federation of america of car book
50574your mechanicexaminevehicle
50580checkis inwheel wells
50583used car dealeris licensed bystate
\n", 596 | "
" 597 | ], 598 | "text/plain": [ 599 | " Node1 Relation \\\n", 600 | "50569 flood waters are contaminated with \n", 601 | "50571 jack gillis spokesman for \n", 602 | "50574 your mechanic examine \n", 603 | "50580 check is in \n", 604 | "50583 used car dealer is licensed by \n", 605 | "\n", 606 | " Node2 \n", 607 | "50569 bacteria \n", 608 | "50571 consumer federation of america of car book \n", 609 | "50574 vehicle \n", 610 | "50580 wheel wells \n", 611 | "50583 state " 612 | ] 613 | }, 614 | "execution_count": 37, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "data.tail()" 621 | ] 622 | }, 623 | { 624 | "cell_type": "raw", 625 | "metadata": {}, 626 | "source": [ 627 | "# end::HARVEY_01[]" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "## Storing Results" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 38, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "fn = os.path.join(results_path, 'relations_{}.h5'.format(project))" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 39, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "d.to_hdf(fn, 'raw', complevel=5, complib='blosc')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 40, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "data.to_hdf(fn, 'data', complevel=5, complib='blosc')" 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "" 669 | ] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "Python 3", 675 | "language": "python", 676 | "name": "python3" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.6.7" 689 | } 690 | }, 691 | "nbformat": 4, 692 | "nbformat_minor": 2 693 | } 694 | -------------------------------------------------------------------------------- /code/02_nlp/02_nlp_openie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Natural Language Processing\n", 15 | "\n", 16 | "**Open Information Extraction**\n", 17 | "\n", 18 | "_Illustrated based on a simple example and the texts from three Apple press releases._\n", 19 | "\n", 20 | "Dr Yves J Hilpisch | Michael Schwed\n", 21 | "\n", 22 | "The Python Quants GmbH" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Simple Example" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import nltk\n", 40 | "import requests\n", 41 | "import pandas as pd" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import sys\n", 51 | "sys.path.append('../../modules/')\n", 52 | "import soiepy.main as ie \n", 53 | "import ng_functions as ng \n", 54 | "import nlp_functions as nlp" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | 
"metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "t = '''\n", 64 | "Peter studies data science.\n", 65 | "Peter knows Java.\n", 66 | "Peter prefers Python.\n", 67 | "Peter works as a data scientist.\n", 68 | "Peter applies machine learning.\n", 69 | "A data scientist uses Python.\n", 70 | "Python revolutionized data science.\n", 71 | "Python is preferred for NLP.\n", 72 | "Python is used for machine learning.\n", 73 | "'''" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "s = nltk.sent_tokenize(t) " 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "['\\nPeter studies data science.', 'Peter knows Java.', 'Peter prefers Python.']" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "s[:3] " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "s = [nlp.clean_up_text(_) for _ in s] \n", 112 | "s = [' '.join(nlp.tokenize(_)) + '.' for _ in s] " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "['peter study data science.', 'peter know java.', 'peter prefer python.']" 124 | ] 125 | }, 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "s[:3] " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "abs_path = os.path.abspath('../../')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "data_path = os.path.join(abs_path, 'data')\n", 151 | "tokens_path = os.path.join(data_path, 'tokens')\n", 152 | "if not os.path.isdir(tokens_path):\n", 153 | " os.mkdir(tokens_path)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "fn = os.path.join(tokens_path, 'tokens_example.txt') " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 11, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with open(fn, 'w') as f:\n", 172 | " f.writelines([_ + '\\n' for _ in s]) " 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Executing command = cd /root/notebook/dnanlp/modules/soiepy/;cd stanford-openie; java -mx4g -cp \"stanford-openie.jar:stanford-openie-models.jar:lib/*\" edu.stanford.nlp.naturalli.OpenIE /root/notebook/dnanlp/data/tokens/tokens_example.txt -format ollie > /tmp/openie/out.txt\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "r = ie.stanford_ie(fn, verbose=True) " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 13, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "[['peter', ' know', ' java'],\n", 201 | " ['peter', ' prefer', ' python'],\n", 202 | " ['peter', ' works', ' data scientist']]" 203 | ] 204 | }, 205 | "execution_count": 13, 206 | "metadata": {}, 207 | 
"output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "r[:3] " 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 14, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2']) " 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "d = d.applymap(lambda _: _.strip()) " 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 16, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/html": [ 240 | "
\n", 241 | "\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | "
Node1RelationNode2
0peterknowjava
1peterpreferpython
2peterworksdata scientist
\n", 284 | "
" 285 | ], 286 | "text/plain": [ 287 | " Node1 Relation Node2\n", 288 | "0 peter know java\n", 289 | "1 peter prefer python\n", 290 | "2 peter works data scientist" 291 | ] 292 | }, 293 | "execution_count": 16, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "d.iloc[:3]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 17, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "g = ng.create_graph(d) " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 18, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "G = ng.plot_graph(g, central_gravity=0.01) " 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 19, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/html": [ 328 | "\n", 329 | " \n", 336 | " " 337 | ], 338 | "text/plain": [ 339 | "" 340 | ] 341 | }, 342 | "execution_count": 19, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "G.show('ng_example.html') " 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "## Apple Press Releases" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 20, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "import requests" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 21, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "sources = [\n", 374 | " 'https://nr.apple.com/dE0b1T5G3u', # iPad Pro\n", 375 | " 'https://nr.apple.com/dE4c7T6g1K', # MacBook Air\n", 376 | " 'https://nr.apple.com/dE4q4r8A2A', # Mac Mini\n", 377 | "]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 22, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "html = [requests.get(url).text for url in sources]" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 23, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "sents = [nltk.sent_tokenize(h) for h in html]" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 24, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "s = []\n", 405 | "for sent in sents:\n", 406 | " s.extend(sent)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 25, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "200" 418 | ] 419 | }, 420 | "execution_count": 25, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "len(s)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 26, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "s = [nlp.clean_up_text(se) for se in s]" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 27, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "s = [' '.join(nlp.tokenize(se)) + '.' 
for se in s]" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 28, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "s = [se for se in s if len(se) > 5]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 29, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "fn = os.path.join(tokens_path, 'tokens_apple.txt')\n", 463 | "with open(fn, 'w') as f:\n", 464 | " f.writelines([_ + '\\n' for _ in s])" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 30, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "CPU times: user 5.85 ms, sys: 16.3 ms, total: 22.1 ms\n", 477 | "Wall time: 21.2 s\n" 478 | ] 479 | } 480 | ], 481 | "source": [ 482 | "%time r = ie.stanford_ie(fn, verbose=False)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 31, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/plain": [ 493 | "[['apple mac',\n", 494 | " ' ipad',\n", 495 | " ' iphone watch music support shopping bag newsroom archive press release'],\n", 496 | " ['apple mac',\n", 497 | " ' ipad',\n", 498 | " ' watch music support shopping bag newsroom archive press release'],\n", 499 | " ['today', ' introduce', ' design performance']]" 500 | ] 501 | }, 502 | "execution_count": 31, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "r[:3]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 32, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 33, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "d = d.applymap(lambda x: x.strip())" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 34, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/html": [ 537 | "
\n", 538 | "\n", 551 | "\n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | "
Node1RelationNode2
0apple macipadiphone watch music support shopping bag newsro...
1apple macipadwatch music support shopping bag newsroom arch...
2todayintroducedesign performance
3todayintroduceipad design performance
4todayintroducedesign next-generation performance
5todayintroduceall-screen design performance
6todayintroduceipad design next-generation performance
7todayintroduceipad all-screen design next-generation perform...
8todayintroduceipad all-screen design performance
9todayintroduceall-screen design next-generation performance
\n", 623 | "
" 624 | ], 625 | "text/plain": [ 626 | " Node1 Relation Node2\n", 627 | "0 apple mac ipad iphone watch music support shopping bag newsro...\n", 628 | "1 apple mac ipad watch music support shopping bag newsroom arch...\n", 629 | "2 today introduce design performance\n", 630 | "3 today introduce ipad design performance\n", 631 | "4 today introduce design next-generation performance\n", 632 | "5 today introduce all-screen design performance\n", 633 | "6 today introduce ipad design next-generation performance\n", 634 | "7 today introduce ipad all-screen design next-generation perform...\n", 635 | "8 today introduce ipad all-screen design performance\n", 636 | "9 today introduce all-screen design next-generation performance" 637 | ] 638 | }, 639 | "execution_count": 34, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "d.iloc[:10]" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 35, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "d = d[d.applymap(lambda x: len(x) < 25)].dropna()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 36, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "data": { 664 | "text/html": [ 665 | "
\n", 666 | "\n", 679 | "\n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
Node1RelationNode2
2todayintroducedesign performance
3todayintroduceipad design performance
19workflows .2 apps designtakeadvantage display
45workflows .2 apps designtakeadvantage large display
47photoshop ipadcoming2019 push user computer
\n", 721 | "
" 722 | ], 723 | "text/plain": [ 724 | " Node1 Relation Node2\n", 725 | "2 today introduce design performance\n", 726 | "3 today introduce ipad design performance\n", 727 | "19 workflows .2 apps design take advantage display\n", 728 | "45 workflows .2 apps design take advantage large display\n", 729 | "47 photoshop ipad coming 2019 push user computer" 730 | ] 731 | }, 732 | "execution_count": 36, 733 | "metadata": {}, 734 | "output_type": "execute_result" 735 | } 736 | ], 737 | "source": [ 738 | "d.iloc[:5]" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 37, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "g = ng.create_graph(d)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 38, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "G = ng.plot_graph(g, with_edge_label=False,\n", 757 | " font_color='grey', central_gravity=0.01)" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 39, 763 | "metadata": { 764 | "scrolled": false 765 | }, 766 | "outputs": [ 767 | { 768 | "data": { 769 | "text/html": [ 770 | "\n", 771 | " \n", 778 | " " 779 | ], 780 | "text/plain": [ 781 | "" 782 | ] 783 | }, 784 | "execution_count": 39, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "G.show('ng_apple.html')" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": {}, 796 | "source": [ 797 | "" 798 | ] 799 | } 800 | ], 801 | "metadata": { 802 | "kernelspec": { 803 | "display_name": "Python 3", 804 | "language": "python", 805 | "name": "python3" 806 | }, 807 | "language_info": { 808 | "codemirror_mode": { 809 | "name": "ipython", 810 | "version": 3 811 | }, 812 | "file_extension": ".py", 813 | "mimetype": "text/x-python", 814 | "name": "python", 815 | "nbconvert_exporter": "python", 816 | "pygments_lexer": "ipython3", 817 | "version": "3.6.7" 818 | } 819 | }, 820 | "nbformat": 4, 821 | "nbformat_minor": 2 822 | } 823 | --------------------------------------------------------------------------------