├── .gitignore ├── .idea └── vcs.xml ├── LICENSE ├── README.md ├── __init__.py ├── cloud_setup.py ├── feature_engineering ├── __init__.py ├── author_graph_features.py ├── authors.py ├── authors_2.py ├── baseline_feature_engineering.py ├── basic_features.py ├── citation_graph_features.py ├── cosine_distance.py ├── networkx_bigraph.py ├── networkx_bigraph_long.py ├── networkx_bigraph_long2.py ├── networkx_digraph.py ├── preprocessing.py └── tools.py ├── link-prediction-report.pdf ├── main.py ├── models ├── __init__.py ├── camboui.ipynb ├── feature_selection.ipynb ├── feature_selection.py ├── lgbm.py ├── logistic_regression.py ├── nn.py ├── nn_deep.py ├── plots │ ├── rf_importance.png │ └── rf_importance_full.png ├── random_forest.py ├── svm.py ├── tools.py └── tuning │ ├── __init__.py │ ├── console_nn_grid_search_example.txt │ ├── objective_function.py │ ├── plots │ └── grid_lgbm.png │ ├── tools.py │ ├── tuning_lgbm.py │ ├── tuning_nn.py │ ├── tuning_random_forest.py │ ├── tuning_svm.py │ └── tuning_svm_feat_selec.py ├── notes ├── ressources └── data_challenge_description.pdf ├── results └── results ├── sampling ├── sampling.ipynb └── sampling.py ├── stacking ├── __init__.py ├── stacking.py ├── stacking_tuning.py └── stacking_tuning_micro.py ├── task_manager.py └── tests ├── __init__.py ├── test_author_graph_features.ipynb ├── test_baseline.ipynb └── test_preprocessing.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds 
the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | /data/ 103 | /submissions/ 104 | /calculated_features/ 105 | .idea/ 106 | code/data_exploration/camboui.ipynb 107 | /code/data_exploration/camboui.ipynb 108 | /code/data_exploration/data_exploration.ipynb 109 | /code/feature_engineering/camboui.ipynb 110 | /code/models/camboui.ipynb 111 | /code/feature_engineering/camboui.py 112 | /code/feature_engineering/camboui_network_x.ipynb 113 | /tests/multiprocessing_tuto.py 114 | /data_exploration/data_exploration.ipynb 115 | /models/feature_selection_2.ipynb 116 | /results/log.txt 117 | /log.txt 118 | /bigraph_from_root.py 119 | /illustrate_report.ipynb 120 | /camboui.py 121 | *.csv 122 | results/* 123 | /storage.py 124 | Untitled.ipynb 125 | Untitled1.ipynb 126 | requirements.txt 127 | 128 | -------------------------------------------------------------------------------- /.idea/vcs.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 raph-m 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # link-prediction 2 | 3 | Predict links in a citation network. 4 | You can find the project instructions in the ressources folder. 5 | Our project report is available in the link-prediction-report pdf. 6 | 7 | ## Feature Engineering 8 | 9 | In the feature engineering folder you can find scripts to compute new features from the dataset. 
The features computed are described at the beginning of the scripts, and you can find more information in our project report. 10 | 11 | ## Feature Selection 12 | 13 | Running the feature_selection.py script will print the results of a forward selection algorithm. We chose the set of features that we were going to use for the rest of the project from these results. 14 | 15 | ## Models 16 | 17 | You can find several implementations of models to fit to our data. Running the scripts will give you the results and create a submission file. 18 | 19 | ## Tuning 20 | 21 | Running the tuning scripts will output the best parameters resulting from a cross-validated grid search on a hand-picked parameter grid. 22 | 23 | ## Main 24 | 25 | The main.py script processes all you need (feature engineering and machine learning) in order to create our final submission. The svm fit might take a substantial amount of time. 26 | You may use the generated "stack_sub_rf.csv" as a reproduction of our best submission. 
If there were to be reproducibility issues with runtimes and whatnot, we left our original submission under the name ("stack_sub_rf_reference.csv") 27 | 28 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/__init__.py -------------------------------------------------------------------------------- /cloud_setup.py: -------------------------------------------------------------------------------- 1 | # how to setup the environment for cloud computing (install python tools and libraries, download database from 2 | # google drive public link and run python file) 3 | 4 | """ 5 | sudo apt update 6 | sudo apt install python python-dev python3 python3-dev 7 | sudo apt-get install python3-setuptools 8 | wget https://bootstrap.pypa.io/get-pip.py 9 | sudo python get-pip.py 10 | sudo pip install --upgrade virtualenv 11 | sudo pip install virtualenvwrapper 12 | echo "export WORKON_HOME=$HOME/.virtualenvs" >> .bashrc 13 | echo "export PROJECT_HOME=$HOME/Devel" >> .bashrc 14 | echo "source /usr/local/bin/virtualenvwrapper.sh" >> .bashrc 15 | echo "source "/usr/bin/virtualenvwrapper.sh"" >> .bashrc 16 | echo "export WORKON_HOME="/opt/virtual_env/"" >> .bashrc 17 | source `which virtualenvwrapper.sh` 18 | mkvirtualenv -p /usr/bin/python3.5 ml1 19 | sudo pip install pandas 20 | sudo pip install requests 21 | sudo pip install dotenv 22 | sudo pip install 23 | git clone https://github.com/raph-m/safe_driver_prediction 24 | cd safe_driver_prediction/proj2 25 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip 26 | cd .. 27 | cd ..
28 | sudo apt-get install unzip 29 | unzip data.zip 30 | cd safe_driver_prediction 31 | git pull origin master 32 | echo "ENV_NAME=vm" > .env 33 | python proj2/feature_engineering.py train ../../churn/ 3000000 34 | """ 35 | 36 | # une version qui marche (sans virtualenv): 37 | """ 38 | sudo apt update 39 | sudo apt install python python-dev python3 python3-dev 40 | sudo apt-get install python3-setuptools 41 | wget https://bootstrap.pypa.io/get-pip.py 42 | sudo python get-pip.py 43 | alias python=python3 44 | sudo apt-get python3-setuptools 45 | sudo easy_install3 pip 46 | sudo pip3 install pandas 47 | sudo pip3 install requests 48 | sudo pip3 install dotenv 49 | git clone https://github.com/raph-m/safe_driver_prediction 50 | cd safe_driver_prediction/proj2 51 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip 52 | cd .. 53 | cd .. 54 | sudo apt-get install unzip 55 | unzip data.zip 56 | cd safe_driver_prediction 57 | echo "ENV_NAME=vm" > .env 58 | cd proj2 59 | python feature_engineering.py 60 | """ 61 | 62 | # une autre façon de faire c'est avec `alias python=python3` 63 | 64 | # pour automatiser ces commandes, il faudrait mettre les commandes dans ce bashCommand et lancer ce script: 65 | # bashCommand = "cwm --rdf test.rdf --ntriples > test.nt" 66 | # process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) 67 | # output, error = process.communicate() 68 | 69 | """ 70 | git clone https://github.com/raph-m/link-prediction 71 | cd link-prediction/ 72 | # get and API token from kaggle (kaggle.json) 73 | sudo pip install kaggle 74 | mv kaggle.json .kaggle/ 75 | mkdir data 76 | cd data 77 | kaggle competitions download -c link-prediction-challenge-tm-and-nlp 78 | sudo pip install nltk 79 | sudo pip install tqdm 80 | 81 | """ 82 | 83 | import requests 84 | 85 | 86 | # python script to download a file from a google drive public link 87 | 88 | 89 | def download_file_from_google_drive(id, destination): 90 | def get_confirm_token(response): 
91 | for key, value in response.cookies.items(): 92 | if key.startswith('download_warning'): 93 | return value 94 | 95 | return None 96 | 97 | def save_response_content(response, destination): 98 | CHUNK_SIZE = 32768 99 | 100 | with open(destination, "wb") as f: 101 | for chunk in response.iter_content(CHUNK_SIZE): 102 | if chunk: # filter out keep-alive new chunks 103 | f.write(chunk) 104 | 105 | URL = "https://docs.google.com/uc?export=download" 106 | 107 | session = requests.Session() 108 | 109 | response = session.get(URL, params={'id': id}, stream=True) 110 | token = get_confirm_token(response) 111 | 112 | if token: 113 | params = {'id': id, 'confirm': token} 114 | response = session.get(URL, params=params, stream=True) 115 | 116 | save_response_content(response, destination) 117 | 118 | 119 | if __name__ == "__main__": 120 | import sys 121 | 122 | if len(sys.argv) is not 3: 123 | print("Usage: python google_drive.py drive_file_id destination_file_path") 124 | else: 125 | # TAKE ID FROM SHAREABLE LINK 126 | file_id = sys.argv[1] 127 | # DESTINATION FILE ON YOUR DISK 128 | destination = sys.argv[2] 129 | download_file_from_google_drive(file_id, destination) 130 | -------------------------------------------------------------------------------- /feature_engineering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/feature_engineering/__init__.py -------------------------------------------------------------------------------- /feature_engineering/author_graph_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | from itertools import permutations, product 3 | 4 | import igraph 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from feature_engineering.tools import lit_eval_nan_proof 10 | 11 | # progress bar for pandas 12 | 
tqdm.pandas(tqdm()) 13 | 14 | # path 15 | path_to_data = "data/" 16 | 17 | # loading data 18 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 19 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 20 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 21 | nodes.set_index("id", inplace=True) 22 | training = pd.read_csv(path_to_data + "training_features.txt") 23 | training.set_index("my_index", inplace=True) 24 | testing = pd.read_csv(path_to_data + "testing_features.txt") 25 | testing.set_index("my_index", inplace=True) 26 | 27 | # create author graph 28 | # vertices are authors 29 | # edge of weight 1 if they cowrote a paper, 2 if they only cite each other 30 | 31 | # create empty directed graph 32 | g = igraph.Graph(directed=True) 33 | 34 | # add vertices 35 | authors = nodes['authors'] 36 | authors_set = list(set(authors.dropna().sum())) 37 | g.add_vertices(authors_set) 38 | 39 | # first, add citation edges 40 | edges = {} 41 | # store edge ids for each edge 42 | ids = {} 43 | # store weights 44 | weights = {} 45 | id1 = training['id1'].values 46 | id2 = training['id2'].values 47 | index_train = training.index 48 | target = training["target"].values 49 | # edge id 50 | id = 0 51 | # store all the edges related to each citation 52 | eid = {} 53 | for i in tqdm(range(len(id1))): 54 | # if there is a 55 | if target[i] == 1: 56 | authors1 = nodes.at[id1[i], 'authors'] 57 | authors2 = nodes.at[id2[i], 'authors'] 58 | # check that author information is not missing 59 | if isinstance(authors1, float) or isinstance(authors2, float): 60 | continue 61 | # if authors available then add edges 62 | pairs = list(product(authors1, authors2)) 63 | # for each pair of authors 64 | for pair in pairs: 65 | # if edge already exists 66 | if pair in edges: 67 | # increment weight 68 | weights[pair] += 1 69 | # add id to edges related to this citation 70 | if index_train[i] in eid: 71 | 
eid[index_train[i]] += [id] 72 | else: 73 | eid[index_train[i]] = [id] 74 | # if doesn't exist 75 | else: 76 | # create edge 77 | edges[pair] = 1 78 | # keep track of edge id 79 | ids[pair] = id 80 | # add id to edges related to this citation 81 | if index_train[i] in eid: 82 | eid[index_train[i]] += [id] 83 | else: 84 | eid[index_train[i]] = [id] 85 | # store weight 86 | weights[pair] = 1 87 | # increment id 88 | id += 1 89 | 90 | # then, add coauthor edges 91 | authors_array = authors.values 92 | index_nodes = nodes.index.values 93 | # for each document 94 | for i in tqdm(range(len(authors_array))): 95 | # if missing author info, skip 96 | if isinstance(authors_array[i], float): 97 | continue 98 | # if not for each pair of coauthors 99 | coauthors = permutations(authors_array[i], 2) 100 | for pair in coauthors: 101 | # if edge already exists 102 | if pair in edges: 103 | # increment weight 104 | weights[pair] += 2 105 | # if doesn't exist 106 | else: 107 | # create edge 108 | edges[pair] = 1 109 | # store weight 110 | weights[pair] = 2 111 | 112 | # add edges to graph 113 | g.add_edges(list(edges)) 114 | 115 | # add weights 116 | weights = list(edges.values()) 117 | max_weight = max(weights) 118 | weights = max_weight - np.array(weights) + 1 119 | g.es['weight'] = list(weights) 120 | 121 | # compute features such as shortest path 122 | 123 | # features placeholders 124 | min_shortest_path = [] 125 | max_shortest_path = [] 126 | mean_shortest_path = [] 127 | author_in_degree_mean_target = [] 128 | author_in_degree_max_target = [] 129 | author_out_degree_mean_source = [] 130 | author_out_degree_max_source = [] 131 | author_common_neigbors_mean = [] 132 | author_common_neigbors_max = [] 133 | author_jaccard_mean = [] 134 | author_jaccard_max = [] 135 | 136 | # get training ids 137 | id1 = training['id1'].values 138 | id2 = training['id2'].values 139 | target = training["target"].values 140 | index_train = training.index 141 | 142 | # compute features for all samples 
143 | for i in tqdm(range(len(id1))): 144 | authors1 = nodes.at[id1[i], 'authors'] 145 | authors2 = nodes.at[id2[i], 'authors'] 146 | # if one of the articles has missing author info 147 | if isinstance(authors1, float) or isinstance(authors2, float): 148 | # print("NAN") 149 | # no shortest path can be computed 150 | min_shortest_path.append(np.nan) 151 | max_shortest_path.append(np.nan) 152 | mean_shortest_path.append(np.nan) 153 | # if author info is missing for first doc 154 | if isinstance(authors1, float): 155 | # no degree can be computed 156 | author_out_degree_max_source.append(np.nan) 157 | author_out_degree_mean_source.append(np.nan) 158 | # if not missing 159 | else: 160 | # compute degrees 161 | out = g.strength(authors1, weights="weight") 162 | mean_out = np.mean(out) 163 | max_out = np.max(out) 164 | author_out_degree_max_source.append(max_out) 165 | author_out_degree_mean_source.append(mean_out) 166 | # if it is missing for the second document 167 | if isinstance(authors2, float): 168 | # no degree can be computed 169 | author_in_degree_max_target.append(np.nan) 170 | author_in_degree_mean_target.append(np.nan) 171 | # if not 172 | else: 173 | # compute degrees for other document 174 | in_ = g.strength(authors2, weights="weight") 175 | mean_in = np.mean(in_) 176 | max_in = np.max(in_) 177 | author_in_degree_max_target.append(max_in) 178 | author_in_degree_mean_target.append(mean_in) 179 | continue 180 | # print("NO NAN") 181 | # if there's no missing author information 182 | # set weights of unwanted edges to zero 183 | if target[i] == 1: 184 | # print('target is 1') 185 | t0 = time.time() 186 | # print('fetching edge ids') 187 | eids_to_unweigh = eid[index_train[i]] 188 | t1 = time.time() 189 | for id in eids_to_unweigh: 190 | g.es['weight'][id] += 1 191 | t1_bis = time.time() 192 | print('bottleneck', t1 - t0, t1_bis - t1) 193 | # compute shortest paths 194 | # print("computing shortest path") 195 | t1 = time.time() 196 | # paths = 
g.shortest_paths_dijkstra(source=authors1, target=authors2, 197 | # mode="OUT", weights="weight")[0][0] 198 | # min_value = np.min(paths) 199 | # max_value = np.max(paths) 200 | # mean_value = np.mean(paths) 201 | t2 = time.time() 202 | print('shortest_path', t2 - t1) 203 | # compute degrees 204 | out = g.strength(authors1, weights="weight") 205 | in_ = g.strength(authors2, weights="weight") 206 | mean_out = np.mean(out) 207 | max_out = np.max(out) 208 | in_ = g.strength(authors2, weights="weight") 209 | mean_in = np.mean(in_) 210 | max_in = np.max(in_) 211 | t3 = time.time() 212 | print('weighted degree', t3 - t2) 213 | # create set of pairs as vertex ids as well as index values 214 | pairs = list(product(authors1, authors2)) 215 | pairs_index = list(product(range(len(authors1)), range(len(authors2)))) 216 | # compute jaccard similarity 217 | # jaccards = g.similarity_jaccard(pairs=pairs) 218 | # max_jacc = np.max(jaccards) 219 | # mean_jacc = np.mean(jaccards) 220 | t4 = time.time() 221 | # print('jacc', t4 - t3) 222 | # compute common neighbours 223 | hoods1 = g.neighborhood(vertices=authors1) 224 | hoods2 = g.neighborhood(vertices=authors2) 225 | common_hoods = [set(hoods1[i]).intersection(set(hoods2[j])) for (i, j) in pairs_index] 226 | common_hoods_size = list(map(len, common_hoods)) 227 | max_hood = np.max(common_hoods_size) 228 | mean_hood = np.mean(common_hoods_size) 229 | t5 = time.time() 230 | print('common hoods', t5 - t4) 231 | # append features to corresponding set 232 | # min_shortest_path.append(min_value) 233 | # max_shortest_path.append(max_value) 234 | # mean_shortest_path.append(mean_value) 235 | author_out_degree_max_source.append(max_out) 236 | author_out_degree_mean_source.append(mean_out) 237 | author_in_degree_max_target.append(max_in) 238 | author_in_degree_mean_target.append(mean_in) 239 | author_common_neigbors_mean.append(mean_hood) 240 | author_common_neigbors_max.append(max_hood) 241 | # author_jaccard_mean.append(max_jacc) 242 | # 
author_jaccard_max.append(mean_jacc) 243 | if target[i] == 1: 244 | for id in eids_to_unweigh: 245 | g.es['weight'][id] = 0 246 | t6 = time.time() 247 | # print("append features", t6 - t5) 248 | 249 | # add feature to dataframe 250 | # training["author_min_shortest_path"] = min_shortest_path 251 | # training["author_max_shortest_path"] = max_shortest_path 252 | # training["author_sum_shortest_path"] = sum_shortest_path 253 | # training["author_mean_shortest_path"] = mean_shortest_path 254 | training['author_out_degree_max_source'] = author_out_degree_max_source 255 | training['author_out_degree_mean_source'] = author_out_degree_mean_source 256 | training['author_in_degree_max_target'] = author_in_degree_max_target 257 | training['author_in_degree_mean_target'] = author_in_degree_mean_target 258 | training['author_common_neigbors_mean'] = author_common_neigbors_mean 259 | training['author_common_neigbors_max'] = author_common_neigbors_max 260 | # training['author_jaccard_mean'] = author_jaccard_mean 261 | # training['author_jaccard_max'] = author_jaccard_max 262 | 263 | # repeat process for test set 264 | min_shortest_path_test = [] 265 | max_shortest_path_test = [] 266 | sum_shortest_path_test = [] 267 | mean_shortest_path_test = [] 268 | author_in_degree_mean_target_test = [] 269 | author_in_degree_sum_target_test = [] 270 | author_out_degree_mean_source_test = [] 271 | author_out_degree_sum_source_test = [] 272 | id1 = testing['id1'].values 273 | id2 = testing['id2'].values 274 | for i in tqdm(range(len(id1))): 275 | authors1 = nodes.at[id1[i], 'authors'] 276 | authors2 = nodes.at[id2[i], 'authors'] 277 | if isinstance(authors1, float) or isinstance(authors2, float): 278 | min_shortest_path_test.append(np.nan) 279 | max_shortest_path_test.append(np.nan) 280 | sum_shortest_path_test.append(np.nan) 281 | mean_shortest_path_test.append(np.nan) 282 | if isinstance(authors1, float): 283 | author_out_degree_sum_source_test.append(np.nan) 284 | 
author_out_degree_mean_source_test.append(np.nan) 285 | else: 286 | sum_out = 0 287 | n_source = len(authors1) 288 | for author1 in authors1: 289 | sum_out += g.strength(author1, mode='OUT', weights="weight") 290 | mean_out = sum_out / n_source 291 | author_out_degree_sum_source_test.append(sum_out) 292 | author_out_degree_mean_source_test.append(mean_out) 293 | if isinstance(authors2, float): 294 | author_in_degree_sum_target_test.append(np.nan) 295 | author_in_degree_mean_target_test.append(np.nan) 296 | else: 297 | sum_in = 0 298 | n_target = len(authors2) 299 | for author2 in authors2: 300 | sum_in += g.strength(author2, mode='IN', weights="weight") 301 | mean_in = sum_in / n_target 302 | author_in_degree_sum_target_test.append(sum_in) 303 | author_in_degree_mean_target_test.append(mean_in) 304 | continue 305 | min_value = float('inf') 306 | max_value = - float('inf') 307 | sum_value = 0 308 | n = len(authors1) * len(authors2) 309 | for author1 in authors1: 310 | for author2 in authors2: 311 | current = g.shortest_paths_dijkstra(source=author1, target=author2, 312 | mode="OUT", weights=g.es["weight"])[0][0] 313 | min_value = current if current < min_value else min_value 314 | max_value = current if current > max_value else max_value 315 | sum_value += current 316 | mean_value = sum_value / n 317 | sum_out = 0 318 | sum_in = 0 319 | n_source = len(authors1) 320 | n_target = len(authors2) 321 | for author1 in authors1: 322 | sum_out += g.strength(author1, mode='OUT', weights="weight") 323 | for author2 in authors2: 324 | sum_in += g.strength(author2, mode='IN', weights="weight") 325 | mean_out = sum_out / n_source 326 | mean_in = sum_in / n_target 327 | min_shortest_path_test.append(min_value) 328 | max_shortest_path_test.append(max_value) 329 | sum_shortest_path_test.append(sum_value) 330 | mean_shortest_path_test.append(mean_value) 331 | author_out_degree_sum_source_test.append(sum_out) 332 | author_out_degree_mean_source_test.append(mean_out) 333 | 
author_in_degree_sum_target_test.append(sum_in) 334 | author_in_degree_mean_target_test.append(mean_in) 335 | 336 | # add feature to dataframe 337 | testing["author_min_shortest_path"] = min_shortest_path_test 338 | testing["author_max_shortest_path"] = max_shortest_path_test 339 | testing["author_sum_shortest_path"] = sum_shortest_path_test 340 | testing["author_mean_shortest_path"] = mean_shortest_path_test 341 | testing['author_out_degree_sum_source'] = author_out_degree_sum_source_test 342 | testing['author_out_degree_mean_source'] = author_out_degree_mean_source_test 343 | testing['author_in_degree_sum_target'] = author_in_degree_sum_target_test 344 | testing['author_in_degree_mean_target'] = author_in_degree_mean_target_test 345 | 346 | # save data sets 347 | training.to_csv(path_to_data + "training_features.txt") 348 | testing.to_csv(path_to_data + "testing_features.txt") 349 | -------------------------------------------------------------------------------- /feature_engineering/authors.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes the features authors_citation and coauthor score by considering the graph of coauthorship and 9 | # the author's graph of citations. 
10 | # the script takes approximately 5 minutes to run 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | # loading data 29 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 30 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 31 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", 32 | converters=converter_dict) 33 | nodes.set_index("id", inplace=True) 34 | 35 | G = nx.DiGraph() 36 | coauthors = nx.Graph() 37 | 38 | print("building coauthor graph") 39 | nodes_id = nodes.index.values 40 | for i in tqdm(range(len(nodes_id))): 41 | 42 | authors = nodes.loc[nodes_id[i]]["authors"] 43 | if authors is np.nan: 44 | authors = [] 45 | 46 | authors = np.unique([a for a in authors if a != ""]) 47 | 48 | for a in authors: 49 | G.add_node(a) 50 | coauthors.add_node(a) 51 | 52 | for a1 in authors: 53 | for a2 in authors: 54 | if a1 != a2: 55 | if coauthors.has_edge(a1, a2): 56 | coauthors[a1][a2]["weight"] += 1 57 | else: 58 | coauthors.add_edge(a1, a2, weight=1) 59 | 60 | id1 = training["id1"].values 61 | id2 = training["id2"].values 62 | 63 | print("building citation graph") 64 | for i in tqdm(range(len(id1))): 65 | current_authors_1 = nodes.loc[id1[i]]["authors"] 66 | current_authors_2 = nodes.loc[id2[i]]["authors"] 67 | 68 | if current_authors_1 is np.nan: 69 | current_authors_1 = [] 70 | 71 | if current_authors_2 is 
np.nan: 72 | current_authors_2 = [] 73 | 74 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 75 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 76 | 77 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 78 | for a1 in current_authors_1: 79 | for a2 in current_authors_2: 80 | if G.has_edge(a1, a2): 81 | G[a1][a2]["weight"] += 1 82 | else: 83 | G.add_edge(a1, a2, weight=1) 84 | 85 | coauthor_score = np.zeros(len(id1)) 86 | normalized_coauthor_score = np.zeros(len(id1)) 87 | best_coauthor_score = np.zeros(len(id1)) 88 | authors_citation = np.zeros(len(id1)) 89 | normalized_authors_citation = np.zeros(len(id1)) 90 | best_authors_citation = np.zeros(len(id1)) 91 | 92 | print("building features for training") 93 | for i in tqdm(range(len(id1))): 94 | current_authors_1 = nodes.loc[id1[i]]["authors"] 95 | current_authors_2 = nodes.loc[id2[i]]["authors"] 96 | 97 | if current_authors_1 is np.nan: 98 | current_authors_1 = [] 99 | 100 | if current_authors_2 is np.nan: 101 | current_authors_2 = [] 102 | 103 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 104 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 105 | 106 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 107 | for a1 in current_authors_1: 108 | for a2 in current_authors_2: 109 | G[a1][a2]["weight"] -= 1 110 | 111 | best = 0 112 | for a1 in current_authors_1: 113 | for a2 in current_authors_2: 114 | if G.has_edge(a1, a2): 115 | current = G[a1][a2]["weight"] 116 | authors_citation[i] += current 117 | if current > best: 118 | best = current 119 | 120 | best_authors_citation[i] = best 121 | 122 | best = 0 123 | for a1 in current_authors_1: 124 | for a2 in current_authors_2: 125 | if coauthors.has_edge(a1, a2): 126 | current = coauthors[a1][a2]["weight"] 127 | coauthor_score[i] += current 128 | if current > best: 129 | best = current 130 | 131 | best_coauthor_score[i] = best 132 | 133 | 
# normalize features 134 | denom = len(current_authors_1) * len(current_authors_2) 135 | if denom > 0: 136 | normalized_authors_citation[i] = authors_citation[i] / denom 137 | normalized_coauthor_score[i] = coauthor_score[i] / denom 138 | 139 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 140 | for a1 in current_authors_1: 141 | for a2 in current_authors_2: 142 | G[a1][a2]["weight"] += 1 143 | 144 | training["authors_citation"] = authors_citation 145 | training["normalized_authors_citation"] = normalized_authors_citation 146 | training["coauthor_score"] = coauthor_score 147 | training["normalized_coauthor_score"] = normalized_coauthor_score 148 | training["best_coauthor_score"] = best_coauthor_score 149 | training["best_authors_citation"] = best_authors_citation 150 | 151 | id1 = testing["id1"].values 152 | id2 = testing["id2"].values 153 | 154 | coauthor_score = np.zeros(len(id1)) 155 | normalized_coauthor_score = np.zeros(len(id1)) 156 | best_coauthor_score = np.zeros(len(id1)) 157 | authors_citation = np.zeros(len(id1)) 158 | normalized_authors_citation = np.zeros(len(id1)) 159 | best_authors_citation = np.zeros(len(id1)) 160 | 161 | print("building features for testing") 162 | for i in tqdm(range(len(id1))): 163 | current_authors_1 = nodes.loc[id1[i]]["authors"] 164 | current_authors_2 = nodes.loc[id2[i]]["authors"] 165 | 166 | if current_authors_1 is np.nan: 167 | current_authors_1 = [] 168 | 169 | if current_authors_2 is np.nan: 170 | current_authors_2 = [] 171 | 172 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 173 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 174 | 175 | best = 0 176 | for a1 in current_authors_1: 177 | for a2 in current_authors_2: 178 | if G.has_edge(a1, a2): 179 | current = G[a1][a2]["weight"] 180 | authors_citation[i] += current 181 | if current > best: 182 | best = current 183 | 184 | best_authors_citation[i] = best 185 | 186 | best = 0 187 | for a1 in 
current_authors_1: 188 | for a2 in current_authors_2: 189 | if coauthors.has_edge(a1, a2): 190 | current = coauthors[a1][a2]["weight"] 191 | coauthor_score[i] += current 192 | if current > best: 193 | best = current 194 | 195 | best_coauthor_score[i] = best 196 | 197 | # normalize features 198 | denom = len(current_authors_1) * len(current_authors_2) 199 | if denom > 0: 200 | normalized_authors_citation[i] = authors_citation[i] / denom 201 | normalized_coauthor_score[i] = coauthor_score[i] / denom 202 | 203 | testing["authors_citation"] = authors_citation 204 | testing["normalized_authors_citation"] = normalized_authors_citation 205 | testing["coauthor_score"] = coauthor_score 206 | testing["normalized_coauthor_score"] = normalized_coauthor_score 207 | testing["best_coauthor_score"] = best_coauthor_score 208 | testing["best_authors_citation"] = best_authors_citation 209 | 210 | print("done, saving data") 211 | # save data-frame 212 | training.to_csv(path_to_data + "training_features.txt") 213 | testing.to_csv(path_to_data + "testing_features.txt") 214 | -------------------------------------------------------------------------------- /feature_engineering/authors_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | from tqdm import tqdm 7 | 8 | from feature_engineering.tools import lit_eval_nan_proof 9 | 10 | # this script computes the features authors_in_neighbors and authors_common_neighbors by considering 11 | # the author's graph of citations. 
12 | # the script takes approximately 5 minutes to run 13 | 14 | # progress bar for pandas 15 | tqdm.pandas(tqdm()) 16 | 17 | # path 18 | path_to_data = "data/" 19 | 20 | # loading data 21 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 22 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 23 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 24 | nodes.set_index("id", inplace=True) 25 | training = pd.read_csv(path_to_data + "training_features.txt") 26 | training.set_index("my_index", inplace=True) 27 | testing = pd.read_csv(path_to_data + "testing_features.txt") 28 | testing.set_index("my_index", inplace=True) 29 | 30 | # loading data 31 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 32 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 33 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", 34 | converters=converter_dict) 35 | nodes.set_index("id", inplace=True) 36 | 37 | G = nx.DiGraph() 38 | coauthors = nx.Graph() 39 | 40 | print("building coauthor graph") 41 | nodes_id = nodes.index.values 42 | for i in tqdm(range(len(nodes_id))): 43 | 44 | authors = nodes.loc[nodes_id[i]]["authors"] 45 | if authors is np.nan: 46 | authors = [] 47 | 48 | authors = np.unique([a for a in authors if a != ""]) 49 | 50 | for a in authors: 51 | G.add_node(a) 52 | coauthors.add_node(a) 53 | 54 | for a1 in authors: 55 | for a2 in authors: 56 | if a1 != a2: 57 | if coauthors.has_edge(a1, a2): 58 | coauthors[a1][a2]["weight"] += 1 59 | else: 60 | coauthors.add_edge(a1, a2, weight=1) 61 | 62 | id1 = training["id1"].values 63 | id2 = training["id2"].values 64 | 65 | print("building citation graph") 66 | for i in tqdm(range(len(id1))): 67 | 68 | current_authors_1 = nodes.loc[id1[i]]["authors"] 69 | current_authors_2 = nodes.loc[id2[i]]["authors"] 70 | 71 | if current_authors_1 is np.nan: 72 | current_authors_1 = [] 73 | 74 | if current_authors_2 is 
# feature placeholders, one value per (id1, id2) pair
authors_in_neighbors = np.zeros(len(id1))
normalized_authors_in_neighbors = np.zeros(len(id1))
best_authors_in_neighbors = np.zeros(len(id1))
authors_common_neighbors = np.zeros(len(id1))

print("building features for training")
for i in tqdm(range(len(id1))):
    current_authors_1 = nodes.loc[id1[i]]["authors"]
    current_authors_2 = nodes.loc[id2[i]]["authors"]

    # missing author lists are parsed as NaN by the converter
    if current_authors_1 is np.nan:
        current_authors_1 = []

    if current_authors_2 is np.nan:
        current_authors_2 = []

    # drop empty names and de-duplicate
    current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
    current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])

    # for positive pairs, subtract this pair's own contribution from the
    # author-citation graph so the feature does not encode the label.
    # NOTE(review): only the weight is decremented — the edge itself stays in
    # G, so G.predecessors below still counts it; for authors whose only
    # citation comes from this very pair that looks like target leakage.
    # Confirm whether the edge should be removed when its weight reaches 0.
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        for a1 in current_authors_1:
            for a2 in current_authors_2:
                G[a1][a2]["weight"] -= 1

    # this feature is commented because too long to compute
    # for a1 in current_authors_1:
    #     for p in G.successors(a1):
    #         for a2 in G.successors(p):
    #             if a2 in current_authors_2:
    #                 authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])

    # in-degree of the cited paper's authors in the author-citation graph:
    # total over all authors, and the best (max) single author
    best = 0
    for a1 in current_authors_2:
        current = len([g for g in G.predecessors(a1)])
        authors_in_neighbors[i] += current
        if current > best:
            best = current

    best_authors_in_neighbors[i] = best

    # normalize feature by the number of authors of the cited paper
    denom = len(current_authors_2)
    if denom > 0:
        normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom

    # restore the weight removed above so the graph is intact for the next pair
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        for a1 in current_authors_1:
            for a2 in current_authors_2:
                G[a1][a2]["weight"] += 1

training["authors_in_neighbors"] = authors_in_neighbors
training["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
training["best_authors_in_neighbors"] = best_authors_in_neighbors
training["authors_common_neighbors"] = authors_common_neighbors

# repeat for the test set; the graph stays fixed (no label to hide here)
id1 = testing["id1"].values
id2 = testing["id2"].values

authors_in_neighbors = np.zeros(len(id1))
normalized_authors_in_neighbors = np.zeros(len(id1))
best_authors_in_neighbors = np.zeros(len(id1))
authors_common_neighbors = np.zeros(len(id1))

print("building features for testing")
for i in tqdm(range(len(id1))):
    current_authors_1 = nodes.loc[id1[i]]["authors"]
    current_authors_2 = nodes.loc[id2[i]]["authors"]

    if current_authors_1 is np.nan:
        current_authors_1 = []

    if current_authors_2 is np.nan:
        current_authors_2 = []

    current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
    current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])

    # for a1 in current_authors_1:
    #     for p in G.successors(a1):
    #         for a2 in G.successors(p):
    #             if a2 in current_authors_2:
    #                 authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])

    best = 0
    for a1 in current_authors_2:
        current = len([g for g in G.predecessors(a1)])
        authors_in_neighbors[i] += current
        if current > best:
            best = current

    best_authors_in_neighbors[i] = best

    # normalize feature
    denom = len(current_authors_2)
    if denom > 0:
        normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom

testing["authors_in_neighbors"] = authors_in_neighbors
testing["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
testing["best_authors_in_neighbors"] = best_authors_in_neighbors
testing["authors_common_neighbors"] = authors_common_neighbors

print("done, saving data")
# save data-frame
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# computing features for training set
for i in tqdm(range(len(id1))):
    title1 = nodes.at[id1[i], 'title']
    title2 = nodes.at[id2[i], 'title']
    date1 = nodes.at[id1[i], 'year']
    date2 = nodes.at[id2[i], 'year']
    author1 = nodes.at[id1[i], 'authors']
    author2 = nodes.at[id2[i], 'authors']
    # number of (preprocessed) words shared by the two titles
    overlap_title.append(len(set(title1).intersection(set(title2))))
    # signed publication-year gap: citing paper year minus cited paper year
    date_diff.append(int(date1) - int(date2))
    # a missing author list is parsed as NaN (a float), hence the isinstance check
    if isinstance(author1, float) or isinstance(author2, float):
        common_author.append(0)
    else:
        common_author.append(len(set(author1).intersection(set(author2))))

# adding feature to data-frame
training["overlap_title"] = overlap_title
training["date_diff"] = date_diff
training["common_author"] = common_author

# repeat process for test set
overlap_title_test = []
date_diff_test = []
common_author_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    title1 = nodes.at[id1[i], 'title']
    title2 = nodes.at[id2[i], 'title']
    date1 = nodes.at[id1[i], 'year']
    date2 = nodes.at[id2[i], 'year']
    author1 = nodes.at[id1[i], 'authors']
    author2 = nodes.at[id2[i], 'authors']
    overlap_title_test.append(len(set(title1).intersection(set(title2))))
    date_diff_test.append(int(date1) - int(date2))
    # same NaN guard as the training loop
    if isinstance(author1, float) or isinstance(author2, float):
        common_author_test.append(0)
    else:
        common_author_test.append(len(set(author1).intersection(set(author2))))
testing["overlap_title"] = overlap_title_test
testing["date_diff"] = date_diff_test
testing["common_author"] = common_author_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# computing features for training set
for i in tqdm(range(len(id1))):
    journal1 = nodes.at[id1[i], 'journal']
    journal2 = nodes.at[id2[i], 'journal']
    abstract1 = nodes.at[id1[i], "abstract"]
    abstract2 = nodes.at[id2[i], "abstract"]
    # a missing journal is parsed as NaN (a float), hence the isinstance check
    if isinstance(journal1, float) or isinstance(journal2, float):
        journal_similarity.append(0)
    else:
        journal_similarity.append(compare_journals(journal1, journal2))
    # number of (preprocessed) words shared by the two abstracts
    overlapping_words_abstract.append(len(set(abstract1).intersection(set(abstract2))))

# adding feature to dataframe
training["journal_similarity"] = journal_similarity
training["overlapping_words_abstract"] = overlapping_words_abstract

# repeat process for test set
journal_similarity_test = []
overlapping_words_abstract_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    journal1 = nodes.at[id1[i], 'journal']
    journal2 = nodes.at[id2[i], 'journal']
    abstract1 = nodes.at[id1[i], "abstract"]
    abstract2 = nodes.at[id2[i], "abstract"]
    # same NaN guard as the training loop
    if isinstance(journal1, float) or isinstance(journal2, float):
        journal_similarity_test.append(0)
    else:
        journal_similarity_test.append(compare_journals(journal1, journal2))
    overlapping_words_abstract_test.append(len(set(abstract1).intersection(set(abstract2))))
testing["journal_similarity"] = journal_similarity_test
testing["overlapping_words_abstract"] = overlapping_words_abstract_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# placeholders for graph features
shortest_path = []

# IDs for training set
id1 = training['id1'].values
id2 = training['id2'].values
target = training["target"].values

# creating graph of citations

# create empty directed graph
g = igraph.Graph(directed=True)

# some nodes may not be connected to any other node
# hence the need to create the nodes of the graph from node_info.csv,
# not just from the edge list
nodes = nodes.index.values
str_vec = np.vectorize(str)
nodes = str_vec(nodes)

# add vertices
g.add_vertices(nodes)

# create and add edges: only observed positive training pairs are citations
edges = [(str(id1[i]), str(id2[i])) for i in range(len(id1)) if target[i] == 1]
g.add_edges(edges)

# training: for positive pairs, temporarily delete the pair's own edge so the
# shortest path is computed without the label information (no target leakage)
for i in tqdm(range(len(id1))):
    if target[i] == 1:
        g.delete_edges([(str(id1[i]), str(id2[i]))])
    shortest_path.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
    if target[i] == 1:
        g.add_edge(str(id1[i]), str(id2[i]))
# adding feature to dataframe
training["shortest_path"] = shortest_path

# repeat process for test set
shortest_path_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    # BUG FIX: the original loop did `if target[i] == 1: g.add_edge(...)`
    # here. `target` still holds the *training* labels (test pairs have no
    # label), so it consulted an unrelated array and inserted spurious
    # duplicate edges into the graph while test features were being computed,
    # corrupting the shortest-path values of all subsequent test rows. The
    # graph must stay fixed during test-time feature extraction.
    shortest_path_test.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
testing["shortest_path"] = shortest_path_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# create dictionary for tfidf
abstracts = nodes['abstract'].values
average_len = np.mean(np.array([len(a) for a in abstracts]))
dictionary = corpora.Dictionary(abstracts)


def my_tf(p):
    """Sub-linear term-frequency weighting: log(1 + raw term count)."""
    return math.log(1.0 + p)


# instantiate tf-idf model
tfidf = models.TfidfModel(dictionary=dictionary, wlocal=my_tf)


# handy functions to compute cosine distance
def get_tf_idf_encoding(index):
    """Return the sparse tf-idf vector [(term_id, weight), ...] of a paper's abstract."""
    abstract = nodes.at[index, "abstract"]
    abstract = dictionary.doc2bow(abstract)
    ans = tfidf[[abstract]]
    return ans[0]


def my_norm(tfidf_abstract):
    """Euclidean norm of a sparse tf-idf vector [(term_id, weight), ...].

    NOTE(review): currently unused — see cosine_distance below.
    """
    ans = 0.0
    for (k, v) in tfidf_abstract:
        ans += v ** 2
    return np.sqrt(ans)


def cosine_distance(id1, id2):
    """Inner product of the tf-idf vectors of the two papers' abstracts.

    NOTE(review): despite the name, the result is NOT divided by the vector
    norms (my_norm is never applied), so this is a raw dot product rather
    than a true cosine similarity. Confirm this is intentional before
    normalizing — downstream models were trained on this unnormalized value.
    """
    tfidf_abstract1 = get_tf_idf_encoding(id1)
    tfidf_abstract2 = get_tf_idf_encoding(id2)
    f1 = dict(tfidf_abstract1)
    f2 = dict(tfidf_abstract2)
    ans = 0.0
    for k, v in f1.items():
        if k in f2.keys():
            ans += v * f2[k]
    return ans


def get_score(id1, id2, avglen, k1=1.2, b=0.75):
    """BM25-style match score between the abstracts of papers id1 and id2.

    Sums, over terms present in both abstracts, idf * saturated-tf of the
    id1 abstract, with the usual BM25 length normalization (avglen is the
    corpus mean abstract length; k1 and b are the standard BM25 constants).
    Terms whose idf is 0 (or missing from the model) are skipped.
    """
    abstract_1 = nodes.at[id1, "abstract"]
    len_1 = len(abstract_1)
    abstract_1 = dictionary.doc2bow(abstract_1)
    tf_1 = dict([
        (termid, tfidf.wlocal(tf))
        for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])
    idf_1 = dict([
        (termid, tfidf.idfs.get(termid))
        for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])

    abstract_2 = nodes.at[id2, "abstract"]
    abstract_2 = dictionary.doc2bow(abstract_2)
    tf_2 = dict([
        (termid, tfidf.wlocal(tf))
        for termid, tf in abstract_2 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])

    ans = 0.0
    for k, v in tf_1.items():
        if k in tf_2.keys():
            ans += idf_1[k] * (v * (k1 + 1)) / (v + k1 * (1 - b + b * len_1 / avglen))
    return ans
testing["score_2_1"] = score_2_1 134 | testing["cosine_distance"] = cosine_dist 135 | 136 | # save data-frame 137 | training.to_csv(path_to_data + "training_features.txt") 138 | testing.to_csv(path_to_data + "testing_features.txt") 139 | -------------------------------------------------------------------------------- /feature_engineering/networkx_bigraph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes some features by considering the bidirectional graph of citations: jaccard, adar, 9 | # preferential_attachment, resource_allocation_index and common_neighbors 10 | # approx 10 minutes to run it 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | G = nx.Graph() 29 | G.add_nodes_from(nodes.index.values) 30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"])) 31 | 32 | # IDs for training set 33 | id1 = training['id1'].values 34 | id2 = training['id2'].values 35 | 36 | # placeholder for feature 37 | n = len(id1) 38 | jaccard = np.zeros(n) 39 | adar = np.zeros(n) 40 | preferential_attachment = np.zeros(n) 41 | resource_allocation_index = np.zeros(n) 42 | common_neighbors = np.zeros(n) 43 | 44 | 
# computing features for training set
for i in tqdm(range(len(id1))):
    # for positive pairs, temporarily remove the pair's own edge so the
    # similarity indices are computed without the label (no target leakage)
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])

    pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    jaccard[i] = pred[0][2]

    pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    adar[i] = pred[0][2]

    pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    preferential_attachment[i] = pred[0][2]

    pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    resource_allocation_index[i] = pred[0][2]

    pred = nx.common_neighbors(G, id1[i], id2[i])
    pred = len([u for u in pred])
    common_neighbors[i] = pred

    # restore the temporarily removed edge
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.add_edge(id1[i], id2[i])

# add feature to data-frame
training["jaccard"] = jaccard
training["adar"] = adar
training["preferential_attachment"] = preferential_attachment
training["resource_allocation_index"] = resource_allocation_index
# BUG FIX: this column was assigned from resource_allocation_index, silently
# discarding the common-neighbor counts computed above
training["common_neighbors"] = common_neighbors

# IDs for testing set
id1 = testing['id1'].values
id2 = testing['id2'].values

# placeholder for feature
n = len(id1)
jaccard = np.zeros(n)
adar = np.zeros(n)
preferential_attachment = np.zeros(n)
resource_allocation_index = np.zeros(n)
common_neighbors = np.zeros(n)

# computing features for testing set (graph stays fixed: test labels unknown)
for i in tqdm(range(len(id1))):
    pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    jaccard[i] = pred[0][2]

    pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    adar[i] = pred[0][2]

    pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    preferential_attachment[i] = pred[0][2]

    pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    resource_allocation_index[i] = pred[0][2]

    pred = nx.common_neighbors(G, id1[i], id2[i])
    pred = len([u for u in pred])
    common_neighbors[i] = pred

# add feature to data-frame
testing["jaccard"] = jaccard
testing["adar"] = adar
testing["preferential_attachment"] = preferential_attachment
testing["resource_allocation_index"] = resource_allocation_index
# BUG FIX: same copy-paste error as the training block above
testing["common_neighbors"] = common_neighbors

# save data-frame
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
def work(i0=None, n=None, is_training=True):
    """Compute truncated Katz scores for pair rows [i0, i0 + n).

    Each worker rebuilds the undirected citation graph from the positive
    training pairs, then for every pair sums beta ** len(path) over at most
    `breaking_point` shortest paths between the two papers (and the same sum
    with beta_2). Pairs with no connecting path get the sentinel -1.

    Returns (scores, scores_2, i0) so the parent can place the chunk back at
    offset i0 in the result arrays.
    """
    print(i0)
    G = nx.Graph()
    G.add_nodes_from(nodes.index.values)
    G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))

    ans = np.zeros(n)
    ans_2 = np.zeros(n)

    for i in range(n):
        # for positive training pairs, hide the pair's own edge while scoring
        # so the feature does not encode the label (no target leakage)
        if is_training:
            if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
                G.remove_edge(id1[i0 + i], id2[i0 + i])

        katz_acc = 0.0
        katz_2_acc = 0.0
        counter = 0
        try:
            iterator = nx.all_shortest_paths(G, source=id1[i0 + i], target=id2[i0 + i])
            for p in iterator:
                len_p = len(p)
                katz_acc += (beta ** len_p)
                katz_2_acc += (beta_2 ** len_p)
                counter += 1
                if counter >= breaking_point:
                    break
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # BUG FIX: the original wrote ans[i] = -1 here and then
            # unconditionally overwrote it with katz_acc (0.0) after the try,
            # so the "no path" sentinel never survived. Setting the
            # accumulators instead preserves -1 through the final assignment.
            # (The bare `except:` is also narrowed to the two exceptions
            # all_shortest_paths actually raises.)
            katz_acc = -1.0
            katz_2_acc = -1.0

        # restore the temporarily removed edge (also runs on the no-path case)
        if is_training:
            if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
                G.add_edge(id1[i0 + i], id2[i0 + i])

        ans[i] = katz_acc
        ans_2[i] = katz_2_acc

    print(i0)

    return ans, ans_2, i0


def callback(r):
    # results are gathered via tasks[i].get() in the parent; nothing to do here.
    # NOTE(review): this script sums beta ** len_p while networkx_bigraph_long2.py
    # sums len_p * beta ** len_p for the same feature names — confirm which
    # formula is intended before merging their outputs.
    ans, ans_2, i0 = r
kwds=kwds, callback=callback)) 107 | pool.close() 108 | pool.join() 109 | for i in range(n_tasks): 110 | katz[i * step: (i + 1) * step], \ 111 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get() 112 | 113 | end = time.time() 114 | print(end - start) 115 | # add feature to data-frame 116 | training["katz"] = katz 117 | training["katz_2"] = katz_2 118 | 119 | # IDs for testing set 120 | print("start computing for training: ") 121 | id1 = testing['id1'].values 122 | id2 = testing['id2'].values 123 | 124 | # placeholder for feature 125 | n = len(id1) 126 | print("size of data to process: " + str(n)) 127 | 128 | katz = np.zeros(n) 129 | katz_2 = np.zeros(n) 130 | 131 | pool = Pool() 132 | print("starting pool...") 133 | n_tasks = 512 134 | tasks = [] 135 | step = int(n / n_tasks) 136 | for i0 in range(n_tasks): 137 | kwds = { 138 | "i0": i0 * step, 139 | "n": step, 140 | "is_training": False 141 | } 142 | tasks.append(pool.apply_async(work, kwds=kwds, callback=callback)) 143 | pool.close() 144 | pool.join() 145 | for i in range(n_tasks): 146 | katz[i * step: (i + 1) * step], \ 147 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get() 148 | 149 | # add feature to data-frame 150 | testing["katz"] = katz 151 | testing["katz_2"] = katz_2 152 | 153 | print("done, saving data") 154 | # save data-frame 155 | training.to_csv(path_to_data + "training_features.txt") 156 | testing.to_csv(path_to_data + "testing_features.txt") 157 | -------------------------------------------------------------------------------- /feature_engineering/networkx_bigraph_long2.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes some features by considering the bidirectional graph of citations: jaccard, adar, 9 | # preferential_attachment, resource_allocation_index and 
# computing features for training set
for i in tqdm(range(len(id1))):
    # for positive pairs, hide the pair's own edge while scoring (no target leakage)
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])

    katz_acc = 0.0
    katz_2_acc = 0.0
    counter = 0
    try:
        iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i])
        # truncated Katz-like score: sum len(p) * beta ** len(p) over at most
        # `breaking_point` shortest paths
        for p in iterator:
            len_p = len(p)
            katz_acc += len_p * (beta ** len_p)
            katz_2_acc += len_p * (beta_2 ** len_p)
            counter += 1
            if counter >= breaking_point:
                break
        # BUG FIX: the original assigned both accumulators to katz[i]
        # (katz[i] = katz_acc; katz[i] = katz_2_acc), so katz[i] held the
        # beta_2 score and katz_2[i] stayed 0 on the success path.
        katz[i] = katz_acc
        katz_2[i] = katz_2_acc
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # no path between the two papers: sentinel -1 for both features
        # (bare `except:` narrowed to what all_shortest_paths raises)
        katz[i] = -1
        katz_2[i] = -1

    # restore the temporarily removed edge
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.add_edge(id1[i], id2[i])

# add feature to data-frame
training["katz"] = katz
training["katz_2"] = katz_2
training set 75 | id1 = testing['id1'].values 76 | id2 = testing['id2'].values 77 | 78 | # placeholder for feature 79 | n = len(id1) 80 | katz = np.zeros(n) 81 | katz_2 = np.zeros(n) 82 | 83 | # computing features for training set 84 | for i in tqdm(range(len(id1))): 85 | katz_acc = 0.0 86 | katz_2_acc = 0.0 87 | counter = 0 88 | try: 89 | iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i]) 90 | for p in iterator: 91 | len_p = len(p) 92 | katz_acc += len_p * (beta ** len_p) 93 | katz_2_acc += len_p * (beta_2 ** len_p) 94 | counter += 1 95 | if counter >= breaking_point: 96 | break 97 | katz[i] = katz_acc 98 | katz[i] = katz_2_acc 99 | except: 100 | katz[i] = -1 101 | katz_2[i] = -1 102 | 103 | # add feature to data-frame 104 | testing["katz"] = katz 105 | testing["katz_2"] = katz_2 106 | 107 | # save data-frame 108 | training.to_csv(path_to_data + "training_features.txt") 109 | testing.to_csv(path_to_data + "testing_features.txt") 110 | -------------------------------------------------------------------------------- /feature_engineering/networkx_digraph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes the features out_neighbors, in_neighbors and popularity by considering the directed 9 | # graph of citations. Popularity is the sum of in degrees of predecessors. 
10 | # the script takes approximately 5 minutes to run 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | G = nx.DiGraph() 29 | G.add_nodes_from(nodes.index.values) 30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"])) 31 | 32 | # IDs for training set 33 | id1 = training['id1'].values 34 | id2 = training['id2'].values 35 | 36 | # placeholder for feature 37 | n = len(id1) 38 | out_neighbors = np.zeros(n) 39 | in_neighbors = np.zeros(n) 40 | popularity = np.zeros(n) 41 | common_predecessors = np.zeros(n) 42 | common_successors = np.zeros(n) 43 | paths_of_length_one = np.zeros(n) 44 | 45 | # computing features for training set 46 | for i in tqdm(range(len(id1))): 47 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 48 | G.remove_edge(id1[i], id2[i]) 49 | 50 | in_neighbors[i] = G.in_degree(id2[i]) 51 | out_neighbors[i] = G.out_degree(id1[i]) 52 | 53 | current_common_successors = 0 54 | current_common_predecessors = 0 55 | current_paths_of_length_one = 0 56 | 57 | predecessors_2 = G.predecessors(id2[i]) 58 | predecessors_1 = G.predecessors(id1[i]) 59 | 60 | pop = 0 61 | for p in predecessors_2: 62 | pop += G.in_degree(p) 63 | if p in predecessors_1: 64 | current_common_predecessors += 1 65 | popularity[i] = pop 66 | 67 | successors_2 = G.successors(id2[i]) 68 | successors_1 = G.successors(id1[i]) 69 | 
70 | for p in successors_1: 71 | if p in successors_2: 72 | current_common_successors += 1 73 | 74 | for p in successors_1: 75 | if p in predecessors_2: 76 | current_paths_of_length_one += 1 77 | 78 | common_successors[i] = current_common_successors 79 | common_predecessors[i] = current_common_predecessors 80 | paths_of_length_one[i] = current_paths_of_length_one 81 | 82 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 83 | G.add_edge(id1[i], id2[i]) 84 | 85 | # add feature to data-frame 86 | training["out_neighbors"] = out_neighbors 87 | training["in_neighbors"] = in_neighbors 88 | training["popularity"] = popularity 89 | training["common_successors"] = out_neighbors 90 | training["common_predecessors"] = in_neighbors 91 | training["paths_of_length_one"] = popularity 92 | 93 | # IDs for training set 94 | id1 = testing['id1'].values 95 | id2 = testing['id2'].values 96 | 97 | # placeholder for feature 98 | n = len(id1) 99 | out_neighbors = np.zeros(n) 100 | in_neighbors = np.zeros(n) 101 | popularity = np.zeros(n) 102 | 103 | # computing features for training set 104 | for i in tqdm(range(len(id1))): 105 | 106 | in_neighbors[i] = G.in_degree(id2[i]) 107 | out_neighbors[i] = G.out_degree(id1[i]) 108 | 109 | current_common_successors = 0 110 | current_common_predecessors = 0 111 | current_paths_of_length_one = 0 112 | 113 | predecessors_2 = G.predecessors(id2[i]) 114 | predecessors_1 = G.predecessors(id1[i]) 115 | 116 | pop = 0 117 | for p in predecessors_2: 118 | pop += G.in_degree(p) 119 | if p in predecessors_1: 120 | current_common_predecessors += 1 121 | popularity[i] = pop 122 | 123 | successors_2 = G.successors(id2[i]) 124 | successors_1 = G.successors(id1[i]) 125 | 126 | for p in successors_1: 127 | if p in successors_2: 128 | current_common_successors += 1 129 | 130 | for p in successors_1: 131 | if p in predecessors_2: 132 | current_paths_of_length_one += 1 133 | 134 | common_successors[i] = current_common_successors 135 | 
common_predecessors[i] = current_common_predecessors 136 | paths_of_length_one[i] = current_paths_of_length_one 137 | 138 | popularity[i] = pop 139 | 140 | # add feature to data-frame 141 | testing["out_neighbors"] = out_neighbors 142 | testing["in_neighbors"] = in_neighbors 143 | testing["popularity"] = popularity 144 | testing["common_successors"] = out_neighbors 145 | testing["common_predecessors"] = in_neighbors 146 | testing["paths_of_length_one"] = popularity 147 | 148 | # save data-frame 149 | training.to_csv(path_to_data + "training_features.txt") 150 | testing.to_csv(path_to_data + "testing_features.txt") 151 | -------------------------------------------------------------------------------- /feature_engineering/preprocessing.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | from feature_engineering.tools import \ 6 | text_element_wise_preprocess, \ 7 | authors_element_wise_preprocess, \ 8 | journal_element_wise_preprocess 9 | 10 | # This script reads the data in node_information.csv and training_set and testing_set.csv, and creates the 11 | # files "nodes_preprocessed.csv", "training_new_index.txt" and "testing_new_index.txt". 
12 | 13 | 14 | # pre-processing tools 15 | nltk.download('punkt') # for tokenization 16 | nltk.download('stopwords') 17 | 18 | # progress bar for pandas 19 | tqdm.pandas(tqdm()) 20 | 21 | # path 22 | path_to_data = "data/" 23 | 24 | # pre-processing tools 25 | nltk.download('punkt') # for tokenization 26 | nltk.download('stopwords') 27 | stpwds = set(nltk.corpus.stopwords.words("english")) 28 | stemmer = nltk.stem.PorterStemmer() 29 | 30 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"] 31 | nodes = pd.read_csv(path_to_data + "node_information.csv", names=nodes_header) 32 | nodes.set_index("id", inplace=True) 33 | 34 | # apply to DF 35 | nodes['title'] = nodes['title'].progress_apply(text_element_wise_preprocess) 36 | nodes['abstract'] = nodes['abstract'].progress_apply(text_element_wise_preprocess) 37 | nodes['authors'] = nodes['authors'].progress_apply(authors_element_wise_preprocess) 38 | nodes['journal'] = nodes['journal'].progress_apply(journal_element_wise_preprocess) 39 | 40 | # loading train 41 | names = ["id1", "id2", "target"] 42 | training = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ") 43 | 44 | # indexing consistent throughout project 45 | training["my_index"] = training["id1"].astype(str) + "|" + training["id2"].astype(str) 46 | training.set_index("my_index", inplace=True) 47 | 48 | # same process for testing set 49 | names = ["id1", "id2"] 50 | testing = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ") 51 | testing["my_index"] = testing["id1"].astype(str) + "|" + testing["id2"].astype(str) 52 | testing.set_index("my_index", inplace=True) 53 | 54 | # save preprocessed data sets 55 | nodes.to_csv(path_to_data + "nodes_preprocessed.csv") 56 | training.to_csv(path_to_data + "training_new_index.txt") 57 | testing.to_csv(path_to_data + "testing_new_index.txt") 58 | -------------------------------------------------------------------------------- /feature_engineering/tools.py: 
-------------------------------------------------------------------------------- 1 | import ast 2 | 3 | import nltk 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | # journal similarity feature 9 | def compare_journals(journal1, journal2): 10 | if len(journal1) == 0 or len(journal2) == 0: 11 | return 0 12 | if journal1[0] == journal2[0]: 13 | return 1 + compare_journals(journal1[1:], journal2[1:]) 14 | else: 15 | return 0 16 | 17 | 18 | # nan-proof string converter wrapper 19 | def lit_eval_nan_proof(string): 20 | if len(string) == 0: 21 | return np.nan 22 | else: 23 | return ast.literal_eval(string) 24 | 25 | 26 | # element-wise stemmed tokenization and stopwords removal for titles and abstracts 27 | def text_element_wise_preprocess(string): 28 | stpwds = set(nltk.corpus.stopwords.words("english")) 29 | stemmer = nltk.stem.PorterStemmer() 30 | tokens = string.lower().split(" ") 31 | tokens_wo_stpwds = [stemmer.stem(token) for token in tokens if token not in stpwds] 32 | return tokens_wo_stpwds 33 | 34 | 35 | # element-wise lower case tokenization for authors 36 | def authors_element_wise_preprocess(string): 37 | if pd.isna(string): 38 | return string 39 | tokens = string.lower().split(", ") 40 | for i in range(len(tokens)): 41 | tokens[i] = tokens[i].split('(', 1)[0].strip(' ') 42 | return tokens 43 | 44 | 45 | # element-wise lower case tokenization for journals 46 | def journal_element_wise_preprocess(string): 47 | if pd.isna(string): 48 | return string 49 | tokens = string.lower().rstrip(".").split(".") 50 | return tokens 51 | -------------------------------------------------------------------------------- /link-prediction-report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/link-prediction-report.pdf -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | # feature engineering 2 | 3 | import time 4 | 5 | start = time.time() 6 | print("preprocessing:") 7 | import feature_engineering.preprocessing 8 | 9 | end = time.time() 10 | print("done in: " + str(end - start)) 11 | 12 | start = time.time() 13 | print("baseline_feature_engineering:") 14 | import feature_engineering.baseline_feature_engineering 15 | 16 | end = time.time() 17 | print("done in: " + str(end - start)) 18 | 19 | start = time.time() 20 | print("basic_features:") 21 | import feature_engineering.basic_features 22 | 23 | end = time.time() 24 | print("done in: " + str(end - start)) 25 | 26 | start = time.time() 27 | print("cosine_distance:") 28 | import feature_engineering.cosine_distance 29 | 30 | end = time.time() 31 | print("done in: " + str(end - start)) 32 | 33 | start = time.time() 34 | print("networkx_bigraph:") 35 | import feature_engineering.networkx_bigraph 36 | 37 | end = time.time() 38 | print("done in: " + str(end - start)) 39 | 40 | start = time.time() 41 | print("networkx_digraph:") 42 | import feature_engineering.networkx_digraph 43 | 44 | end = time.time() 45 | print("done in: " + str(end - start)) 46 | 47 | start = time.time() 48 | print("author's features:") 49 | import feature_engineering.authors 50 | 51 | end = time.time() 52 | print("done in: " + str(end - start)) 53 | 54 | start = time.time() 55 | print("author's features:") 56 | import feature_engineering.authors_2 57 | 58 | end = time.time() 59 | print("done in: " + str(end - start)) 60 | 61 | # models : train them and store the output probits for stacking purposes 62 | 63 | start = time.time() 64 | print("SVM:") 65 | import models.svm 66 | 67 | end = time.time() 68 | print("done in: " + str(end - start)) 69 | 70 | start = time.time() 71 | print("Random Forest:") 72 | import models.random_forest 73 | 74 | end = time.time() 75 | print("done in: " + str(end - start)) 76 | 77 | start = time.time() 78 | 
print("LightGBM:") 79 | import models.lgbm 80 | 81 | end = time.time() 82 | print("done in: " + str(end - start)) 83 | 84 | start = time.time() 85 | print("shallow NN:") 86 | import models.nn 87 | 88 | end = time.time() 89 | print("done in: " + str(end - start)) 90 | 91 | start = time.time() 92 | print("deep NN:") 93 | import models.nn_deep 94 | 95 | end = time.time() 96 | print("done in: " + str(end - start)) 97 | 98 | # train the model stack and generate final submission "stack_sub_rf.csv" 99 | 100 | start = time.time() 101 | print("stack :") 102 | import stacking.stacking 103 | 104 | end = time.time() 105 | print("done in: " + str(end - start)) 106 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/__init__.py -------------------------------------------------------------------------------- /models/camboui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "date: 2018-02-16 16:16:34.322166\n", 13 | "features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance', 'shortest_path', 'jaccard', 'adar', 'preferential_attachment', 'resource_allocation_index', 'out_neighbors', 'in_neighbors', 'popularity', 'common_neighbors']\n", 14 | "model: Random Forest\n", 15 | "parameters:\n", 16 | "{'n_estimators': 10}\n", 17 | "cross validation:\n", 18 | "train: 0.9966042778250185\n", 19 | "test: 0.9720086406139066\n", 20 | "train: 0.9967559756127237\n", 21 | "test: 0.9717386898461955\n", 22 | "train: 0.9965911639381028\n", 23 | "test: 
0.9717295528568946\n", 24 | "train: 0.9965775073031881\n", 25 | "test: 0.9722326963394218\n", 26 | "train: 0.9965775816654026\n", 27 | "test: 0.9718838998969885\n", 28 | "kaggle score: \n", 29 | "overlap_title: 0.01665081496613972\n", 30 | "date_diff: 0.02190514883983991\n", 31 | "common_author: 0.005109300600450039\n", 32 | "journal_similarity: 0.002403034365747304\n", 33 | "shortest_path: 0.019781629572646377\n", 34 | "overlapping_words_abstract: 0.01535330054775155\n", 35 | "jaccard: 0.19108201273444772\n", 36 | "adar: 0.006316136251304461\n", 37 | "preferential_attachment: 0.052909861150268744\n", 38 | "resource_allocation_index: 0.43101242342404056\n", 39 | "out_neighbors: 0.015096505980321603\n", 40 | "in_neighbors: 0.015219239486567731\n", 41 | "popularity: 0.018873794971630692\n", 42 | "common_neighbors: 0.18828679710884363\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import datetime\n", 48 | "from sklearn.model_selection import KFold\n", 49 | "from sklearn.ensemble import RandomForestClassifier\n", 50 | "from sklearn.metrics import accuracy_score\n", 51 | "import pandas as pd\n", 52 | "import numpy as np\n", 53 | "\n", 54 | "from tools import f1_score\n", 55 | "\n", 56 | "# path\n", 57 | "path_to_data = \"../../data/\"\n", 58 | "path_to_submissions = \"../../submissions/\"\n", 59 | "\n", 60 | "parameters = {\n", 61 | " \"n_estimators\": 10\n", 62 | "}\n", 63 | "# parameters\n", 64 | "\n", 65 | "# load data\n", 66 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 67 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 68 | "del training[\"my_index\"]\n", 69 | "del testing[\"my_index\"]\n", 70 | "\n", 71 | "# replace inf in shortest_path with -1\n", 72 | "training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1])\n", 73 | "testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1])\n", 74 | "\n", 75 | "my_features_string = [\n", 76 | " \"overlap_title\",\n", 77 | " 
\"date_diff\",\n", 78 | " \"common_author\",\n", 79 | " \"journal_similarity\",\n", 80 | " \"overlapping_words_abstract\",\n", 81 | " \"cosine_distance\",\n", 82 | " \"shortest_path\",\n", 83 | " \"jaccard\",\n", 84 | " \"adar\",\n", 85 | " \"preferential_attachment\",\n", 86 | " \"resource_allocation_index\",\n", 87 | " \"out_neighbors\",\n", 88 | " \"in_neighbors\",\n", 89 | " \"popularity\",\n", 90 | " \"common_neighbors\"\n", 91 | "]\n", 92 | "my_features_index = []\n", 93 | "my_features_dic = {}\n", 94 | "\n", 95 | "target = 0\n", 96 | "for i in range(len(training.columns)):\n", 97 | " if training.columns[i] == \"target\":\n", 98 | " target = i\n", 99 | " elif training.columns[i] in my_features_string:\n", 100 | " my_features_dic.update({len(my_features_index): training.columns[i]})\n", 101 | " my_features_index.append(i)\n", 102 | "\n", 103 | "# separating features and labels\n", 104 | "training_val = training.values\n", 105 | "testing_val = testing.values\n", 106 | "X_train, Y_train = training_val[:, my_features_index].astype(float), training_val[:, target].astype(int)\n", 107 | "X_test = testing_val[:, my_features_index]\n", 108 | "\n", 109 | "now = datetime.datetime.now()\n", 110 | "print(\"date: \"+str(now))\n", 111 | "print(\"features: \"+str(my_features_string))\n", 112 | "print(\"model: Random Forest\")\n", 113 | "print(\"parameters:\")\n", 114 | "print(parameters)\n", 115 | "print(\"cross validation:\")\n", 116 | "\n", 117 | "RF = RandomForestClassifier(n_estimators=parameters[\"n_estimators\"])\n", 118 | "k = 5\n", 119 | "kf = KFold(k)\n", 120 | "predictions = np.zeros((X_test.shape[0], k))\n", 121 | "i = 0\n", 122 | "\n", 123 | "for train_index, test_index in kf.split(X_train, Y_train):\n", 124 | " RF.fit(X_train[train_index], Y_train[train_index])\n", 125 | " Y_pred = RF.predict(X_train[test_index])\n", 126 | " Y_pred_train = RF.predict(X_train[train_index])\n", 127 | " predictions[:, i] = RF.predict(X_test)\n", 128 | " print(\"train: 
\"+str(f1_score(Y_train[train_index], Y_pred_train)))\n", 129 | " print(\"test: \"+str(f1_score(Y_train[test_index], Y_pred)))\n", 130 | " i += 1\n", 131 | "\n", 132 | "Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)\n", 133 | "\n", 134 | "submission = pd.DataFrame(Y_test)\n", 135 | "submission.to_csv(\n", 136 | " path_or_buf=path_to_submissions+\"-\".join(my_features_string)+\".csv\",\n", 137 | " index=True,\n", 138 | " index_label=\"id\",\n", 139 | " header=[\"category\"]\n", 140 | ")\n", 141 | "print(\"kaggle score: \")\n", 142 | "\n", 143 | "for i in range(len(RF.feature_importances_)):\n", 144 | " print(str(my_features_dic[i]) + \": \" + str(RF.feature_importances_[i]))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
id1id2targetoverlap_titledate_diffcommon_authorjournal_similarityshortest_pathoverlapping_words_abstractjaccardadarpreferential_attachmentresource_allocation_indexout_neighborsin_neighborspopularity
09510123950211412002-1.040.0666670.51389855.00.1428572.07.076.0
197070759604178111002.070.0980394.32036611388.00.22640167.0123.04019.0
29312155950614200-200-1.060.0000000.0000005.00.0000000.02.08.0
3991125530216500-400-1.080.0000000.000000280.00.00000016.02.03.0
4970103320907600-500-1.080.0000000.000000168.00.0000000.02.01.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " id1 id2 target overlap_title date_diff common_author \\\n", 293 | "0 9510123 9502114 1 2 0 0 \n", 294 | "1 9707075 9604178 1 1 1 0 \n", 295 | "2 9312155 9506142 0 0 -2 0 \n", 296 | "3 9911255 302165 0 0 -4 0 \n", 297 | "4 9701033 209076 0 0 -5 0 \n", 298 | "\n", 299 | " journal_similarity shortest_path overlapping_words_abstract jaccard \\\n", 300 | "0 2 -1.0 4 0.066667 \n", 301 | "1 0 2.0 7 0.098039 \n", 302 | "2 0 -1.0 6 0.000000 \n", 303 | "3 0 -1.0 8 0.000000 \n", 304 | "4 0 -1.0 8 0.000000 \n", 305 | "\n", 306 | " adar preferential_attachment resource_allocation_index \\\n", 307 | "0 0.513898 55.0 0.142857 \n", 308 | "1 4.320366 11388.0 0.226401 \n", 309 | "2 0.000000 5.0 0.000000 \n", 310 | "3 0.000000 280.0 0.000000 \n", 311 | "4 0.000000 168.0 0.000000 \n", 312 | "\n", 313 | " out_neighbors in_neighbors popularity \n", 314 | "0 2.0 7.0 76.0 \n", 315 | "1 67.0 123.0 4019.0 \n", 316 | "2 0.0 2.0 8.0 \n", 317 | "3 16.0 2.0 3.0 \n", 318 | "4 0.0 2.0 1.0 " 319 | ] 320 | }, 321 | "execution_count": 3, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "training.head()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 3", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.5.2" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 2 359 | } 360 | -------------------------------------------------------------------------------- /models/feature_selection.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime\n", 10 | "from sklearn.model_selection import KFold\n", 11 | "from sklearn.ensemble import RandomForestClassifier\n", 12 | "from sklearn.metrics import accuracy_score\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "\n", 16 | "from tools import f1_score\n", 17 | "\n", 18 | "# path\n", 19 | "path_to_data = \"../../data/\"\n", 20 | "path_to_submissions = \"../../submissions/\"\n", 21 | "\n", 22 | "parameters = {\n", 23 | " \"n_estimators\": 10,\n", 24 | " \"criterion\": \"entropy\", # default = gini\n", 25 | " \"bootstrap\": True\n", 26 | "}\n", 27 | "# parameters\n", 28 | "\n", 29 | "# load data\n", 30 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 31 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 32 | "del training[\"my_index\"]\n", 33 | "del testing[\"my_index\"]\n", 34 | "\n", 35 | "\n", 36 | "import pandas\n", 37 | "import numpy\n", 38 | "from sklearn.feature_selection import SelectKBest\n", 39 | "from sklearn.feature_selection import chi2\n", 40 | "# load data\n", 41 | "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data\"\n", 42 | "names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n", 43 | "dataframe = pandas.read_csv(url, names=names)\n", 44 | "array = dataframe.values\n", 45 | "X = array[:,0:8]\n", 46 | "Y = array[:,8]\n", 47 | "# feature extraction\n", 48 | "test = SelectKBest(score_func=chi2, k=4)\n", 49 | "fit = test.fit(X, Y)\n", 50 | "# summarize scores\n", 51 | "numpy.set_printoptions(precision=3)\n", 52 | "print(fit.scores_)\n", 53 | "features = fit.transform(X)\n", 54 | "# summarize selected features\n", 55 | "print(features[0:5,:])" 56 | ] 57 | } 58 | ], 59 
| "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.5.2" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 0 80 | } 81 | -------------------------------------------------------------------------------- /models/feature_selection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.ensemble import RandomForestClassifier 3 | from sklearn.model_selection import KFold 4 | 5 | from models.tools import f1_score 6 | 7 | # path 8 | path_to_data = "data/" 9 | path_to_submissions = "submissions/" 10 | path_to_stacking = "stacking" 11 | path_to_plots = "plots" 12 | 13 | # tuned hyper-parameters 14 | 15 | parameters = { 16 | "n_estimators": 100, 17 | "criterion": "entropy", # default = gini 18 | "max_depth": 20, 19 | "min_samples_leaf": 10, 20 | "bootstrap": True, 21 | "n_jobs": -1 22 | } 23 | 24 | # load data 25 | training = pd.read_csv(path_to_data + "training_features.txt") 26 | del training["my_index"] 27 | 28 | # replace inf in shortest_path with -1 29 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1]) 30 | 31 | my_features_string = [ 32 | "date_diff", 33 | "overlap_title", 34 | "common_author", 35 | "score_1_2", 36 | "score_2_1", 37 | "cosine_distance", 38 | "journal_similarity", 39 | "overlapping_words_abstract", 40 | "jaccard", 41 | "adar", 42 | "preferential_attachment", 43 | "resource_allocation_index", 44 | "out_neighbors", 45 | "in_neighbors", 46 | "common_neighbors", 47 | "shortest_path", 48 | "popularity", 49 | "common_successors", 50 | "common_predecessors", 51 | "paths_of_length_one", 52 | 
"authors_citation", 53 | "normalized_authors_citation", 54 | "best_authors_citation", 55 | "coauthor_score", 56 | "normalized_coauthor_score", 57 | "best_coauthor_score", 58 | "authors_in_neighbors", 59 | "normalized_authors_in_neighbors", 60 | "best_authors_in_neighbors" 61 | ] 62 | 63 | my_features_index = [] 64 | my_features_dic = {} 65 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 66 | 67 | target = 0 68 | for i in range(len(training.columns)): 69 | if training.columns[i] == "target": 70 | target = i 71 | 72 | Y_train = training.values[:, target].astype(int) 73 | 74 | del training["target"] 75 | 76 | already_computed_names = [] 77 | already_computed = [] 78 | 79 | for i in range(len(training.columns)): 80 | if training.columns[i] in my_features_string: 81 | my_features_dic.update({i: training.columns[i]}) 82 | my_features_index.append(i) 83 | if training.columns[i] in already_computed_names: 84 | already_computed.append(i) 85 | 86 | features_to_keep = [] 87 | for u in range(len(my_features_index)): 88 | 89 | try: 90 | features_to_keep.append(already_computed[u]) 91 | print("added already computed feature " + str(my_features_dic[already_computed[u]])) 92 | except: 93 | 94 | features_to_keep_names = [my_features_dic[i] for i in features_to_keep] 95 | print("new round !") 96 | print("u = " + str(u) + ", current features are: " + str(features_to_keep_names)) 97 | best_test_score = 0.0 98 | best_train_score = 0.0 99 | best_index = 0 100 | for i, f in my_features_dic.items(): 101 | if i not in features_to_keep: 102 | # separating features and labels 103 | print("testing additional feature: " + f) 104 | current_features = features_to_keep + [i] 105 | 106 | X_train = training.values[:, current_features] 107 | 108 | RF = RandomForestClassifier( 109 | n_estimators=parameters["n_estimators"], 110 | criterion=parameters["criterion"], 111 | max_depth=parameters["max_depth"], 112 | 
min_samples_leaf=parameters["min_samples_leaf"], 113 | bootstrap=parameters["bootstrap"], 114 | n_jobs=parameters["n_jobs"] 115 | ) 116 | k = 2 117 | kf = KFold(k) 118 | train_score = 0.0 119 | test_score = 0.0 120 | 121 | for train_index, test_index in kf.split(X_train, Y_train): 122 | RF.fit(X_train[train_index], Y_train[train_index]) 123 | Y_pred = RF.predict(X_train[test_index]) 124 | Y_pred_train = RF.predict(X_train[train_index]) 125 | train_score += f1_score(Y_train[train_index], Y_pred_train) 126 | test_score += f1_score(Y_train[test_index], Y_pred) 127 | 128 | train_score /= k 129 | test_score /= k 130 | 131 | if test_score > best_test_score: 132 | best_index = i 133 | best_train_score = train_score 134 | best_test_score = test_score 135 | 136 | print("train score: " + str(train_score)) 137 | print("test score: " + str(test_score)) 138 | print("") 139 | 140 | print("for this round, the best feature was " + my_features_dic[best_index]) 141 | features_to_keep.append(best_index) 142 | print("the scores obtained were: ") 143 | print("train score: " + str(best_train_score)) 144 | print("test score: " + str(best_test_score)) 145 | print("\n\n\n\n") 146 | 147 | # # print feature importances 148 | # for i in range(len(RF.feature_importances_)): 149 | # print(str(my_features_dic[i]) + ": " + str(RF.feature_importances_[i])) 150 | -------------------------------------------------------------------------------- /models/lgbm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection import KFold 7 | 8 | from models.tools import f1_score, f1_score_lgbm, load_data 9 | 10 | # path 11 | path_to_data = "data/" 12 | path_to_submissions = "submissions/" 13 | path_to_stacking = "stacking/" 14 | path_to_plots = "models/plots/" 15 | 16 | # tuned hyper-parameters 17 | parameters = { 18 | 'task': 'train', 19 | 
'boosting_type': 'gbdt', 20 | 'objective': 'binary', 21 | # 'metric': {}, 22 | 'num_leaves': 200, 23 | 'learning_rate': 0.1, 24 | 'feature_fraction': 0.5, 25 | 'bagging_fraction': 0.6, 26 | 'bagging_freq': 5, 27 | 'verbose': 0, 28 | "min_data_in_leaf": 3, 29 | "max_depth": 150 30 | } 31 | # used features 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | "overlap_title", 36 | "common_author", 37 | # "score_1_2", 38 | # "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | # "jaccard", 43 | # "adar", 44 | "preferential_attachment", 45 | # "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | "shortest_path", 50 | "popularity", 51 | "common_successors", 52 | "common_predecessors", 53 | "paths_of_length_one", 54 | "authors_citation", 55 | "coauthor_score" 56 | # "katz" 57 | # "katz_2" 58 | ] 59 | 60 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 61 | 62 | # load data 63 | 64 | (X_train, 65 | X_test, 66 | Y_train, 67 | my_features_index, 68 | my_features_dic) = load_data(my_features_string) 69 | 70 | # print user info 71 | now = datetime.datetime.now() 72 | print("date: " + str(now)) 73 | print("features: " + str(my_features_string)) 74 | print("model: LGBM") 75 | print("parameters:") 76 | print(parameters) 77 | print("cross validation:") 78 | 79 | # instantiate Kfold and predictions placeholder 80 | k = 5 81 | kf = KFold(k) 82 | predictions = np.zeros((X_test.shape[0], k)) 83 | predictions_train = np.zeros(X_train.shape[0]) 84 | i = 0 85 | 86 | # for each fold store predictions on test set and print validation results 87 | results = [] 88 | print('Start training...') 89 | for train_index, test_index in kf.split(X_train): 90 | lgb_train = lgb.Dataset(X_train[train_index], Y_train[train_index]) 91 | lgb_eval = lgb.Dataset(X_train[test_index], Y_train[test_index], reference=lgb_train) 92 | gbm = 
lgb.train(parameters, 93 | train_set=lgb_train, 94 | num_boost_round=100, 95 | valid_sets=lgb_eval, 96 | verbose_eval=40, 97 | feval=f1_score_lgbm 98 | ) 99 | res = gbm.predict(X_test) 100 | Y_pred = gbm.predict(X_train[test_index]) 101 | Y_pred_train = gbm.predict(X_train[train_index]) 102 | predictions[:, i] = res 103 | predictions_train[test_index] = Y_pred 104 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train.round()))) 105 | print("test: " + str(f1_score(Y_train[test_index], Y_pred.round()))) 106 | i += 1 107 | 108 | # save submission file 109 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 110 | submission = pd.DataFrame(Y_test) 111 | submission.to_csv( 112 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "lgbm" + ".csv", 113 | index=True, 114 | index_label="id", 115 | header=["category"] 116 | ) 117 | 118 | # save probabilities for stacking 119 | stacking_logits_test = np.sum(predictions, axis=1) 120 | stacking_test = pd.DataFrame(stacking_logits_test) 121 | stacking_test.to_csv( 122 | path_or_buf=path_to_stacking + "lgbm_test" + ".csv", 123 | index=True, 124 | index_label="id", 125 | header=["category"] 126 | ) 127 | 128 | stacking_train = pd.DataFrame(predictions_train) 129 | stacking_train.to_csv( 130 | path_or_buf=path_to_stacking + "lgbm_train" + ".csv", 131 | index=True, 132 | index_label="id", 133 | header=["category"] 134 | ) 135 | -------------------------------------------------------------------------------- /models/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegressionCV 6 | from sklearn.model_selection import KFold 7 | 8 | from models.tools import f1_score 9 | 10 | # path 11 | path_to_data = "data/" 12 | path_to_submissions = "submissions/" 13 | 14 | # parameters 15 | parameters = { 16 | "max_iter": 100, 17 | "tol": 1e-6, 18 
| "penalty": "l2" 19 | } 20 | 21 | # load data 22 | training = pd.read_csv(path_to_data + "training_features.txt") 23 | testing = pd.read_csv(path_to_data + "testing_features.txt") 24 | del training["my_index"] 25 | del testing["my_index"] 26 | 27 | # replace inf in shortest_path with -1 28 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1]) 29 | testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1]) 30 | 31 | my_features_string = [ 32 | "date_diff", 33 | "overlap_title", 34 | "common_author", 35 | "score_1_2", 36 | # "score_2_1", 37 | # "cosine_distance", 38 | # "journal_similarity", 39 | # "overlapping_words_abstract", 40 | # "jaccard", 41 | # "adar", 42 | # "preferential_attachment", 43 | # "resource_allocation_index", 44 | # "out_neighbors", 45 | # "in_neighbors", 46 | # "common_neighbors", 47 | "shortest_path", 48 | "popularity" 49 | ] 50 | my_features_index = [] 51 | my_features_dic = {} 52 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 53 | 54 | target = 0 55 | for i in range(len(training.columns)): 56 | if training.columns[i] == "target": 57 | target = i 58 | 59 | Y_train = training.values[:, target].astype(int) 60 | 61 | del training["target"] 62 | 63 | for i in range(len(training.columns)): 64 | if training.columns[i] in my_features_string: 65 | my_features_dic.update({i: training.columns[i]}) 66 | my_features_index.append(i) 67 | 68 | # separating features and labels 69 | training_val = training.values 70 | testing_val = testing.values 71 | X_train = training_val[:, my_features_index].astype(float) 72 | X_test = testing_val[:, my_features_index] 73 | 74 | now = datetime.datetime.now() 75 | print("date: " + str(now)) 76 | print("features: " + str(my_features_string)) 77 | print("model: Random Forest") 78 | print("parameters:") 79 | print(parameters) 80 | print("cross validation:") 81 | 82 | LogReg = 
LogisticRegressionCV(max_iter=parameters['max_iter'], 83 | tol=parameters['tol'], 84 | penalty=parameters['penalty']) 85 | k = 5 86 | kf = KFold(k) 87 | predictions = np.zeros((X_test.shape[0], k)) 88 | i = 0 89 | 90 | for train_index, test_index in kf.split(X_train, Y_train): 91 | LogReg.fit(X_train[train_index], Y_train[train_index]) 92 | Y_pred = LogReg.predict(X_train[test_index]) 93 | Y_pred_train = LogReg.predict(X_train[train_index]) 94 | predictions[:, i] = LogReg.predict(X_test) 95 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train))) 96 | print("test: " + str(f1_score(Y_train[test_index], Y_pred))) 97 | i += 1 98 | 99 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 100 | 101 | # submission = pd.DataFrame(Y_test) 102 | # submission.to_csv( 103 | # path_or_buf=path_to_submissions+"-".join(my_features_string)+"LogReg.csv", 104 | # index=True, 105 | # index_label="id", 106 | # header=["category"] 107 | # ) 108 | -------------------------------------------------------------------------------- /models/nn.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy 4 | import numpy as np 5 | import pandas as pd 6 | from keras.layers import Dense, Dropout 7 | from keras.models import Sequential 8 | from keras.wrappers.scikit_learn import KerasClassifier 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from models.tools import load_data 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # tuned hyper-parameters 21 | 22 | parameters = { 23 | "n_estimators": 150, 24 | "criterion": "entropy", # default = gini 25 | "max_depth": 15, # 9 26 | "min_samples_leaf": 4, # 10 27 | "bootstrap": True, 28 | "n_jobs": -1 29 | } 30 | 31 | # features used 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | 
"overlap_title", 36 | "common_author", 37 | "score_1_2", 38 | "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | "jaccard", 43 | "adar", 44 | "preferential_attachment", 45 | "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | # "shortest_path", 50 | "popularity", 51 | "authors_citation", 52 | "coauthor_score" 53 | # "paths_of_length_one" 54 | # "katz" 55 | # "katz_2" 56 | ] 57 | 58 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 59 | 60 | (X_train, 61 | X_test, 62 | Y_train, 63 | my_features_index, 64 | my_features_dic) = load_data(my_features_string) 65 | 66 | # normalize data 67 | scaler = StandardScaler() 68 | X_train = scaler.fit_transform(X_train) 69 | X_test = scaler.transform(X_test) 70 | 71 | # Function to create model, required for KerasClassifier 72 | nb_input = len(my_features_string) 73 | 74 | 75 | def create_model(neurons=1, dropout_rate=0.1, activation='relu'): 76 | # create model 77 | model = Sequential() 78 | model.add(Dense(neurons, input_dim=nb_input, activation=activation)) 79 | model.add(Dropout(dropout_rate)) 80 | model.add(Dense(1, input_dim=nb_input, activation='sigmoid')) 81 | # Compile model 82 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 83 | return model 84 | 85 | 86 | # parameters 87 | epochs = 30 88 | batch_size = 128 89 | 90 | # tuned parameters 91 | dropout_rate = 0.2 92 | neurons = 75 93 | 94 | # fix random seed for reproducibility 95 | seed = 7 96 | numpy.random.seed(seed) 97 | 98 | # instantiate classifier 99 | nn = KerasClassifier(build_fn=create_model, 100 | epochs=epochs, 101 | batch_size=batch_size, 102 | dropout_rate=dropout_rate, 103 | neurons=neurons, 104 | verbose=1 105 | ) 106 | 107 | # print user info 108 | now = datetime.datetime.now() 109 | print("date: " + str(now)) 110 | print("features: " + 
str(my_features_string)) 111 | print("model: Neural Network") 112 | print("parameters:") 113 | print(parameters) 114 | print("cross validation:") 115 | 116 | # instantiate Kfold and predictions placeholder 117 | k = 5 118 | kf = StratifiedKFold(k) 119 | predictions = np.zeros((X_test.shape[0], k)) 120 | predictions_test = np.zeros((X_test.shape[0], k)) 121 | predictions_train = np.zeros(X_train.shape[0]) 122 | i = 0 123 | 124 | # for each fold store predictions on test set and print validation results 125 | test_score = 0.0 126 | for train_index, test_index in kf.split(X_train, Y_train): 127 | nn.fit(X_train[train_index], Y_train[train_index]) 128 | Y_pred = nn.predict(X_train[test_index])[:, 0] 129 | Y_pred_train = nn.predict(X_train[train_index])[:, 0] 130 | predictions[:, i] = nn.predict(X_test)[:, 0] 131 | predictions_test[:, i] = nn.predict_proba(X_test)[:, 1] 132 | predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1] 133 | # current_test_score = f1_score(Y_train[test_index], Y_pred)[:, 0] 134 | # test_score += current_test_score 135 | # print("train: " + str(f1_score(Y_train[train_index], Y_pred_train))) 136 | # print("test: " + str(current_test_score)) 137 | i += 1 138 | # print("CV test score: "+str(test_score/k)) 139 | 140 | # save submission file 141 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 142 | submission = pd.DataFrame(Y_test) 143 | submission.to_csv( 144 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn.csv", 145 | index=True, 146 | index_label="id", 147 | header=["category"] 148 | ) 149 | 150 | # save probabilities for stacking 151 | stacking_logits_test = np.sum(predictions_test, axis=1) 152 | stacking_test = pd.DataFrame(stacking_logits_test) 153 | stacking_test.to_csv( 154 | path_or_buf=path_to_stacking + "nn_test" + ".csv", 155 | index=True, 156 | index_label="id", 157 | header=["category"] 158 | ) 159 | 160 | stacking_train = pd.DataFrame(predictions_train) 161 | 
stacking_train.to_csv( 162 | path_or_buf=path_to_stacking + "nn_train" + ".csv", 163 | index=True, 164 | index_label="id", 165 | header=["category"] 166 | ) 167 | -------------------------------------------------------------------------------- /models/nn_deep.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy 4 | import numpy as np 5 | import pandas as pd 6 | from keras.layers import Dense, Dropout 7 | from keras.models import Sequential 8 | from keras.wrappers.scikit_learn import KerasClassifier 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from models.tools import load_data 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # tuned hyper-parameters 21 | 22 | parameters = { 23 | "n_estimators": 150, 24 | "criterion": "entropy", # default = gini 25 | "max_depth": 15, # 9 26 | "min_samples_leaf": 4, # 10 27 | "bootstrap": True, 28 | "n_jobs": -1 29 | } 30 | 31 | # used features 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | "overlap_title", 36 | "common_author", 37 | "score_1_2", 38 | "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | "jaccard", 43 | "adar", 44 | "preferential_attachment", 45 | "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | # "shortest_path", 50 | "popularity", 51 | # "paths_of_length_one" 52 | # "katz" 53 | # "katz_2" 54 | ] 55 | 56 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 57 | 58 | (X_train, 59 | X_test, 60 | Y_train, 61 | my_features_index, 62 | my_features_dic) = load_data(my_features_string) 63 | 64 | # normalize data 65 | scaler = StandardScaler() 66 | X_train = scaler.fit_transform(X_train) 67 | 
X_test = scaler.transform(X_test)

# Function to create model, required for KerasClassifier.
# FIX: size the input layer from the data actually loaded; load_data silently
# drops any requested feature missing from the feature files, in which case
# len(my_features_string) would not match the number of columns and the
# input layer would be mis-sized.
nb_input = X_train.shape[1]


def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
    """Build a two-hidden-layer binary classifier with a sigmoid output."""
    model = Sequential()
    model.add(Dense(neurons, input_dim=nb_input, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2 * neurons, activation=activation))
    model.add(Dropout(dropout_rate))
    # input_dim on a non-first layer is ignored by Keras, so it is omitted here
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# training parameters
epochs = 30
batch_size = 128

# tuned parameters
dropout_rate = 0.2
neurons = 75

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# instantiate classifier
nn = KerasClassifier(build_fn=create_model,
                     epochs=epochs,
                     batch_size=batch_size,
                     dropout_rate=dropout_rate,
                     neurons=neurons,
                     verbose=1
                     )

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: Neural Network")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate Kfold and predictions placeholder
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold: store hard votes (submission), fold probabilities on the
# test set and out-of-fold probabilities on the train set (stacking)
for train_index, test_index in kf.split(X_train, Y_train):
    nn.fit(X_train[train_index], Y_train[train_index])
    predictions[:, i] = nn.predict(X_test)[:, 0]
    predictions_test[:, i] = nn.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1]
    i += 1

# save submission file: positive when at least 3 of the 5 fold models vote 1
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn_deep.csv",
    index=True,
    index_label="id",
    header=["category"]
)

# save probabilities for stacking
stacking_logits_test = np.sum(predictions_test, axis=1)
stacking_test = pd.DataFrame(stacking_logits_test)
stacking_test.to_csv(
    path_or_buf=path_to_stacking + "nn_deep_test" + ".csv",
    index=True,
    index_label="id",
    header=["category"]
)

stacking_train = pd.DataFrame(predictions_train)
stacking_train.to_csv(
    path_or_buf=path_to_stacking + "nn_deep_train" + ".csv",
    index=True,
    index_label="id",
    header=["category"]
)
"""Random-forest model: 2-fold CV, majority-vote submission and out-of-fold
probabilities for stacking; downstream lines write the CSVs and plot feature
importances."""
import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

from models.tools import f1_score, plot_importance, load_data

# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"
path_to_plots = "plots/"

# tuned hyper-parameters
parameters = {
    "n_estimators": 150,
    "criterion": "entropy",  # default = gini
    "max_depth": 20,
    "min_samples_leaf": 10,
    "bootstrap": True,
    "n_jobs": -1
}

# used features
my_features_string = [
    "date_diff",
    "common_author",
    "cosine_distance",
    "preferential_attachment",
    "in_neighbors",
    "common_neighbors",
    "authors_citation",
]

# acronym of each feature: first letter of every underscore-separated word
my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]

# load data
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(my_features_string)

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: Random Forest")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate classifier
RF = RandomForestClassifier(
    n_estimators=parameters["n_estimators"],
    criterion=parameters["criterion"],
    max_depth=parameters["max_depth"],
    min_samples_leaf=parameters["min_samples_leaf"],
    bootstrap=parameters["bootstrap"],
    n_jobs=parameters["n_jobs"]
)

# instantiate Kfold and predictions placeholder
k = 2
kf = KFold(k)  # KFold ignores the labels passed to split() below
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold store predictions on test set and print validation results
test_score = 0.0
for train_index, test_index in kf.split(X_train, Y_train):
    RF.fit(X_train[train_index], Y_train[train_index])
    Y_pred = RF.predict(X_train[test_index])
    Y_pred_train = RF.predict(X_train[train_index])
    predictions[:, i] = RF.predict(X_test)
    predictions_test[:, i] = RF.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = RF.predict_proba(X_train[test_index])[:, 1]
    current_test_score = f1_score(Y_train[test_index], Y_pred)
    test_score += current_test_score
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(current_test_score))
    i += 1

print("CV test score: " + str(test_score / k))

# save submission file.
# FIX: the majority-vote threshold must scale with the fold count; the
# hard-coded "> 2.5" can never be satisfied by k = 2 binary votes, so the
# submission used to be all zeros.
Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "RF.csv",
    index=True,
    index_label="id",
    header=["category"]
)

# save probabilities for stacking
stacking_logits_test = np.sum(predictions_test, axis=1)
stacking_test = pd.DataFrame(stacking_logits_test)
pd.DataFrame(stacking_logits_test) 122 | stacking_test.to_csv( 123 | path_or_buf=path_to_stacking + "rf_test_2" + ".csv", 124 | index=True, 125 | index_label="id", 126 | header=["category"] 127 | ) 128 | 129 | stacking_train = pd.DataFrame(predictions_train) 130 | stacking_train.to_csv( 131 | path_or_buf=path_to_stacking + "rf_train_2" + ".csv", 132 | index=True, 133 | index_label="id", 134 | header=["category"] 135 | ) 136 | 137 | # plot feature importances 138 | plot_importance(RF, 139 | features_dict=my_features_dic, 140 | features_index=my_features_index, 141 | name='rf_importance') 142 | -------------------------------------------------------------------------------- /models/svm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn import svm 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | from models.tools import f1_score, load_data 10 | 11 | # path 12 | path_to_data = "data/" 13 | path_to_submissions = "submissions/" 14 | path_to_stacking = "stacking/" 15 | path_to_plots = "models/plots" 16 | 17 | # used features 18 | 19 | my_features_string = [ 20 | "date_diff", 21 | "overlap_title", 22 | "common_author", 23 | # "score_1_2", 24 | # "score_2_1", 25 | "cosine_distance", 26 | # "journal_similarity", 27 | # # "overlapping_words_abstract", 28 | # "jaccard", 29 | # "adar", 30 | "preferential_attachment", 31 | # "resource_allocation_index", 32 | "out_neighbors", 33 | "in_neighbors", 34 | "common_neighbors", 35 | # # "shortest_path", 36 | # "popularity", 37 | # # "paths_of_length_one" 38 | # "katz" 39 | # "katz_2" 40 | ] 41 | 42 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 43 | 44 | # load data 45 | 46 | (X_train, 47 | X_test, 48 | Y_train, 49 | my_features_index, 50 | my_features_dic) = 
load_data(my_features_string)

# normalize data -- the SVM objective is sensitive to feature scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# tuned hyperparameters
parameters = {
    'C': 0.1,
    'gamma': 0.01,
    'kernel': "linear"
}

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: SVM")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate classifier (probability=True enables predict_proba for stacking)
svm_classifier = svm.SVC(C=parameters['C'],
                         gamma=parameters['gamma'],
                         kernel=parameters['kernel'],
                         probability=True,
                         verbose=1)

# instantiate Kfold and predictions placeholder
k = 2
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold store predictions on test set and print validation results
for train_index, test_index in kf.split(X_train, Y_train):
    svm_classifier.fit(X_train[train_index], Y_train[train_index])
    Y_pred = svm_classifier.predict(X_train[test_index])
    Y_pred_train = svm_classifier.predict(X_train[train_index])
    predictions[:, i] = svm_classifier.predict(X_test)
    predictions_test[:, i] = svm_classifier.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = svm_classifier.predict_proba(X_train[test_index])[:, 1]
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# save submission file.
# FIX: the majority-vote threshold must scale with the fold count; the
# hard-coded "> 2.5" is unreachable with k = 2 binary votes, so every test
# pair used to be predicted 0.
Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "SVM.csv",
    index=True,
    index_label="id",
    header=["category"]
)
107 | header=["category"] 108 | ) 109 | 110 | # save probabilities for stacking 111 | stacking_logits_test = np.sum(predictions_test, axis=1) 112 | stacking_test = pd.DataFrame(stacking_logits_test) 113 | stacking_test.to_csv( 114 | path_or_buf=path_to_stacking + "svmlinear_test" + ".csv", 115 | index=True, 116 | index_label="id", 117 | header=["category"] 118 | ) 119 | 120 | stacking_train = pd.DataFrame(predictions_train) 121 | stacking_train.to_csv( 122 | path_or_buf=path_to_stacking + "svmlinear_train" + ".csv", 123 | index=True, 124 | index_label="id", 125 | header=["category"] 126 | ) 127 | -------------------------------------------------------------------------------- /models/tools.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | 8 | def binary_error(preds, train_data): 9 | labels = train_data.get_label() 10 | return 'error', np.mean(labels != (preds > 0.5)), False 11 | 12 | 13 | def f1_score_lgbm(preds, train_data): 14 | labels = train_data.get_label() 15 | tp = np.sum(labels[labels == 1] == (preds[labels == 1] > 0.5)) 16 | tn = np.sum(labels[labels == 0] == (preds[labels == 0] > 0.5)) 17 | fp = np.sum(labels[labels == 1] != (preds[labels == 1] > 0.5)) 18 | fn = np.sum(labels[labels == 0] != (preds[labels == 0] > 0.5)) 19 | p = tp / (tp + fp) 20 | r = tp / (tp + fn) 21 | 22 | return 'f1 score', 2 * p * r / (p + r), False 23 | 24 | 25 | def f1_score(preds, labels): 26 | tp = np.sum(labels[labels == 1] == preds[labels == 1]) 27 | tn = np.sum(labels[labels == 0] == preds[labels == 0]) 28 | fp = np.sum(labels[labels == 1] != preds[labels == 1]) 29 | fn = np.sum(labels[labels == 0] != preds[labels == 0]) 30 | p = tp / (tp + fp) 31 | r = tp / (tp + fn) 32 | 33 | return 2 * p * r / (p + r) 34 | 35 | 36 | def load_data(my_features_string): 37 | # path 38 | path_to_data = "data/" 39 | 
40 | # feature tracking utils 41 | my_features_index = [] 42 | my_features_dic = {} 43 | 44 | # load raw data 45 | training = pd.read_csv(path_to_data + "training_features.txt") 46 | testing = pd.read_csv(path_to_data + "testing_features.txt") 47 | 48 | del training["my_index"] 49 | del testing["my_index"] 50 | 51 | # track features and target 52 | target = 0 53 | for i in range(len(training.columns)): 54 | if training.columns[i] == "target": 55 | target = i 56 | 57 | Y_train = training.values[:, target].astype(int) 58 | 59 | del training["target"] 60 | 61 | for i in range(len(training.columns)): 62 | if training.columns[i] in my_features_string: 63 | my_features_dic.update({i: training.columns[i]}) 64 | my_features_index.append(i) 65 | 66 | # separating features and labels 67 | training_val = training.values 68 | testing_val = testing.values 69 | X_train = training_val[:, my_features_index].astype(float) 70 | X_test = testing_val[:, my_features_index] 71 | 72 | del training_val 73 | del testing_val 74 | 75 | print(training.head()) 76 | print(testing.head()) 77 | 78 | return X_train, X_test, Y_train, my_features_index, my_features_dic 79 | 80 | 81 | # plotting feature importances 82 | def plot_importance(rf, features_dict, features_index, name): 83 | # plot settings 84 | sns.set_style("darkgrid") 85 | mpl.rcParams['figure.dpi'] = 200 86 | # mpl.rcParams['figure.tight_layout'] = True 87 | path_to_plot = "models/plots/" 88 | 89 | # fetch mean importances 90 | importances = rf.feature_importances_ 91 | # compute std using each estimator in the forest 92 | std = np.std([tree.feature_importances_ for tree in rf.estimators_], 93 | axis=0) 94 | # argsort the values 95 | index = list(map(int, np.argsort(importances)[::-1])) 96 | # Plot the feature importances of the rf 97 | plt.figure() 98 | # get axis 99 | fig, ax = plt.subplots(figsize=(6, 3)) 100 | # add space for x labels 101 | plt.subplots_adjust(bottom=0.30) 102 | plt.title("Feature importances") 103 | # get number 
class ObjectiveFunction:
    """Wrap an objective so every evaluation is recorded for later analysis.

    The histories are stored negated, which is the convention the tuning
    plots in this package expect.
    """

    def __init__(self, func):
        self.f = func
        self.history_f = []        # every evaluated value, negated
        self.history_fbest = None  # lowest raw value seen so far
        self.history_bests = []    # negated running best, one entry per call

    def __call__(self, x):
        val = self.f(x)
        self.history_f.append(-val)
        # first evaluation always counts as an improvement
        improved = self.history_fbest is None or val < self.history_fbest
        if improved:
            self.history_fbest = val
        self.history_bests.append(-self.history_fbest)
        return val
# Helpers building the x-axis tick labels for the grid-search plots: one
# label per hyper-parameter combination, in the same nested order that
# GridSearchCV enumerates the grid.
def triple_ticker_grid(tuned_parameters, parameter_1, parameter_2, parameter_3):
    """Return str((p1, p2, p3)) labels for every combination, in grid order."""
    return [
        str((a, b, c))
        for a in tuned_parameters[parameter_1]
        for b in tuned_parameters[parameter_2]
        for c in tuned_parameters[parameter_3]
    ]


def double_ticker_grid(tuned_parameters, parameter_1, parameter_2):
    """Return str((p1, p2)) labels for every combination, in grid order."""
    return [
        str((a, b))
        for a in tuned_parameters[parameter_1]
        for b in tuned_parameters[parameter_2]
    ]
accuracy', color='purple') 44 | ax1.plot(x, list(metrics['mean_test_f1']), '-', 45 | label='mean test f1', color='orange') 46 | y_13 = np.arange(0.9, 1.05, 0.05) 47 | x_13 = np.repeat(index, len(y_13)) 48 | 49 | ax1.plot(x_13, y_13, '-.', color='pink', lw=2.0) 50 | plt.ylim([0.95, 1.0]) 51 | ax1.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=7) 52 | ax1.set_ylabel('Test metrics') 53 | plt.subplots_adjust(left=0.40, bottom=0.15) 54 | plt.title(" ".join(param_names) + " Grid Search") 55 | 56 | # Setting the labels for the x-axis (gridsearch combination) 57 | # x_ticks_labels = double_parameter_cross_validation(params, 58 | # 'max_depth', 59 | # 'min_samples_leaf', 60 | # 'n_estimators') 61 | # Set number of ticks for x-axis 62 | ax1.set_xticks([]) 63 | 64 | # Set ticks labels for x-axis 65 | # ax1.set_xticklabels(x_ticks_labels, rotation=70, fontsize=6); 66 | 67 | # For the train set 68 | ax2.plot(x, list(metrics['mean_train_precision']), '--', 69 | label='mean train precision', color='c') 70 | ax2.plot(x, list(metrics['mean_train_recall']), '-.', 71 | label='mean train recall', color='m') 72 | ax2.plot(x, list(metrics['mean_train_roc_auc']), '-o', 73 | label='mean train roc auc', color='y') 74 | ax2.plot(x, list(metrics['mean_train_accuracy']), '-*', 75 | label='mean train accuracy', color='k') 76 | ax2.plot(x, list(metrics['mean_train_f1']), '-', 77 | label='mean train f1', color='orange') 78 | ax2.plot(x_13, y_13, '-.', color='pink', lw=2.0) 79 | ax2.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=6) 80 | plt.ylim([0.95, 1.0]) 81 | if len(param_names) == 2: 82 | x_ticks_labels = double_ticker_grid(params, 83 | param_names[0], 84 | param_names[1]) 85 | if len(param_names) == 3: 86 | x_ticks_labels = triple_ticker_grid(params, 87 | param_names[0], 88 | param_names[1], 89 | param_names[2]) 90 | # Set number of ticks for x-axis 91 | ax2.set_xticks(x) 92 | # Set ticks labels for x-axis 93 | ax2.set_xticklabels(x_ticks_labels, 
rotation=70, fontsize=7, ha='right') 94 | ax2.set_ylabel('Train metrics') 95 | plt.savefig(path_to_plot + name) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /models/tuning/tuning_lgbm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from lightgbm import LGBMClassifier 4 | from sklearn.model_selection import GridSearchCV 5 | 6 | from models.tools import load_data 7 | from models.tuning.tools import plot_grid 8 | 9 | # deactivate deprecation warnings 10 | warnings.simplefilter("ignore", DeprecationWarning) 11 | 12 | n_jobs = 2 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # used features 21 | 22 | my_features_string = [ 23 | "date_diff", 24 | "overlap_title", 25 | "common_author", 26 | "score_1_2", 27 | "score_2_1", 28 | "cosine_distance", 29 | "journal_similarity", 30 | "overlapping_words_abstract", 31 | "jaccard", 32 | "adar", 33 | "preferential_attachment", 34 | "resource_allocation_index", 35 | "out_neighbors", 36 | "in_neighbors", 37 | "common_neighbors", 38 | # "shortest_path", 39 | "popularity", 40 | "common_successors", 41 | "common_predecessors", 42 | # "paths_of_length_one", 43 | "authors_citation" 44 | "coauthor_score" 45 | # # "katz" 46 | # # "katz_2" 47 | ] 48 | 49 | # load data 50 | 51 | (X_train, 52 | X_test, 53 | Y_train, 54 | my_features_index, 55 | my_features_dic) = load_data(my_features_string) 56 | 57 | # GridSearchCV 58 | 59 | # param grid 60 | 61 | tuned_parameters = { 62 | # 'metric': {}, 63 | 'num_leaves': [150, 200, 250], 64 | "min_data_in_leaf": [2, 4, 6], 65 | "max_depth": [150, 200, 250] 66 | } 67 | 68 | # tuning 69 | gbm = LGBMClassifier( 70 | boosting_type='gbdt', 71 | objective='binary', 72 | # 'metric': {}, 73 | learning_rate=0.1, 74 | feature_fraction=0.4, 75 | bagging_fraction=0.6, 76 | 
# LightGBM classifier with the hyper-parameters that stay fixed during the
# search (the searched ones come from `tuned_parameters`).
gbm = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     learning_rate=0.1,
                     feature_fraction=0.4,
                     bagging_fraction=0.6,
                     bagging_freq=5,
                     silent=True)

# 5-fold grid search scored on several metrics; the refitted estimator is the
# one maximising f1.
metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
grid_lgbm = GridSearchCV(gbm,
                         param_grid=tuned_parameters,
                         scoring=metrics,
                         refit='f1',
                         cv=5,
                         n_jobs=n_jobs)
grid_lgbm.fit(X_train, Y_train, verbose=-1)
print("GridSearch best parameters", grid_lgbm.best_params_)

# Visualise the cross-validated metrics across the whole grid.
best_params = grid_lgbm.best_params_
results = grid_lgbm.cv_results_
index = grid_lgbm.best_index_
plot_grid(metrics=results,
          params=tuned_parameters,
          index=index,
          param_names=list(tuned_parameters),
          name="grid_lgbm")
def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
    """Build a one-hidden-layer binary classifier for KerasClassifier.

    Parameters
    ----------
    neurons : int
        Width of the hidden layer.
    dropout_rate : float
        Dropout applied after the hidden layer.
    activation : str
        Activation of the hidden layer.

    Returns
    -------
    A compiled keras Sequential model with a sigmoid output for binary
    cross-entropy training.
    """
    model = Sequential()
    # Hidden layer; input_dim ties the network to the number of features
    # (module-level nb_input).
    model.add(Dense(neurons, input_dim=nb_input, activation=activation))
    model.add(Dropout(dropout_rate))
    # Output layer.  FIX: dropped the redundant input_dim argument — Keras
    # only honours input_dim on the first layer, so passing it here was
    # ignored and merely misleading.
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
"""Grid-search tuning of a RandomForestClassifier on the link-prediction features."""
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from models.tools import load_data

# output locations
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"
path_to_plots = "plots/"

# hyper-parameters kept fixed during the search
parameters = {
    "criterion": "entropy",  # default = gini
    "bootstrap": True,
    "n_jobs": -1
}

# features fed to the model; commented entries are deliberately disabled
my_features_string = [
    "date_diff",
    "overlap_title",
    "common_author",
    # "score_1_2",
    # "score_2_1",
    "cosine_distance",
    # "journal_similarity",
    # "overlapping_words_abstract",
    # "jaccard",
    # "adar",
    "preferential_attachment",
    # "resource_allocation_index",
    "out_neighbors",
    "in_neighbors",
    "common_neighbors",
    "shortest_path",
    "popularity",
    "common_successors",
    "common_predecessors",
    "paths_of_length_one"
    # "katz"
    # "katz_2"
]

# load the engineered feature matrices
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(my_features_string)

# hyper-parameter grid explored by the search
tuned_parameters = {
    "n_estimators": [150],
    "max_depth": [3, 6, 9, 12, 15, 20],
    "min_samples_leaf": [3, 5, 10, 20]
}

# base estimator built from the fixed parameters
rf = RandomForestClassifier(**parameters)

# 5-fold search over several metrics, refitting on the best f1 configuration
metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
grid_RF = GridSearchCV(rf,
                       param_grid=tuned_parameters,
                       scoring=metrics,
                       refit='f1',
                       cv=5,
                       n_jobs=-1,
                       verbose=10)
grid_RF.fit(X_train, Y_train)
print("GridSearch best parameters", grid_RF.best_params_)
# function to optimize (too costly --> feature selection, subsampling)
def objective_svm(x):
    """Summed 5-fold cross-validated f1 of an RBF SVM at x = [sqrt(C), sqrt(gamma)].

    The two search coordinates are squared so that C and gamma stay positive
    whatever the optimiser proposes.

    NOTE(review): skopt's gp_minimize minimises its objective while a larger
    f1 is better; this function returns a positive sum, so it assumes the
    ObjectiveFunction wrapper flips the sign (the caller prints -res.fun) —
    confirm against objective_function.py.  Also note the folds are summed,
    not averaged.
    """
    c_val = x[0] ** 2
    gamma_val = x[1] ** 2
    classifier = SVC(C=c_val, cache_size=200,
                     class_weight=None,
                     coef0=0.0,
                     decision_function_shape='ovr',
                     degree=3, gamma=gamma_val,
                     kernel='rbf',
                     max_iter=-1,
                     probability=False,
                     random_state=None,
                     shrinking=True,
                     tol=0.001,
                     verbose=False)
    # accumulate f1 over 5 stratified folds
    total = 0
    for fit_idx, val_idx in StratifiedKFold(5).split(X_train, Y_train):
        classifier.fit(X_train[fit_idx], Y_train[fit_idx])
        total += f1_score(Y_train[val_idx], classifier.predict(X_train[val_idx]))
    return total
"""Grid search over dimensionality reduction + SVM on a subsample of the data."""
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.utils import resample

from models.tools import load_data

# paths
path_to_data = "data/"
path_to_plots = "models/tuning/plots/"

# features fed to the model; commented entries are deliberately disabled
my_features_string = [
    "date_diff",
    # "overlap_title",
    "common_author",
    # # "score_1_2",
    # # "score_2_1",
    "cosine_distance",
    # "journal_similarity",
    # # "overlapping_words_abstract",
    # # "jaccard",
    # # "adar",
    "preferential_attachment",
    # # "resource_allocation_index",
    # "out_neighbors",
    "in_neighbors",
    "common_neighbors",
    # "shortest_path",
    # "popularity",
    # "common_successors",
    # "common_predecessors",
    # "paths_of_length_one",
    "authors_citation"
    # "coauthor_score"
    # # "katz"
    # # "katz_2"
]

# load the engineered feature matrices
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(X_train_sub_features := my_features_string) if False else load_data(my_features_string)

# the full SVM fit is too costly: tune on a 500-row subsample
X_train_sub, Y_train_sub = resample(X_train, Y_train, n_samples=500, random_state=42)
print(X_train_sub.shape, Y_train_sub.shape)

# pipeline: dimensionality reduction followed by an SVM classifier
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classif', SVC(gamma=0.01))
])

# candidate values
nb_features = [2, 4]
Cs = [0.001, 0.01, 0.1]
kernels = ['linear', 'rbf']

# one sub-grid per reduction strategy (PCA vs mutual-information KBest)
param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': nb_features,
        'classif__C': Cs,
        'classif__kernel': kernels
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_classif)],
        'reduce_dim__k': nb_features,
        'classif__C': Cs,
        'classif__kernel': kernels
    }
]

# 4-fold grid search on the subsample
grid = GridSearchCV(pipe, cv=4, n_jobs=2, param_grid=param_grid, verbose=10)
grid.fit(X_train_sub, Y_train_sub)

# report the winning configuration
print(grid.best_params_)
Par exemple les 10 auteurs les plus cités par les auteurs du texte: à faire 18 | - représentation word2vec des abstracts ? 19 | - représentations sous formes de graphes des textes et essayer d'en extraire des features 20 | 21 | 22 | à faire: 23 | relire les cours pour trouver tout ce qui pourrait nous servir. 24 | 25 | brainstorming data exploration: 26 | - nombre d'auteurs différents: check 27 | - nombre d'apparitions d'un auteur dans la base de donnée: check 28 | - les mots les plus fréquents dans les abstracts: pas encore 29 | - distribution du nombre d'overlapping words chez les textes qui ne se citent pas et chez les textes qui se citent: check 30 | - faire un tf-idf sur la base de donnée entière et voir les résultats de ça: à faire aussi 31 | - combien de journaux différents ? 32 | - combien d'auteurs manquants ? 33 | 34 | subsampler pour l'exploration, les premiers tests ? 35 | 36 | brainstorming recherche d'articles: 37 | - checker dans les cours les articles qui sont cités 38 | - demander à des gens ? 39 | - faire des recherches en ligne 40 | 41 | 42 | 43 | Références à aller checker: 44 | • Christopher D. Manning, Prabhakar Raghavan and Hinrich 45 | Schütze, Introduction to Information Retrieval, Cambridge 46 | University Press. 2008. http://www-nlp.stanford.edu/IR-book/ 47 | • “Indexing by Latent Semantic Analysis”, S.Deerwester, 48 | S.Dumais, T.Landauer, G.Fumas, R.Harshman, Journal of the 49 | Society for Information Science, 1990 50 | • “Mining the Web: Discovering Knowledge from Hypertext 51 | Data”, Soumen Chakrabarti 52 | 53 | 54 | 55 | Résultats: 56 | Random Forest avec les paramètres de base: 57 | Avec 10 estimateurs j'ai peu d'overfitting et un score de 0.85. Avec 30 estimateurs j'ai pas franchement plus d'overfitting mais les résultats ne s'améliorent pas. 58 | Si je rajoute cosine distance avec 30 estimateurs j'ai masse overfitting et les résultats qui baissent. Avec 10 estimateurs ça overfitte encore pas mal. 
Donc faudrait réussir à réduire cet overfitting 59 | Si je rajoute les deux score en plus, on reste sur de l'overfitting de gros porc. 60 | 61 | Light GBM: 62 | pas d'overfitting sur les features de base. Léger overfitting avec la cosine distance mais les résultats sont meilleurs, genre 0.87. Ensuite rajouter les deux scores n'améliore pas vraiment les résultats. 63 | 64 | Avec shortest path on arrive à 94.5/93.8 (train/test). 65 | En LGBM on a des résultats un peu meilleurs et avec moins d'overfitting. 66 | 67 | On passe à 94.1/94.1 si on rajoute cosine distance et l'ajout de cosine distance a un intérêt très limité... 68 | 69 | LGBM avec les basics et shortest path: 92.9/92.8 70 | LGBM avec shortest path et overlapping: 94.2/93.9 71 | RF avec shortest path et overlapping: 94.5/93.7 72 | LGBM avec shortest path et cosine distance: 94.2/93.9 73 | 74 | 75 | Un papier qui fait de la link prediction (coauthorship) 76 | http://www.cs.rpi.edu/~zaki/PaperDir/LINK06.pdf 77 | Un article qui donne des bonnées idées sur la théorie des graphes: 78 | http://be.amazd.com/link-prediction/ 79 | 80 | Une thèse sur les graphes dirigés: 81 | https://www.cs.upc.edu/~dariog/PhD-Thesis-Link-Prediction-DGG.pdf 82 | 83 | 84 | relecture de code le 1er Mars: 85 | _ je change un peu le format du code. Maintenant c'est en mode projet donc tu dois ouvrir tout le bordel sous pycharm et changer les paramètres pour que le working directory ça soit toujours la source du projet 86 | - preprocessing. done. 87 | - pour moi dans author_graph_features il y a un soucis. Le même que ce qu'on avait déjà eu avant, il faudrait supprimer les arrêtes qui existent si target == 1. à corriger ou à jeter... Si tu le corriges je peux le faire tourner sur Compute Engine. 
88 | - baseline: ok 89 | - citation_graph_features: ok 90 | - network_x bigraph: ok 91 | - network_x digraph: ok 92 | - en train de faire le network_x bigraph_long pour calculer katz 93 | 94 | nohup python3 -u task_manager.py > log.txt 2>&1 & 95 | 96 | https://drive.google.com/file/d/1RetpAekytXLNwQLUfJhHxamGHcOd_7j8/view?usp=sharing 97 | 98 | 17663 sur le cloud 99 | 100 | 371 460 dans le dernier tail log.txt à 22H35. 101 | 102 | 103 | -------------------------------------------------------------------------------- /ressources/data_challenge_description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/ressources/data_challenge_description.pdf -------------------------------------------------------------------------------- /results/results: -------------------------------------------------------------------------------- 1 | date: 2018-02-15 00:26:26.153189 2 | features: ['overlap_title', 'date_diff', 'common_author'] 3 | model: Random Forest 4 | parameters: default 5 | cross validation: 6 | 0.7780963908271935 7 | 0.7784294452612852 8 | 0.776469919253952 9 | 0.7771685269126416 10 | 0.7766730028756641 11 | kaggle score: 0.77710 12 | 13 | 14 | date: 2018-02-15 00:26:26.153189 15 | features: ['overlap_title', 'date_diff', 'common_author', "journal_similarity"] 16 | model: Random Forest 17 | parameters: default 18 | cross validation: 19 | 0.7797616629976524 20 | 0.7810288945029772 21 | 0.778419522022388 22 | 0.7788338126106805 23 | 0.779093759646472 24 | kaggle score: 0.77904 25 | 26 | 27 | date: 2018-02-15 00:30:45.924858 28 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract'] 29 | model: Random Forest 30 | parameters: default 31 | cross validation: 32 | 0.8342363711688586 33 | 0.8365596289286208 34 | 0.8348442754788712 35 | 0.8360140371399327 36 | 0.8359328036912479 37 | kaggle 
score: 0.83755 38 | 39 | 40 | date: 2018-02-15 15:09:52.437552 41 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance'] 42 | model: Random Forest 43 | parameters: 44 | {'min_data_in_leaf': 2, 'max_depth': 200, 'boosting_type': 'gbdt', 'objective': 'binary', 'task': 'train', 'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'learning_rate': 0.1, 'bagging_freq': 5, 'num_leaves': 200} 45 | cross validation: 46 | Start training... 47 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 48 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 49 | [40] valid_0's f1 score: 0.870444 50 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 51 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 52 | [80] valid_0's f1 score: 0.871552 53 | train: 0.874026271431 54 | test: 0.871902069297 55 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 56 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 57 | [40] valid_0's f1 score: 0.872282 58 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 59 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 60 | [80] valid_0's f1 score: 0.872716 61 | train: 0.873581441765 62 | test: 0.872562595906 63 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 64 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 65 | [40] valid_0's f1 score: 0.87105 66 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 67 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 68 | [80] valid_0's f1 score: 0.872016 69 | train: 0.873792318185 70 | test: 0.872204161004 71 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 72 | [LightGBM] [Warning] No further splits 
with positive gain, best gain: -inf 73 | [40] valid_0's f1 score: 0.871357 74 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 75 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 76 | [80] valid_0's f1 score: 0.872234 77 | train: 0.873583781681 78 | test: 0.872538922426 79 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 80 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 81 | [40] valid_0's f1 score: 0.869521 82 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 83 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 84 | [80] valid_0's f1 score: 0.870304 85 | train: 0.874169648139 86 | test: 0.870220013444 87 | kaggle score: 88 | 89 | -------------------------------------------------------------------------------- /sampling/sampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | path_to_data = "~/Documents/polytechnique/3A/nlp/link-prediction/data/" 4 | 5 | divide_by = 100 6 | sample_size_string = str(divide_by) 7 | 8 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"] 9 | nodes = pd.read_csv(path_to_data+"node_information.csv", names=nodes_header) 10 | 11 | names = ["id1", "id2", "target"] 12 | training = pd.read_csv(path_to_data+"training_set.txt", names=names, delimiter=" ") 13 | 14 | sample_1 = training.sample(frac=1.0/divide_by, replace=False) 15 | sample_2 = sample_1.copy() 16 | sample_2.columns = ["id2", "id1", "target"] 17 | 18 | names = ["id1", "id2"] 19 | testing = pd.read_csv(path_to_data+"testing_set.txt", names=names, delimiter=" ") 20 | 21 | sample_1_testing = testing.sample(frac=1.0/divide_by, replace=False) 22 | 23 | sample_2_testing = sample_1_testing.copy() 24 | sample_2_testing.columns = ["id2", "id1"] 25 | 26 | all_ids = pd.concat([sample_1, sample_2, sample_1_testing, sample_2_testing]) 27 | 
"""Stack the base-model CV predictions with a random forest and write a submission."""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"

# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: the comparison used 'svm_linear' (with an underscore), which
    # never matches the 'svmlinear' entry above — the SVM test predictions
    # were therefore scaled by the 5-fold weight 0.2 instead of the intended
    # 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# second-level model fitted on top of the base-model predictions
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    min_samples_leaf=6,
    max_depth=7,
    bootstrap=True,
    n_jobs=-1
)

# cross validated predictions: one test-set prediction column per fold
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
i = 0

for train_index, test_index in kf.split(X_train, Y_train):
    model.fit(X_train[train_index], Y_train[train_index])
    Y_pred = model.predict(X_train[test_index])
    Y_pred_train = model.predict(X_train[train_index])
    predictions[:, i] = model.predict(X_test)
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# majority vote over the k fold predictions (>2.5 of 5 folds predict 1)
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "stack_sub_rf.csv",
    index=True,
    index_label="id",
    header=["category"]
)
# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"

# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: 'svm_linear' (underscore) never matched the 'svmlinear' entry
    # above, so the SVM test predictions always received the 5-fold weight
    # 0.2 instead of the intended 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# fit a grid searched model on top of the base models
# (the earlier logistic-regression-only param_grid dict was dead code —
# immediately overwritten by the list below — and has been removed)

# pipeline architecture
pipe = Pipeline([
    ('reduce_dim', SelectKBest(chi2)),
    ('classif', LogisticRegression())
])

# candidate parameter values
nb_features = [2, 3, 4, 5]
C = [0.001, 0.01, 0.1]
n_estimators = [100, 200]
max_depth = [10, 20]
min_samples_leaf = [20]
penalty = ["l2", "l1"]

# parameter grid: one sub-grid per candidate classifier
param_grid = [
    {
        'reduce_dim__k': [5],
        'classif': [RandomForestClassifier(bootstrap=True, n_jobs=-1)],
        'classif__n_estimators': n_estimators,
        'classif__max_depth': max_depth,
        'classif__min_samples_leaf': min_samples_leaf,
    },
    {
        'reduce_dim__k': nb_features,
        'classif': [LogisticRegression(n_jobs=-1)],
        'classif__C': C,
        'classif__penalty': penalty
    }
]

# cross validation grid search instance
grid = GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=param_grid, verbose=10)

# fit grid
grid.fit(X_train, Y_train)

# print and keep the best params
print(grid.best_params_)
parameters = grid.best_params_

# model instance for prediction
model = grid.best_estimator_

# cross validated predictions: one test-set prediction column per fold
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
i = 0

for train_index, test_index in kf.split(X_train, Y_train):
    model.fit(X_train[train_index], Y_train[train_index])
    Y_pred = model.predict(X_train[test_index])
    Y_pred_train = model.predict(X_train[train_index])
    predictions[:, i] = model.predict(X_test)
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# majority vote over the k fold predictions
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "stack_sub2.csv",
    index=True,
    index_label="id",
    header=["category"]
)
# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: 'svm_linear' (underscore) never matched the 'svmlinear' entry
    # above, so the SVM test predictions always received the 5-fold weight
    # 0.2 instead of the intended 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# GridSearchCV to fine tune the stacking random forest

# hyper-parameter grid
tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3, 7, 10],
    "min_samples_leaf": [6],
    "criterion": ["entropy"]
}

# fit a GridSearchCV instance and print optimal parameters
rf = RandomForestClassifier(
    bootstrap=True,
    n_jobs=-1
)

metrics = ["f1"]
grid_RF = GridSearchCV(rf,
                       param_grid=tuned_parameters,
                       scoring=metrics,
                       refit='f1',
                       cv=4,
                       n_jobs=-1,
                       verbose=10
                       )
grid_RF.fit(X_train, Y_train)
print("GridSearch best parameters", grid_RF.best_params_)
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_author_graph_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ImportError", 10 | "evalue": "No module named 'code.feature_engineering'; 'code' is not a package", 11 | "traceback": [ 12 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 13 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 14 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0migraph\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_engineering\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtools\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mlit_eval_nan_proof\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# progress bar for pandas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 15 | "\u001b[0;31mImportError\u001b[0m: No module named 'code.feature_engineering'; 'code' is not a package" 16 | ], 17 | "output_type": "error" 18 | } 19 | ], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "from tqdm import tqdm\n", 24 | "from itertools import permutations\n", 25 | "import igraph\n", 26 | "\n", 27 | "from code.feature_engineering.tools import lit_eval_nan_proof\n", 28 | "\n", 29 | "# progress bar for pandas\n", 30 | "tqdm.pandas(tqdm())\n", 31 | "\n", 32 | "# path\n", 33 | 
"path_to_data = \"../../data/\"\n", 34 | "\n", 35 | "# loading data\n", 36 | "converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,\n", 37 | " 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}\n", 38 | "nodes = pd.read_csv(path_to_data + \"nodes_preprocessed.csv\", converters=converter_dict)\n", 39 | "nodes.set_index(\"id\", inplace=True)\n", 40 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 41 | "training.set_index(\"my_index\", inplace=True)\n", 42 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 43 | "testing.set_index(\"my_index\", inplace=True)\n", 44 | "\n", 45 | "# create author graph\n", 46 | "# vertices are authors\n", 47 | "# edge of weight 1 if they co-wrote a paper, 2 if they only cite each other\n", 48 | "\n", 49 | "# create empty directed graph\n", 50 | "g = igraph.Graph(directed=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# add vertices\n", 60 | "authors = nodes['authors']\n", 61 | "authors_set = list(set(authors.dropna().sum()))\n", 62 | "g.add_vertices(authors_set)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.5.2" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /tests/test_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [ 
8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
my_indexid1id2targetoverlap_titledate_diffcommon_authorscore_1_2score_2_1cosine_distancejaccardadarpreferential_attachmentresource_allocation_indexcommon_neighbors
09510123|950211495101239502114120017.84439214.5359350.0757910.0666670.51389855.00.1428570.142857
19707075|960417897070759604178111019.41518424.2968500.0824500.0980394.32036611388.00.2264010.226401
29312155|95061429312155950614200-2015.11603710.0801940.0184020.0000000.0000005.00.0000000.000000
39911255|302165991125530216500-4016.76577020.2959040.0582450.0000000.000000280.00.0000000.000000
49701033|209076970103320907600-5021.45780925.2408190.0690250.0000000.000000168.00.0000000.000000
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " my_index id1 id2 target overlap_title date_diff \\\n", 143 | "0 9510123|9502114 9510123 9502114 1 2 0 \n", 144 | "1 9707075|9604178 9707075 9604178 1 1 1 \n", 145 | "2 9312155|9506142 9312155 9506142 0 0 -2 \n", 146 | "3 9911255|302165 9911255 302165 0 0 -4 \n", 147 | "4 9701033|209076 9701033 209076 0 0 -5 \n", 148 | "\n", 149 | " common_author score_1_2 score_2_1 cosine_distance jaccard adar \\\n", 150 | "0 0 17.844392 14.535935 0.075791 0.066667 0.513898 \n", 151 | "1 0 19.415184 24.296850 0.082450 0.098039 4.320366 \n", 152 | "2 0 15.116037 10.080194 0.018402 0.000000 0.000000 \n", 153 | "3 0 16.765770 20.295904 0.058245 0.000000 0.000000 \n", 154 | "4 0 21.457809 25.240819 0.069025 0.000000 0.000000 \n", 155 | "\n", 156 | " preferential_attachment resource_allocation_index common_neighbors \n", 157 | "0 55.0 0.142857 0.142857 \n", 158 | "1 11388.0 0.226401 0.226401 \n", 159 | "2 5.0 0.000000 0.000000 \n", 160 | "3 280.0 0.000000 0.000000 \n", 161 | "4 168.0 0.000000 0.000000 " 162 | ] 163 | }, 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "import numpy as np\n", 171 | "import pandas as pd\n", 172 | "from tqdm import tqdm\n", 173 | "\n", 174 | "path_to_data = \"../data/\"\n", 175 | "training = pd.read_csv(path_to_data+\"training_features.txt\")\n", 176 | "training.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/html": [ 187 | "
\n", 188 | "\n", 201 | "\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | "
id1id2targetoverlap_titledate_diffcommon_author
count6.155120e+056.155120e+05615512.000000615512.000000615512.000000615512.000000
mean5.317422e+066.798460e+060.5444740.5184161.1566810.079396
std4.749198e+064.343138e+060.4980190.9071133.5216910.372206
min1.001000e+031.001000e+030.0000000.000000-11.0000000.000000
25%1.112660e+052.080790e+050.0000000.0000000.0000000.000000
50%9.310036e+069.505058e+061.0000000.0000001.0000000.000000
75%9.708050e+069.709097e+061.0000001.0000003.0000000.000000
max9.912293e+069.912293e+061.00000010.00000011.0000008.000000
\n", 288 | "
" 289 | ], 290 | "text/plain": [ 291 | " id1 id2 target overlap_title \\\n", 292 | "count 6.155120e+05 6.155120e+05 615512.000000 615512.000000 \n", 293 | "mean 5.317422e+06 6.798460e+06 0.544474 0.518416 \n", 294 | "std 4.749198e+06 4.343138e+06 0.498019 0.907113 \n", 295 | "min 1.001000e+03 1.001000e+03 0.000000 0.000000 \n", 296 | "25% 1.112660e+05 2.080790e+05 0.000000 0.000000 \n", 297 | "50% 9.310036e+06 9.505058e+06 1.000000 0.000000 \n", 298 | "75% 9.708050e+06 9.709097e+06 1.000000 1.000000 \n", 299 | "max 9.912293e+06 9.912293e+06 1.000000 10.000000 \n", 300 | "\n", 301 | " date_diff common_author \n", 302 | "count 615512.000000 615512.000000 \n", 303 | "mean 1.156681 0.079396 \n", 304 | "std 3.521691 0.372206 \n", 305 | "min -11.000000 0.000000 \n", 306 | "25% 0.000000 0.000000 \n", 307 | "50% 1.000000 0.000000 \n", 308 | "75% 3.000000 0.000000 \n", 309 | "max 11.000000 8.000000 " 310 | ] 311 | }, 312 | "execution_count": 7, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "training.describe()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.5.2" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /tests/test_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | "
idyeartitleauthorsjournalabstract
010012000compactification geometry and dualityPaul S. AspinwallNaNthese are notes based on lectures given at tas...
110022000domain walls and massive gauged supergravity p...M. Cvetic, H. Lu, C.N. PopeClass.Quant.Grav.we point out that massive gauged supergravity ...
210032000comment on metric fluctuations in brane worldsY.S. Myung, Gungwon KangNaNrecently ivanov and volovich hep-th 9912242 cl...
310042000moving mirrors and thermodynamic paradoxesAdam D. HelferPhys.Rev.quantum fields responding to moving mirrors ha...
410052000bundles of chiral blocks and boundary conditio...J. Fuchs, C. SchweigertNaNproceedings of lie iii clausthal july 1999 var...
\n", 85 | "
" 86 | ], 87 | "text/plain": [ 88 | " id year title \\\n", 89 | "0 1001 2000 compactification geometry and duality \n", 90 | "1 1002 2000 domain walls and massive gauged supergravity p... \n", 91 | "2 1003 2000 comment on metric fluctuations in brane worlds \n", 92 | "3 1004 2000 moving mirrors and thermodynamic paradoxes \n", 93 | "4 1005 2000 bundles of chiral blocks and boundary conditio... \n", 94 | "\n", 95 | " authors journal \\\n", 96 | "0 Paul S. Aspinwall NaN \n", 97 | "1 M. Cvetic, H. Lu, C.N. Pope Class.Quant.Grav. \n", 98 | "2 Y.S. Myung, Gungwon Kang NaN \n", 99 | "3 Adam D. Helfer Phys.Rev. \n", 100 | "4 J. Fuchs, C. Schweigert NaN \n", 101 | "\n", 102 | " abstract \n", 103 | "0 these are notes based on lectures given at tas... \n", 104 | "1 we point out that massive gauged supergravity ... \n", 105 | "2 recently ivanov and volovich hep-th 9912242 cl... \n", 106 | "3 quantum fields responding to moving mirrors ha... \n", 107 | "4 proceedings of lie iii clausthal july 1999 var... " 108 | ] 109 | }, 110 | "execution_count": 1, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "import numpy as np\n", 117 | "import pandas as pd\n", 118 | "from tqdm import tqdm\n", 119 | "\n", 120 | "path_to_data = \"../../data/\"\n", 121 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n", 122 | "nodes = pd.read_csv(path_to_data+\"node_information.csv\", names=nodes_header)\n", 123 | "nodes.head()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": [ 134 | "
\n", 135 | "\n", 148 | "\n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
idyear
count2.777000e+0427770.000000
mean6.096134e+061998.009039
std4.581677e+063.124684
min1.001000e+031992.000000
25%2.041122e+051995.000000
50%9.405182e+061998.000000
75%9.705204e+062001.000000
max9.912293e+062003.000000
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " id year\n", 203 | "count 2.777000e+04 27770.000000\n", 204 | "mean 6.096134e+06 1998.009039\n", 205 | "std 4.581677e+06 3.124684\n", 206 | "min 1.001000e+03 1992.000000\n", 207 | "25% 2.041122e+05 1995.000000\n", 208 | "50% 9.405182e+06 1998.000000\n", 209 | "75% 9.705204e+06 2001.000000\n", 210 | "max 9.912293e+06 2003.000000" 211 | ] 212 | }, 213 | "execution_count": 2, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "nodes.describe()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 5, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/html": [ 230 | "
\n", 231 | "\n", 244 | "\n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
idyeartitleauthorsjournalabstract
0idyeartitleauthorsjournalabstract
110012000['compactif', 'geometri', 'dualiti']['paul s. aspinwall']NaN['note', 'base', 'lectur', 'given', 'tasi99', ...
210022000['domain', 'wall', 'massiv', 'gaug', 'supergra...['m. cvetic', 'h. lu', 'c.n. pope']['class', 'quant', 'grav']['point', 'massiv', 'gaug', 'supergrav', 'pote...
310032000['comment', 'metric', 'fluctuat', 'brane', 'wo...['y.s. myung', 'gungwon kang']NaN['recent', 'ivanov', 'volovich', 'hep-th', '99...
410042000['move', 'mirror', 'thermodynam', 'paradox']['adam d. helfer']['phys', 'rev']['quantum', 'field', 'respond', 'move', 'mirro...
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " id year title \\\n", 308 | "0 id year title \n", 309 | "1 1001 2000 ['compactif', 'geometri', 'dualiti'] \n", 310 | "2 1002 2000 ['domain', 'wall', 'massiv', 'gaug', 'supergra... \n", 311 | "3 1003 2000 ['comment', 'metric', 'fluctuat', 'brane', 'wo... \n", 312 | "4 1004 2000 ['move', 'mirror', 'thermodynam', 'paradox'] \n", 313 | "\n", 314 | " authors journal \\\n", 315 | "0 authors journal \n", 316 | "1 ['paul s. aspinwall'] NaN \n", 317 | "2 ['m. cvetic', 'h. lu', 'c.n. pope'] ['class', 'quant', 'grav'] \n", 318 | "3 ['y.s. myung', 'gungwon kang'] NaN \n", 319 | "4 ['adam d. helfer'] ['phys', 'rev'] \n", 320 | "\n", 321 | " abstract \n", 322 | "0 abstract \n", 323 | "1 ['note', 'base', 'lectur', 'given', 'tasi99', ... \n", 324 | "2 ['point', 'massiv', 'gaug', 'supergrav', 'pote... \n", 325 | "3 ['recent', 'ivanov', 'volovich', 'hep-th', '99... \n", 326 | "4 ['quantum', 'field', 'respond', 'move', 'mirro... " 327 | ] 328 | }, 329 | "execution_count": 5, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n", 336 | "nodes_preprocessed = pd.read_csv(path_to_data+\"nodes_preprocessed.csv\", names=nodes_header)\n", 337 | "nodes_preprocessed.head()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 6, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/html": [ 348 | "
\n", 349 | "\n", 362 | "\n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | "
idyeartitleauthorsjournalabstract
count277712777127771237382029927771
unique2777113274961583428027765
top97020942002['black', 'hole', 'entropi'][\"shin'ichi nojiri\", 'sergei d. odintsov']['phys', 'lett']['comment', 'start', 'paper', 'hep-th', '01060...
freq1333573835753
\n", 413 | "
" 414 | ], 415 | "text/plain": [ 416 | " id year title \\\n", 417 | "count 27771 27771 27771 \n", 418 | "unique 27771 13 27496 \n", 419 | "top 9702094 2002 ['black', 'hole', 'entropi'] \n", 420 | "freq 1 3335 7 \n", 421 | "\n", 422 | " authors journal \\\n", 423 | "count 23738 20299 \n", 424 | "unique 15834 280 \n", 425 | "top [\"shin'ichi nojiri\", 'sergei d. odintsov'] ['phys', 'lett'] \n", 426 | "freq 38 3575 \n", 427 | "\n", 428 | " abstract \n", 429 | "count 27771 \n", 430 | "unique 27765 \n", 431 | "top ['comment', 'start', 'paper', 'hep-th', '01060... \n", 432 | "freq 3 " 433 | ] 434 | }, 435 | "execution_count": 6, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "nodes_preprocessed.describe()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [] 450 | } 451 | ], 452 | "metadata": { 453 | "kernelspec": { 454 | "display_name": "Python 3", 455 | "language": "python", 456 | "name": "python3" 457 | }, 458 | "language_info": { 459 | "codemirror_mode": { 460 | "name": "ipython", 461 | "version": 3 462 | }, 463 | "file_extension": ".py", 464 | "mimetype": "text/x-python", 465 | "name": "python", 466 | "nbconvert_exporter": "python", 467 | "pygments_lexer": "ipython3", 468 | "version": "3.5.2" 469 | } 470 | }, 471 | "nbformat": 4, 472 | "nbformat_minor": 2 473 | } 474 | --------------------------------------------------------------------------------