├── .gitignore ├── .idea └── vcs.xml ├── LICENSE ├── README.md ├── __init__.py ├── cloud_setup.py ├── feature_engineering ├── __init__.py ├── author_graph_features.py ├── authors.py ├── authors_2.py ├── baseline_feature_engineering.py ├── basic_features.py ├── citation_graph_features.py ├── cosine_distance.py ├── networkx_bigraph.py ├── networkx_bigraph_long.py ├── networkx_bigraph_long2.py ├── networkx_digraph.py ├── preprocessing.py └── tools.py ├── link-prediction-report.pdf ├── main.py ├── models ├── __init__.py ├── camboui.ipynb ├── feature_selection.ipynb ├── feature_selection.py ├── lgbm.py ├── logistic_regression.py ├── nn.py ├── nn_deep.py ├── plots │ ├── rf_importance.png │ └── rf_importance_full.png ├── random_forest.py ├── svm.py ├── tools.py └── tuning │ ├── __init__.py │ ├── console_nn_grid_search_example.txt │ ├── objective_function.py │ ├── plots │ └── grid_lgbm.png │ ├── tools.py │ ├── tuning_lgbm.py │ ├── tuning_nn.py │ ├── tuning_random_forest.py │ ├── tuning_svm.py │ └── tuning_svm_feat_selec.py ├── notes ├── ressources └── data_challenge_description.pdf ├── results └── results ├── sampling ├── sampling.ipynb └── sampling.py ├── stacking ├── __init__.py ├── stacking.py ├── stacking_tuning.py └── stacking_tuning_micro.py ├── task_manager.py └── tests ├── __init__.py ├── test_author_graph_features.ipynb ├── test_baseline.ipynb └── test_preprocessing.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds 
the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | /data/ 103 | /submissions/ 104 | /calculated_features/ 105 | .idea/ 106 | code/data_exploration/camboui.ipynb 107 | /code/data_exploration/camboui.ipynb 108 | /code/data_exploration/data_exploration.ipynb 109 | /code/feature_engineering/camboui.ipynb 110 | /code/models/camboui.ipynb 111 | /code/feature_engineering/camboui.py 112 | /code/feature_engineering/camboui_network_x.ipynb 113 | /tests/multiprocessing_tuto.py 114 | /data_exploration/data_exploration.ipynb 115 | /models/feature_selection_2.ipynb 116 | /results/log.txt 117 | /log.txt 118 | /bigraph_from_root.py 119 | /illustrate_report.ipynb 120 | /camboui.py 121 | *.csv 122 | results/* 123 | /storage.py 124 | Untitled.ipynb 125 | Untitled1.ipynb 126 | requirements.txt 127 | 128 | -------------------------------------------------------------------------------- /.idea/vcs.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 raph-m 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # link-prediction 2 | 3 | Predict links in a citation network. 4 | You can find the project instructions in the ressources folder. 5 | Our project report is available in the link-prediction-report pdf. 6 | 7 | ## Feature Engineering 8 | 9 | In the feature engineering folder you can find scripts to compute new features from the dataset. 
The features computed are described at the beginning of the scripts, and you can find more information in our project report. 10 | 11 | ## Feature Selection 12 | 13 | Running the feature_selection.py script will print the results of a forward selection algorithm. We chose the set of features that we were going to use for the rest of the project from these results. 14 | 15 | ## Models 16 | 17 | You can find several implementations of models to fit to our data. Running the scripts will give you the results and create a submission file. 18 | 19 | ## Tuning 20 | 21 | Running the tuning scripts will output the best parameters resulting from a cross-validated grid search on a hand-picked parameter grid. 22 | 23 | ## Main 24 | 25 | The main.py script processes all you need (feature engineering and machine learning) in order to create our final submission. The svm fit might take a substantial amount of time. 26 | You may use the generated "stack_sub_rf.csv" as a reproduction of our best submission. 
If there were to be reproducibility issues with runtimes and whatnot, we left our original submission under the name ("stack_sub_rf_reference.csv") 27 | 28 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/__init__.py -------------------------------------------------------------------------------- /cloud_setup.py: -------------------------------------------------------------------------------- 1 | # how to setup the environment for cloud computing (install python tools and libraries, download database from 2 | # google drive public link and run python file) 3 | 4 | """ 5 | sudo apt update 6 | sudo apt install python python-dev python3 python3-dev 7 | sudo apt-get install python3-setuptools 8 | wget https://bootstrap.pypa.io/get-pip.py 9 | sudo python get-pip.py 10 | sudo pip install --upgrade virtualenv 11 | sudo pip install virtualenvwrapper 12 | echo "export WORKON_HOME=$HOME/.virtualenvs" >> .bashrc 13 | echo "export PROJECT_HOME=$HOME/Devel" >> .bashrc 14 | echo "source /usr/local/bin/virtualenvwrapper.sh" >> .bashrc 15 | echo "source "/usr/bin/virtualenvwrapper.sh"" >> .bashrc 16 | echo "export WORKON_HOME="/opt/virtual_env/"" >> .bashrc 17 | source `which virtualenvwrapper.sh` 18 | mkvirtualenv -p /usr/bin/python3.5 ml1 19 | sudo pip install pandas 20 | sudo pip install requests 21 | sudo pip install dotenv 22 | sudo pip install 23 | git clone https://github.com/raph-m/safe_driver_prediction 24 | cd safe_driver_prediction/proj2 25 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip 26 | cd .. 27 | cd ..
28 | sudo apt-get install unzip 29 | unzip data.zip 30 | cd safe_driver_prediction 31 | git pull origin master 32 | echo "ENV_NAME=vm" > .env 33 | python proj2/feature_engineering.py train ../../churn/ 3000000 34 | """ 35 | 36 | # une version qui marche (sans virtualenv): 37 | """ 38 | sudo apt update 39 | sudo apt install python python-dev python3 python3-dev 40 | sudo apt-get install python3-setuptools 41 | wget https://bootstrap.pypa.io/get-pip.py 42 | sudo python get-pip.py 43 | alias python=python3 44 | sudo apt-get python3-setuptools 45 | sudo easy_install3 pip 46 | sudo pip3 install pandas 47 | sudo pip3 install requests 48 | sudo pip3 install dotenv 49 | git clone https://github.com/raph-m/safe_driver_prediction 50 | cd safe_driver_prediction/proj2 51 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip 52 | cd .. 53 | cd .. 54 | sudo apt-get install unzip 55 | unzip data.zip 56 | cd safe_driver_prediction 57 | echo "ENV_NAME=vm" > .env 58 | cd proj2 59 | python feature_engineering.py 60 | """ 61 | 62 | # une autre façon de faire c'est avec `alias python=python3` 63 | 64 | # pour automatiser ces commandes, il faudrait mettre les commandes dans ce bashCommand et lancer ce script: 65 | # bashCommand = "cwm --rdf test.rdf --ntriples > test.nt" 66 | # process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) 67 | # output, error = process.communicate() 68 | 69 | """ 70 | git clone https://github.com/raph-m/link-prediction 71 | cd link-prediction/ 72 | # get and API token from kaggle (kaggle.json) 73 | sudo pip install kaggle 74 | mv kaggle.json .kaggle/ 75 | mkdir data 76 | cd data 77 | kaggle competitions download -c link-prediction-challenge-tm-and-nlp 78 | sudo pip install nltk 79 | sudo pip install tqdm 80 | 81 | """ 82 | 83 | import requests 84 | 85 | 86 | # python script to download a file from a google drive public link 87 | 88 | 89 | def download_file_from_google_drive(id, destination): 90 | def get_confirm_token(response): 
91 | for key, value in response.cookies.items(): 92 | if key.startswith('download_warning'): 93 | return value 94 | 95 | return None 96 | 97 | def save_response_content(response, destination): 98 | CHUNK_SIZE = 32768 99 | 100 | with open(destination, "wb") as f: 101 | for chunk in response.iter_content(CHUNK_SIZE): 102 | if chunk: # filter out keep-alive new chunks 103 | f.write(chunk) 104 | 105 | URL = "https://docs.google.com/uc?export=download" 106 | 107 | session = requests.Session() 108 | 109 | response = session.get(URL, params={'id': id}, stream=True) 110 | token = get_confirm_token(response) 111 | 112 | if token: 113 | params = {'id': id, 'confirm': token} 114 | response = session.get(URL, params=params, stream=True) 115 | 116 | save_response_content(response, destination) 117 | 118 | 119 | if __name__ == "__main__": 120 | import sys 121 | 122 | if len(sys.argv) is not 3: 123 | print("Usage: python google_drive.py drive_file_id destination_file_path") 124 | else: 125 | # TAKE ID FROM SHAREABLE LINK 126 | file_id = sys.argv[1] 127 | # DESTINATION FILE ON YOUR DISK 128 | destination = sys.argv[2] 129 | download_file_from_google_drive(file_id, destination) 130 | -------------------------------------------------------------------------------- /feature_engineering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/feature_engineering/__init__.py -------------------------------------------------------------------------------- /feature_engineering/author_graph_features.py: -------------------------------------------------------------------------------- 1 | import time 2 | from itertools import permutations, product 3 | 4 | import igraph 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from feature_engineering.tools import lit_eval_nan_proof 10 | 11 | # progress bar for pandas 12 | 
tqdm.pandas(tqdm()) 13 | 14 | # path 15 | path_to_data = "data/" 16 | 17 | # loading data 18 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 19 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 20 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 21 | nodes.set_index("id", inplace=True) 22 | training = pd.read_csv(path_to_data + "training_features.txt") 23 | training.set_index("my_index", inplace=True) 24 | testing = pd.read_csv(path_to_data + "testing_features.txt") 25 | testing.set_index("my_index", inplace=True) 26 | 27 | # create author graph 28 | # vertices are authors 29 | # edge of weight 1 if they cowrote a paper, 2 if they only cite each other 30 | 31 | # create empty directed graph 32 | g = igraph.Graph(directed=True) 33 | 34 | # add vertices 35 | authors = nodes['authors'] 36 | authors_set = list(set(authors.dropna().sum())) 37 | g.add_vertices(authors_set) 38 | 39 | # first, add citation edges 40 | edges = {} 41 | # store edge ids for each edge 42 | ids = {} 43 | # store weights 44 | weights = {} 45 | id1 = training['id1'].values 46 | id2 = training['id2'].values 47 | index_train = training.index 48 | target = training["target"].values 49 | # edge id 50 | id = 0 51 | # store all the edges related to each citation 52 | eid = {} 53 | for i in tqdm(range(len(id1))): 54 | # if there is a 55 | if target[i] == 1: 56 | authors1 = nodes.at[id1[i], 'authors'] 57 | authors2 = nodes.at[id2[i], 'authors'] 58 | # check that author information is not missing 59 | if isinstance(authors1, float) or isinstance(authors2, float): 60 | continue 61 | # if authors available then add edges 62 | pairs = list(product(authors1, authors2)) 63 | # for each pair of authors 64 | for pair in pairs: 65 | # if edge already exists 66 | if pair in edges: 67 | # increment weight 68 | weights[pair] += 1 69 | # add id to edges related to this citation 70 | if index_train[i] in eid: 71 | 
eid[index_train[i]] += [id] 72 | else: 73 | eid[index_train[i]] = [id] 74 | # if doesn't exist 75 | else: 76 | # create edge 77 | edges[pair] = 1 78 | # keep track of edge id 79 | ids[pair] = id 80 | # add id to edges related to this citation 81 | if index_train[i] in eid: 82 | eid[index_train[i]] += [id] 83 | else: 84 | eid[index_train[i]] = [id] 85 | # store weight 86 | weights[pair] = 1 87 | # increment id 88 | id += 1 89 | 90 | # then, add coauthor edges 91 | authors_array = authors.values 92 | index_nodes = nodes.index.values 93 | # for each document 94 | for i in tqdm(range(len(authors_array))): 95 | # if missing author info, skip 96 | if isinstance(authors_array[i], float): 97 | continue 98 | # if not for each pair of coauthors 99 | coauthors = permutations(authors_array[i], 2) 100 | for pair in coauthors: 101 | # if edge already exists 102 | if pair in edges: 103 | # increment weight 104 | weights[pair] += 2 105 | # if doesn't exist 106 | else: 107 | # create edge 108 | edges[pair] = 1 109 | # store weight 110 | weights[pair] = 2 111 | 112 | # add edges to graph 113 | g.add_edges(list(edges)) 114 | 115 | # add weights 116 | weights = list(edges.values()) 117 | max_weight = max(weights) 118 | weights = max_weight - np.array(weights) + 1 119 | g.es['weight'] = list(weights) 120 | 121 | # compute features such as shortest path 122 | 123 | # features placeholders 124 | min_shortest_path = [] 125 | max_shortest_path = [] 126 | mean_shortest_path = [] 127 | author_in_degree_mean_target = [] 128 | author_in_degree_max_target = [] 129 | author_out_degree_mean_source = [] 130 | author_out_degree_max_source = [] 131 | author_common_neigbors_mean = [] 132 | author_common_neigbors_max = [] 133 | author_jaccard_mean = [] 134 | author_jaccard_max = [] 135 | 136 | # get training ids 137 | id1 = training['id1'].values 138 | id2 = training['id2'].values 139 | target = training["target"].values 140 | index_train = training.index 141 | 142 | # compute features for all samples 
143 | for i in tqdm(range(len(id1))): 144 | authors1 = nodes.at[id1[i], 'authors'] 145 | authors2 = nodes.at[id2[i], 'authors'] 146 | # if one of the articles has missing author info 147 | if isinstance(authors1, float) or isinstance(authors2, float): 148 | # print("NAN") 149 | # no shortest path can be computed 150 | min_shortest_path.append(np.nan) 151 | max_shortest_path.append(np.nan) 152 | mean_shortest_path.append(np.nan) 153 | # if author info is missing for first doc 154 | if isinstance(authors1, float): 155 | # no degree can be computed 156 | author_out_degree_max_source.append(np.nan) 157 | author_out_degree_mean_source.append(np.nan) 158 | # if not missing 159 | else: 160 | # compute degrees 161 | out = g.strength(authors1, weights="weight") 162 | mean_out = np.mean(out) 163 | max_out = np.max(out) 164 | author_out_degree_max_source.append(max_out) 165 | author_out_degree_mean_source.append(mean_out) 166 | # if it is missing for the second document 167 | if isinstance(authors2, float): 168 | # no degree can be computed 169 | author_in_degree_max_target.append(np.nan) 170 | author_in_degree_mean_target.append(np.nan) 171 | # if not 172 | else: 173 | # compute degrees for other document 174 | in_ = g.strength(authors2, weights="weight") 175 | mean_in = np.mean(in_) 176 | max_in = np.max(in_) 177 | author_in_degree_max_target.append(max_in) 178 | author_in_degree_mean_target.append(mean_in) 179 | continue 180 | # print("NO NAN") 181 | # if there's no missing author information 182 | # set weights of unwanted edges to zero 183 | if target[i] == 1: 184 | # print('target is 1') 185 | t0 = time.time() 186 | # print('fetching edge ids') 187 | eids_to_unweigh = eid[index_train[i]] 188 | t1 = time.time() 189 | for id in eids_to_unweigh: 190 | g.es['weight'][id] += 1 191 | t1_bis = time.time() 192 | print('bottleneck', t1 - t0, t1_bis - t1) 193 | # compute shortest paths 194 | # print("computing shortest path") 195 | t1 = time.time() 196 | # paths = 
g.shortest_paths_dijkstra(source=authors1, target=authors2, 197 | # mode="OUT", weights="weight")[0][0] 198 | # min_value = np.min(paths) 199 | # max_value = np.max(paths) 200 | # mean_value = np.mean(paths) 201 | t2 = time.time() 202 | print('shortest_path', t2 - t1) 203 | # compute degrees 204 | out = g.strength(authors1, weights="weight") 205 | in_ = g.strength(authors2, weights="weight") 206 | mean_out = np.mean(out) 207 | max_out = np.max(out) 208 | in_ = g.strength(authors2, weights="weight") 209 | mean_in = np.mean(in_) 210 | max_in = np.max(in_) 211 | t3 = time.time() 212 | print('weighted degree', t3 - t2) 213 | # create set of pairs as vertex ids as well as index values 214 | pairs = list(product(authors1, authors2)) 215 | pairs_index = list(product(range(len(authors1)), range(len(authors2)))) 216 | # compute jaccard similarity 217 | # jaccards = g.similarity_jaccard(pairs=pairs) 218 | # max_jacc = np.max(jaccards) 219 | # mean_jacc = np.mean(jaccards) 220 | t4 = time.time() 221 | # print('jacc', t4 - t3) 222 | # compute common neighbours 223 | hoods1 = g.neighborhood(vertices=authors1) 224 | hoods2 = g.neighborhood(vertices=authors2) 225 | common_hoods = [set(hoods1[i]).intersection(set(hoods2[j])) for (i, j) in pairs_index] 226 | common_hoods_size = list(map(len, common_hoods)) 227 | max_hood = np.max(common_hoods_size) 228 | mean_hood = np.mean(common_hoods_size) 229 | t5 = time.time() 230 | print('common hoods', t5 - t4) 231 | # append features to corresponding set 232 | # min_shortest_path.append(min_value) 233 | # max_shortest_path.append(max_value) 234 | # mean_shortest_path.append(mean_value) 235 | author_out_degree_max_source.append(max_out) 236 | author_out_degree_mean_source.append(mean_out) 237 | author_in_degree_max_target.append(max_in) 238 | author_in_degree_mean_target.append(mean_in) 239 | author_common_neigbors_mean.append(mean_hood) 240 | author_common_neigbors_max.append(max_hood) 241 | # author_jaccard_mean.append(max_jacc) 242 | # 
author_jaccard_max.append(mean_jacc) 243 | if target[i] == 1: 244 | for id in eids_to_unweigh: 245 | g.es['weight'][id] = 0 246 | t6 = time.time() 247 | # print("append features", t6 - t5) 248 | 249 | # add feature to dataframe 250 | # training["author_min_shortest_path"] = min_shortest_path 251 | # training["author_max_shortest_path"] = max_shortest_path 252 | # training["author_sum_shortest_path"] = sum_shortest_path 253 | # training["author_mean_shortest_path"] = mean_shortest_path 254 | training['author_out_degree_max_source'] = author_out_degree_max_source 255 | training['author_out_degree_mean_source'] = author_out_degree_mean_source 256 | training['author_in_degree_max_target'] = author_in_degree_max_target 257 | training['author_in_degree_mean_target'] = author_in_degree_mean_target 258 | training['author_common_neigbors_mean'] = author_common_neigbors_mean 259 | training['author_common_neigbors_max'] = author_common_neigbors_max 260 | # training['author_jaccard_mean'] = author_jaccard_mean 261 | # training['author_jaccard_max'] = author_jaccard_max 262 | 263 | # repeat process for test set 264 | min_shortest_path_test = [] 265 | max_shortest_path_test = [] 266 | sum_shortest_path_test = [] 267 | mean_shortest_path_test = [] 268 | author_in_degree_mean_target_test = [] 269 | author_in_degree_sum_target_test = [] 270 | author_out_degree_mean_source_test = [] 271 | author_out_degree_sum_source_test = [] 272 | id1 = testing['id1'].values 273 | id2 = testing['id2'].values 274 | for i in tqdm(range(len(id1))): 275 | authors1 = nodes.at[id1[i], 'authors'] 276 | authors2 = nodes.at[id2[i], 'authors'] 277 | if isinstance(authors1, float) or isinstance(authors2, float): 278 | min_shortest_path_test.append(np.nan) 279 | max_shortest_path_test.append(np.nan) 280 | sum_shortest_path_test.append(np.nan) 281 | mean_shortest_path_test.append(np.nan) 282 | if isinstance(authors1, float): 283 | author_out_degree_sum_source_test.append(np.nan) 284 | 
author_out_degree_mean_source_test.append(np.nan) 285 | else: 286 | sum_out = 0 287 | n_source = len(authors1) 288 | for author1 in authors1: 289 | sum_out += g.strength(author1, mode='OUT', weights="weight") 290 | mean_out = sum_out / n_source 291 | author_out_degree_sum_source_test.append(sum_out) 292 | author_out_degree_mean_source_test.append(mean_out) 293 | if isinstance(authors2, float): 294 | author_in_degree_sum_target_test.append(np.nan) 295 | author_in_degree_mean_target_test.append(np.nan) 296 | else: 297 | sum_in = 0 298 | n_target = len(authors2) 299 | for author2 in authors2: 300 | sum_in += g.strength(author2, mode='IN', weights="weight") 301 | mean_in = sum_in / n_target 302 | author_in_degree_sum_target_test.append(sum_in) 303 | author_in_degree_mean_target_test.append(mean_in) 304 | continue 305 | min_value = float('inf') 306 | max_value = - float('inf') 307 | sum_value = 0 308 | n = len(authors1) * len(authors2) 309 | for author1 in authors1: 310 | for author2 in authors2: 311 | current = g.shortest_paths_dijkstra(source=author1, target=author2, 312 | mode="OUT", weights=g.es["weight"])[0][0] 313 | min_value = current if current < min_value else min_value 314 | max_value = current if current > max_value else max_value 315 | sum_value += current 316 | mean_value = sum_value / n 317 | sum_out = 0 318 | sum_in = 0 319 | n_source = len(authors1) 320 | n_target = len(authors2) 321 | for author1 in authors1: 322 | sum_out += g.strength(author1, mode='OUT', weights="weight") 323 | for author2 in authors2: 324 | sum_in += g.strength(author2, mode='IN', weights="weight") 325 | mean_out = sum_out / n_source 326 | mean_in = sum_in / n_target 327 | min_shortest_path_test.append(min_value) 328 | max_shortest_path_test.append(max_value) 329 | sum_shortest_path_test.append(sum_value) 330 | mean_shortest_path_test.append(mean_value) 331 | author_out_degree_sum_source_test.append(sum_out) 332 | author_out_degree_mean_source_test.append(mean_out) 333 | 
author_in_degree_sum_target_test.append(sum_in) 334 | author_in_degree_mean_target_test.append(mean_in) 335 | 336 | # add feature to dataframe 337 | testing["author_min_shortest_path"] = min_shortest_path_test 338 | testing["author_max_shortest_path"] = max_shortest_path_test 339 | testing["author_sum_shortest_path"] = sum_shortest_path_test 340 | testing["author_mean_shortest_path"] = mean_shortest_path_test 341 | testing['author_out_degree_sum_source'] = author_out_degree_sum_source_test 342 | testing['author_out_degree_mean_source'] = author_out_degree_mean_source_test 343 | testing['author_in_degree_sum_target'] = author_in_degree_sum_target_test 344 | testing['author_in_degree_mean_target'] = author_in_degree_mean_target_test 345 | 346 | # save data sets 347 | training.to_csv(path_to_data + "training_features.txt") 348 | testing.to_csv(path_to_data + "testing_features.txt") 349 | -------------------------------------------------------------------------------- /feature_engineering/authors.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes the features authors_citation and coauthor score by considering the graph of coauthorship and 9 | # the author's graph of citations. 
10 | # the script takes approximately 5 minutes to run 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | # loading data 29 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 30 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 31 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", 32 | converters=converter_dict) 33 | nodes.set_index("id", inplace=True) 34 | 35 | G = nx.DiGraph() 36 | coauthors = nx.Graph() 37 | 38 | print("building coauthor graph") 39 | nodes_id = nodes.index.values 40 | for i in tqdm(range(len(nodes_id))): 41 | 42 | authors = nodes.loc[nodes_id[i]]["authors"] 43 | if authors is np.nan: 44 | authors = [] 45 | 46 | authors = np.unique([a for a in authors if a != ""]) 47 | 48 | for a in authors: 49 | G.add_node(a) 50 | coauthors.add_node(a) 51 | 52 | for a1 in authors: 53 | for a2 in authors: 54 | if a1 != a2: 55 | if coauthors.has_edge(a1, a2): 56 | coauthors[a1][a2]["weight"] += 1 57 | else: 58 | coauthors.add_edge(a1, a2, weight=1) 59 | 60 | id1 = training["id1"].values 61 | id2 = training["id2"].values 62 | 63 | print("building citation graph") 64 | for i in tqdm(range(len(id1))): 65 | current_authors_1 = nodes.loc[id1[i]]["authors"] 66 | current_authors_2 = nodes.loc[id2[i]]["authors"] 67 | 68 | if current_authors_1 is np.nan: 69 | current_authors_1 = [] 70 | 71 | if current_authors_2 is 
np.nan: 72 | current_authors_2 = [] 73 | 74 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 75 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 76 | 77 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 78 | for a1 in current_authors_1: 79 | for a2 in current_authors_2: 80 | if G.has_edge(a1, a2): 81 | G[a1][a2]["weight"] += 1 82 | else: 83 | G.add_edge(a1, a2, weight=1) 84 | 85 | coauthor_score = np.zeros(len(id1)) 86 | normalized_coauthor_score = np.zeros(len(id1)) 87 | best_coauthor_score = np.zeros(len(id1)) 88 | authors_citation = np.zeros(len(id1)) 89 | normalized_authors_citation = np.zeros(len(id1)) 90 | best_authors_citation = np.zeros(len(id1)) 91 | 92 | print("building features for training") 93 | for i in tqdm(range(len(id1))): 94 | current_authors_1 = nodes.loc[id1[i]]["authors"] 95 | current_authors_2 = nodes.loc[id2[i]]["authors"] 96 | 97 | if current_authors_1 is np.nan: 98 | current_authors_1 = [] 99 | 100 | if current_authors_2 is np.nan: 101 | current_authors_2 = [] 102 | 103 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 104 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 105 | 106 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 107 | for a1 in current_authors_1: 108 | for a2 in current_authors_2: 109 | G[a1][a2]["weight"] -= 1 110 | 111 | best = 0 112 | for a1 in current_authors_1: 113 | for a2 in current_authors_2: 114 | if G.has_edge(a1, a2): 115 | current = G[a1][a2]["weight"] 116 | authors_citation[i] += current 117 | if current > best: 118 | best = current 119 | 120 | best_authors_citation[i] = best 121 | 122 | best = 0 123 | for a1 in current_authors_1: 124 | for a2 in current_authors_2: 125 | if coauthors.has_edge(a1, a2): 126 | current = coauthors[a1][a2]["weight"] 127 | coauthor_score[i] += current 128 | if current > best: 129 | best = current 130 | 131 | best_coauthor_score[i] = best 132 | 133 | 
# normalize features 134 | denom = len(current_authors_1) * len(current_authors_2) 135 | if denom > 0: 136 | normalized_authors_citation[i] = authors_citation[i] / denom 137 | normalized_coauthor_score[i] = coauthor_score[i] / denom 138 | 139 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 140 | for a1 in current_authors_1: 141 | for a2 in current_authors_2: 142 | G[a1][a2]["weight"] += 1 143 | 144 | training["authors_citation"] = authors_citation 145 | training["normalized_authors_citation"] = normalized_authors_citation 146 | training["coauthor_score"] = coauthor_score 147 | training["normalized_coauthor_score"] = normalized_coauthor_score 148 | training["best_coauthor_score"] = best_coauthor_score 149 | training["best_authors_citation"] = best_authors_citation 150 | 151 | id1 = testing["id1"].values 152 | id2 = testing["id2"].values 153 | 154 | coauthor_score = np.zeros(len(id1)) 155 | normalized_coauthor_score = np.zeros(len(id1)) 156 | best_coauthor_score = np.zeros(len(id1)) 157 | authors_citation = np.zeros(len(id1)) 158 | normalized_authors_citation = np.zeros(len(id1)) 159 | best_authors_citation = np.zeros(len(id1)) 160 | 161 | print("building features for testing") 162 | for i in tqdm(range(len(id1))): 163 | current_authors_1 = nodes.loc[id1[i]]["authors"] 164 | current_authors_2 = nodes.loc[id2[i]]["authors"] 165 | 166 | if current_authors_1 is np.nan: 167 | current_authors_1 = [] 168 | 169 | if current_authors_2 is np.nan: 170 | current_authors_2 = [] 171 | 172 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""]) 173 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""]) 174 | 175 | best = 0 176 | for a1 in current_authors_1: 177 | for a2 in current_authors_2: 178 | if G.has_edge(a1, a2): 179 | current = G[a1][a2]["weight"] 180 | authors_citation[i] += current 181 | if current > best: 182 | best = current 183 | 184 | best_authors_citation[i] = best 185 | 186 | best = 0 187 | for a1 in 
current_authors_1: 188 | for a2 in current_authors_2: 189 | if coauthors.has_edge(a1, a2): 190 | current = coauthors[a1][a2]["weight"] 191 | coauthor_score[i] += current 192 | if current > best: 193 | best = current 194 | 195 | best_coauthor_score[i] = best 196 | 197 | # normalize features 198 | denom = len(current_authors_1) * len(current_authors_2) 199 | if denom > 0: 200 | normalized_authors_citation[i] = authors_citation[i] / denom 201 | normalized_coauthor_score[i] = coauthor_score[i] / denom 202 | 203 | testing["authors_citation"] = authors_citation 204 | testing["normalized_authors_citation"] = normalized_authors_citation 205 | testing["coauthor_score"] = coauthor_score 206 | testing["normalized_coauthor_score"] = normalized_coauthor_score 207 | testing["best_coauthor_score"] = best_coauthor_score 208 | testing["best_authors_citation"] = best_authors_citation 209 | 210 | print("done, saving data") 211 | # save data-frame 212 | training.to_csv(path_to_data + "training_features.txt") 213 | testing.to_csv(path_to_data + "testing_features.txt") 214 | -------------------------------------------------------------------------------- /feature_engineering/authors_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import networkx as nx 4 | import numpy as np 5 | import pandas as pd 6 | from tqdm import tqdm 7 | 8 | from feature_engineering.tools import lit_eval_nan_proof 9 | 10 | # this script computes the features authors_in_neighbors and authors_common_neighbors by considering 11 | # the author's graph of citations. 
12 | # the script takes approximately 5 minutes to run 13 | 14 | # progress bar for pandas 15 | tqdm.pandas(tqdm()) 16 | 17 | # path 18 | path_to_data = "data/" 19 | 20 | # loading data 21 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 22 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 23 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 24 | nodes.set_index("id", inplace=True) 25 | training = pd.read_csv(path_to_data + "training_features.txt") 26 | training.set_index("my_index", inplace=True) 27 | testing = pd.read_csv(path_to_data + "testing_features.txt") 28 | testing.set_index("my_index", inplace=True) 29 | 30 | # loading data 31 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 32 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 33 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", 34 | converters=converter_dict) 35 | nodes.set_index("id", inplace=True) 36 | 37 | G = nx.DiGraph() 38 | coauthors = nx.Graph() 39 | 40 | print("building coauthor graph") 41 | nodes_id = nodes.index.values 42 | for i in tqdm(range(len(nodes_id))): 43 | 44 | authors = nodes.loc[nodes_id[i]]["authors"] 45 | if authors is np.nan: 46 | authors = [] 47 | 48 | authors = np.unique([a for a in authors if a != ""]) 49 | 50 | for a in authors: 51 | G.add_node(a) 52 | coauthors.add_node(a) 53 | 54 | for a1 in authors: 55 | for a2 in authors: 56 | if a1 != a2: 57 | if coauthors.has_edge(a1, a2): 58 | coauthors[a1][a2]["weight"] += 1 59 | else: 60 | coauthors.add_edge(a1, a2, weight=1) 61 | 62 | id1 = training["id1"].values 63 | id2 = training["id2"].values 64 | 65 | print("building citation graph") 66 | for i in tqdm(range(len(id1))): 67 | 68 | current_authors_1 = nodes.loc[id1[i]]["authors"] 69 | current_authors_2 = nodes.loc[id2[i]]["authors"] 70 | 71 | if current_authors_1 is np.nan: 72 | current_authors_1 = [] 73 | 74 | if current_authors_2 is 
# feature placeholders, one value per (id1, id2) pair
authors_in_neighbors = np.zeros(len(id1))
normalized_authors_in_neighbors = np.zeros(len(id1))
best_authors_in_neighbors = np.zeros(len(id1))
authors_common_neighbors = np.zeros(len(id1))

print("building features for training")
for i in tqdm(range(len(id1))):
    current_authors_1 = nodes.loc[id1[i]]["authors"]
    current_authors_2 = nodes.loc[id2[i]]["authors"]

    # missing author lists are parsed as NaN by the converter
    if current_authors_1 is np.nan:
        current_authors_1 = []

    if current_authors_2 is np.nan:
        current_authors_2 = []

    # drop empty names and de-duplicate
    current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
    current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])

    # for positive pairs, subtract this pair's own contribution from the
    # author-citation graph so the feature does not encode the label.
    # NOTE(review): only the weight is decremented — the edge itself stays in
    # G, so G.predecessors below still counts it; for authors whose only
    # citation comes from this very pair that looks like target leakage.
    # Confirm whether the edge should be removed when its weight reaches 0.
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        for a1 in current_authors_1:
            for a2 in current_authors_2:
                G[a1][a2]["weight"] -= 1

    # this feature is commented because too long to compute
    # for a1 in current_authors_1:
    #     for p in G.successors(a1):
    #         for a2 in G.successors(p):
    #             if a2 in current_authors_2:
    #                 authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])

    # in-degree of the cited paper's authors in the author-citation graph:
    # total over all authors, and the best (max) single author
    best = 0
    for a1 in current_authors_2:
        current = len([g for g in G.predecessors(a1)])
        authors_in_neighbors[i] += current
        if current > best:
            best = current

    best_authors_in_neighbors[i] = best

    # normalize feature by the number of authors of the cited paper
    denom = len(current_authors_2)
    if denom > 0:
        normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom

    # restore the weight removed above so the graph is intact for the next pair
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        for a1 in current_authors_1:
            for a2 in current_authors_2:
                G[a1][a2]["weight"] += 1

training["authors_in_neighbors"] = authors_in_neighbors
training["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
training["best_authors_in_neighbors"] = best_authors_in_neighbors
training["authors_common_neighbors"] = authors_common_neighbors

# repeat for the test set; the graph stays fixed (no label to hide here)
id1 = testing["id1"].values
id2 = testing["id2"].values

authors_in_neighbors = np.zeros(len(id1))
normalized_authors_in_neighbors = np.zeros(len(id1))
best_authors_in_neighbors = np.zeros(len(id1))
authors_common_neighbors = np.zeros(len(id1))

print("building features for testing")
for i in tqdm(range(len(id1))):
    current_authors_1 = nodes.loc[id1[i]]["authors"]
    current_authors_2 = nodes.loc[id2[i]]["authors"]

    if current_authors_1 is np.nan:
        current_authors_1 = []

    if current_authors_2 is np.nan:
        current_authors_2 = []

    current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
    current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])

    # for a1 in current_authors_1:
    #     for p in G.successors(a1):
    #         for a2 in G.successors(p):
    #             if a2 in current_authors_2:
    #                 authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])

    best = 0
    for a1 in current_authors_2:
        current = len([g for g in G.predecessors(a1)])
        authors_in_neighbors[i] += current
        if current > best:
            best = current

    best_authors_in_neighbors[i] = best

    # normalize feature
    denom = len(current_authors_2)
    if denom > 0:
        normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom

testing["authors_in_neighbors"] = authors_in_neighbors
testing["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
testing["best_authors_in_neighbors"] = best_authors_in_neighbors
testing["authors_common_neighbors"] = authors_common_neighbors

print("done, saving data")
# save data-frame
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# computing features for training set
for i in tqdm(range(len(id1))):
    title1 = nodes.at[id1[i], 'title']
    title2 = nodes.at[id2[i], 'title']
    date1 = nodes.at[id1[i], 'year']
    date2 = nodes.at[id2[i], 'year']
    author1 = nodes.at[id1[i], 'authors']
    author2 = nodes.at[id2[i], 'authors']
    # number of (preprocessed) words shared by the two titles
    overlap_title.append(len(set(title1).intersection(set(title2))))
    # signed publication-year gap: citing paper year minus cited paper year
    date_diff.append(int(date1) - int(date2))
    # a missing author list is parsed as NaN (a float), hence the isinstance check
    if isinstance(author1, float) or isinstance(author2, float):
        common_author.append(0)
    else:
        common_author.append(len(set(author1).intersection(set(author2))))

# adding feature to data-frame
training["overlap_title"] = overlap_title
training["date_diff"] = date_diff
training["common_author"] = common_author

# repeat process for test set
overlap_title_test = []
date_diff_test = []
common_author_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    title1 = nodes.at[id1[i], 'title']
    title2 = nodes.at[id2[i], 'title']
    date1 = nodes.at[id1[i], 'year']
    date2 = nodes.at[id2[i], 'year']
    author1 = nodes.at[id1[i], 'authors']
    author2 = nodes.at[id2[i], 'authors']
    overlap_title_test.append(len(set(title1).intersection(set(title2))))
    date_diff_test.append(int(date1) - int(date2))
    # same NaN guard as the training loop
    if isinstance(author1, float) or isinstance(author2, float):
        common_author_test.append(0)
    else:
        common_author_test.append(len(set(author1).intersection(set(author2))))
testing["overlap_title"] = overlap_title_test
testing["date_diff"] = date_diff_test
testing["common_author"] = common_author_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# computing features for training set
for i in tqdm(range(len(id1))):
    journal1 = nodes.at[id1[i], 'journal']
    journal2 = nodes.at[id2[i], 'journal']
    abstract1 = nodes.at[id1[i], "abstract"]
    abstract2 = nodes.at[id2[i], "abstract"]
    # a missing journal is parsed as NaN (a float), hence the isinstance check
    if isinstance(journal1, float) or isinstance(journal2, float):
        journal_similarity.append(0)
    else:
        journal_similarity.append(compare_journals(journal1, journal2))
    # number of (preprocessed) words shared by the two abstracts
    overlapping_words_abstract.append(len(set(abstract1).intersection(set(abstract2))))

# adding feature to dataframe
training["journal_similarity"] = journal_similarity
training["overlapping_words_abstract"] = overlapping_words_abstract

# repeat process for test set
journal_similarity_test = []
overlapping_words_abstract_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    journal1 = nodes.at[id1[i], 'journal']
    journal2 = nodes.at[id2[i], 'journal']
    abstract1 = nodes.at[id1[i], "abstract"]
    abstract2 = nodes.at[id2[i], "abstract"]
    # same NaN guard as the training loop
    if isinstance(journal1, float) or isinstance(journal2, float):
        journal_similarity_test.append(0)
    else:
        journal_similarity_test.append(compare_journals(journal1, journal2))
    overlapping_words_abstract_test.append(len(set(abstract1).intersection(set(abstract2))))
testing["journal_similarity"] = journal_similarity_test
testing["overlapping_words_abstract"] = overlapping_words_abstract_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# placeholders for graph features
shortest_path = []

# IDs for training set
id1 = training['id1'].values
id2 = training['id2'].values
target = training["target"].values

# creating graph of citations

# create empty directed graph
g = igraph.Graph(directed=True)

# some nodes may not be connected to any other node
# hence the need to create the nodes of the graph from node_info.csv,
# not just from the edge list
nodes = nodes.index.values
str_vec = np.vectorize(str)
nodes = str_vec(nodes)

# add vertices
g.add_vertices(nodes)

# create and add edges: only observed positive training pairs are citations
edges = [(str(id1[i]), str(id2[i])) for i in range(len(id1)) if target[i] == 1]
g.add_edges(edges)

# training: for positive pairs, temporarily delete the pair's own edge so the
# shortest path is computed without the label information (no target leakage)
for i in tqdm(range(len(id1))):
    if target[i] == 1:
        g.delete_edges([(str(id1[i]), str(id2[i]))])
    shortest_path.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
    if target[i] == 1:
        g.add_edge(str(id1[i]), str(id2[i]))
# adding feature to dataframe
training["shortest_path"] = shortest_path

# repeat process for test set
shortest_path_test = []
id1 = testing['id1'].values
id2 = testing['id2'].values
for i in tqdm(range(len(id1))):
    # BUG FIX: the original loop did `if target[i] == 1: g.add_edge(...)`
    # here. `target` still holds the *training* labels (test pairs have no
    # label), so it consulted an unrelated array and inserted spurious
    # duplicate edges into the graph while test features were being computed,
    # corrupting the shortest-path values of all subsequent test rows. The
    # graph must stay fixed during test-time feature extraction.
    shortest_path_test.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
testing["shortest_path"] = shortest_path_test

# save data sets
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
# create dictionary for tfidf
abstracts = nodes['abstract'].values
average_len = np.mean(np.array([len(a) for a in abstracts]))
dictionary = corpora.Dictionary(abstracts)


def my_tf(p):
    """Sub-linear term-frequency weighting: log(1 + raw term count)."""
    return math.log(1.0 + p)


# instantiate tf-idf model
tfidf = models.TfidfModel(dictionary=dictionary, wlocal=my_tf)


# handy functions to compute cosine distance
def get_tf_idf_encoding(index):
    """Return the sparse tf-idf vector [(term_id, weight), ...] of a paper's abstract."""
    abstract = nodes.at[index, "abstract"]
    abstract = dictionary.doc2bow(abstract)
    ans = tfidf[[abstract]]
    return ans[0]


def my_norm(tfidf_abstract):
    """Euclidean norm of a sparse tf-idf vector [(term_id, weight), ...].

    NOTE(review): currently unused — see cosine_distance below.
    """
    ans = 0.0
    for (k, v) in tfidf_abstract:
        ans += v ** 2
    return np.sqrt(ans)


def cosine_distance(id1, id2):
    """Inner product of the tf-idf vectors of the two papers' abstracts.

    NOTE(review): despite the name, the result is NOT divided by the vector
    norms (my_norm is never applied), so this is a raw dot product rather
    than a true cosine similarity. Confirm this is intentional before
    normalizing — downstream models were trained on this unnormalized value.
    """
    tfidf_abstract1 = get_tf_idf_encoding(id1)
    tfidf_abstract2 = get_tf_idf_encoding(id2)
    f1 = dict(tfidf_abstract1)
    f2 = dict(tfidf_abstract2)
    ans = 0.0
    for k, v in f1.items():
        if k in f2.keys():
            ans += v * f2[k]
    return ans


def get_score(id1, id2, avglen, k1=1.2, b=0.75):
    """BM25-style match score between the abstracts of papers id1 and id2.

    Sums, over terms present in both abstracts, idf * saturated-tf of the
    id1 abstract, with the usual BM25 length normalization (avglen is the
    corpus mean abstract length; k1 and b are the standard BM25 constants).
    Terms whose idf is 0 (or missing from the model) are skipped.
    """
    abstract_1 = nodes.at[id1, "abstract"]
    len_1 = len(abstract_1)
    abstract_1 = dictionary.doc2bow(abstract_1)
    tf_1 = dict([
        (termid, tfidf.wlocal(tf))
        for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])
    idf_1 = dict([
        (termid, tfidf.idfs.get(termid))
        for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])

    abstract_2 = nodes.at[id2, "abstract"]
    abstract_2 = dictionary.doc2bow(abstract_2)
    tf_2 = dict([
        (termid, tfidf.wlocal(tf))
        for termid, tf in abstract_2 if tfidf.idfs.get(termid, 0.0) != 0.0
    ])

    ans = 0.0
    for k, v in tf_1.items():
        if k in tf_2.keys():
            ans += idf_1[k] * (v * (k1 + 1)) / (v + k1 * (1 - b + b * len_1 / avglen))
    return ans
testing["score_2_1"] = score_2_1 134 | testing["cosine_distance"] = cosine_dist 135 | 136 | # save data-frame 137 | training.to_csv(path_to_data + "training_features.txt") 138 | testing.to_csv(path_to_data + "testing_features.txt") 139 | -------------------------------------------------------------------------------- /feature_engineering/networkx_bigraph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes some features by considering the bidirectional graph of citations: jaccard, adar, 9 | # preferential_attachment, resource_allocation_index and common_neighbors 10 | # approx 10 minutes to run it 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | G = nx.Graph() 29 | G.add_nodes_from(nodes.index.values) 30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"])) 31 | 32 | # IDs for training set 33 | id1 = training['id1'].values 34 | id2 = training['id2'].values 35 | 36 | # placeholder for feature 37 | n = len(id1) 38 | jaccard = np.zeros(n) 39 | adar = np.zeros(n) 40 | preferential_attachment = np.zeros(n) 41 | resource_allocation_index = np.zeros(n) 42 | common_neighbors = np.zeros(n) 43 | 44 | 
# computing features for training set
for i in tqdm(range(len(id1))):
    # for positive pairs, temporarily remove the pair's own edge so the
    # similarity indices are computed without the label (no target leakage)
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])

    pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    jaccard[i] = pred[0][2]

    pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    adar[i] = pred[0][2]

    pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    preferential_attachment[i] = pred[0][2]

    pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    resource_allocation_index[i] = pred[0][2]

    pred = nx.common_neighbors(G, id1[i], id2[i])
    pred = len([u for u in pred])
    common_neighbors[i] = pred

    # restore the temporarily removed edge
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.add_edge(id1[i], id2[i])

# add feature to data-frame
training["jaccard"] = jaccard
training["adar"] = adar
training["preferential_attachment"] = preferential_attachment
training["resource_allocation_index"] = resource_allocation_index
# BUG FIX: this column was assigned from resource_allocation_index, silently
# discarding the common-neighbor counts computed above
training["common_neighbors"] = common_neighbors

# IDs for testing set
id1 = testing['id1'].values
id2 = testing['id2'].values

# placeholder for feature
n = len(id1)
jaccard = np.zeros(n)
adar = np.zeros(n)
preferential_attachment = np.zeros(n)
resource_allocation_index = np.zeros(n)
common_neighbors = np.zeros(n)

# computing features for testing set (graph stays fixed: test labels unknown)
for i in tqdm(range(len(id1))):
    pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    jaccard[i] = pred[0][2]

    pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    adar[i] = pred[0][2]

    pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    preferential_attachment[i] = pred[0][2]

    pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
    pred = [(u, v, p) for (u, v, p) in pred]
    resource_allocation_index[i] = pred[0][2]

    pred = nx.common_neighbors(G, id1[i], id2[i])
    pred = len([u for u in pred])
    common_neighbors[i] = pred

# add feature to data-frame
testing["jaccard"] = jaccard
testing["adar"] = adar
testing["preferential_attachment"] = preferential_attachment
testing["resource_allocation_index"] = resource_allocation_index
# BUG FIX: same copy-paste error as the training block above
testing["common_neighbors"] = common_neighbors

# save data-frame
training.to_csv(path_to_data + "training_features.txt")
testing.to_csv(path_to_data + "testing_features.txt")
def work(i0=None, n=None, is_training=True):
    """Compute truncated Katz scores for pair rows [i0, i0 + n).

    Each worker rebuilds the undirected citation graph from the positive
    training pairs, then for every pair sums beta ** len(path) over at most
    `breaking_point` shortest paths between the two papers (and the same sum
    with beta_2). Pairs with no connecting path get the sentinel -1.

    Returns (scores, scores_2, i0) so the parent can place the chunk back at
    offset i0 in the result arrays.
    """
    print(i0)
    G = nx.Graph()
    G.add_nodes_from(nodes.index.values)
    G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))

    ans = np.zeros(n)
    ans_2 = np.zeros(n)

    for i in range(n):
        # for positive training pairs, hide the pair's own edge while scoring
        # so the feature does not encode the label (no target leakage)
        if is_training:
            if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
                G.remove_edge(id1[i0 + i], id2[i0 + i])

        katz_acc = 0.0
        katz_2_acc = 0.0
        counter = 0
        try:
            iterator = nx.all_shortest_paths(G, source=id1[i0 + i], target=id2[i0 + i])
            for p in iterator:
                len_p = len(p)
                katz_acc += (beta ** len_p)
                katz_2_acc += (beta_2 ** len_p)
                counter += 1
                if counter >= breaking_point:
                    break
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # BUG FIX: the original wrote ans[i] = -1 here and then
            # unconditionally overwrote it with katz_acc (0.0) after the try,
            # so the "no path" sentinel never survived. Setting the
            # accumulators instead preserves -1 through the final assignment.
            # (The bare `except:` is also narrowed to the two exceptions
            # all_shortest_paths actually raises.)
            katz_acc = -1.0
            katz_2_acc = -1.0

        # restore the temporarily removed edge (also runs on the no-path case)
        if is_training:
            if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
                G.add_edge(id1[i0 + i], id2[i0 + i])

        ans[i] = katz_acc
        ans_2[i] = katz_2_acc

    print(i0)

    return ans, ans_2, i0


def callback(r):
    # results are gathered via tasks[i].get() in the parent; nothing to do here.
    # NOTE(review): this script sums beta ** len_p while networkx_bigraph_long2.py
    # sums len_p * beta ** len_p for the same feature names — confirm which
    # formula is intended before merging their outputs.
    ans, ans_2, i0 = r
kwds=kwds, callback=callback)) 107 | pool.close() 108 | pool.join() 109 | for i in range(n_tasks): 110 | katz[i * step: (i + 1) * step], \ 111 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get() 112 | 113 | end = time.time() 114 | print(end - start) 115 | # add feature to data-frame 116 | training["katz"] = katz 117 | training["katz_2"] = katz_2 118 | 119 | # IDs for testing set 120 | print("start computing for training: ") 121 | id1 = testing['id1'].values 122 | id2 = testing['id2'].values 123 | 124 | # placeholder for feature 125 | n = len(id1) 126 | print("size of data to process: " + str(n)) 127 | 128 | katz = np.zeros(n) 129 | katz_2 = np.zeros(n) 130 | 131 | pool = Pool() 132 | print("starting pool...") 133 | n_tasks = 512 134 | tasks = [] 135 | step = int(n / n_tasks) 136 | for i0 in range(n_tasks): 137 | kwds = { 138 | "i0": i0 * step, 139 | "n": step, 140 | "is_training": False 141 | } 142 | tasks.append(pool.apply_async(work, kwds=kwds, callback=callback)) 143 | pool.close() 144 | pool.join() 145 | for i in range(n_tasks): 146 | katz[i * step: (i + 1) * step], \ 147 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get() 148 | 149 | # add feature to data-frame 150 | testing["katz"] = katz 151 | testing["katz_2"] = katz_2 152 | 153 | print("done, saving data") 154 | # save data-frame 155 | training.to_csv(path_to_data + "training_features.txt") 156 | testing.to_csv(path_to_data + "testing_features.txt") 157 | -------------------------------------------------------------------------------- /feature_engineering/networkx_bigraph_long2.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes some features by considering the bidirectional graph of citations: jaccard, adar, 9 | # preferential_attachment, resource_allocation_index and 
# computing features for training set
for i in tqdm(range(len(id1))):
    # for positive pairs, hide the pair's own edge while scoring (no target leakage)
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.remove_edge(id1[i], id2[i])

    katz_acc = 0.0
    katz_2_acc = 0.0
    counter = 0
    try:
        iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i])
        # truncated Katz-like score: sum len(p) * beta ** len(p) over at most
        # `breaking_point` shortest paths
        for p in iterator:
            len_p = len(p)
            katz_acc += len_p * (beta ** len_p)
            katz_2_acc += len_p * (beta_2 ** len_p)
            counter += 1
            if counter >= breaking_point:
                break
        # BUG FIX: the original assigned both accumulators to katz[i]
        # (katz[i] = katz_acc; katz[i] = katz_2_acc), so katz[i] held the
        # beta_2 score and katz_2[i] stayed 0 on the success path.
        katz[i] = katz_acc
        katz_2[i] = katz_2_acc
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # no path between the two papers: sentinel -1 for both features
        # (bare `except:` narrowed to what all_shortest_paths raises)
        katz[i] = -1
        katz_2[i] = -1

    # restore the temporarily removed edge
    if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
        G.add_edge(id1[i], id2[i])

# add feature to data-frame
training["katz"] = katz
training["katz_2"] = katz_2
training set 75 | id1 = testing['id1'].values 76 | id2 = testing['id2'].values 77 | 78 | # placeholder for feature 79 | n = len(id1) 80 | katz = np.zeros(n) 81 | katz_2 = np.zeros(n) 82 | 83 | # computing features for training set 84 | for i in tqdm(range(len(id1))): 85 | katz_acc = 0.0 86 | katz_2_acc = 0.0 87 | counter = 0 88 | try: 89 | iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i]) 90 | for p in iterator: 91 | len_p = len(p) 92 | katz_acc += len_p * (beta ** len_p) 93 | katz_2_acc += len_p * (beta_2 ** len_p) 94 | counter += 1 95 | if counter >= breaking_point: 96 | break 97 | katz[i] = katz_acc 98 | katz[i] = katz_2_acc 99 | except: 100 | katz[i] = -1 101 | katz_2[i] = -1 102 | 103 | # add feature to data-frame 104 | testing["katz"] = katz 105 | testing["katz_2"] = katz_2 106 | 107 | # save data-frame 108 | training.to_csv(path_to_data + "training_features.txt") 109 | testing.to_csv(path_to_data + "testing_features.txt") 110 | -------------------------------------------------------------------------------- /feature_engineering/networkx_digraph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from feature_engineering.tools import lit_eval_nan_proof 7 | 8 | # this script computes the features out_neighbors, in_neighbors and popularity by considering the directed 9 | # graph of citations. Popularity is the sum of in degrees of predecessors. 
10 | # the script takes approximately 5 minutes to run 11 | 12 | # progress bar for pandas 13 | tqdm.pandas(tqdm()) 14 | 15 | # path 16 | path_to_data = "data/" 17 | 18 | # loading data 19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof, 20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof} 21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict) 22 | nodes.set_index("id", inplace=True) 23 | training = pd.read_csv(path_to_data + "training_features.txt") 24 | training.set_index("my_index", inplace=True) 25 | testing = pd.read_csv(path_to_data + "testing_features.txt") 26 | testing.set_index("my_index", inplace=True) 27 | 28 | G = nx.DiGraph() 29 | G.add_nodes_from(nodes.index.values) 30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"])) 31 | 32 | # IDs for training set 33 | id1 = training['id1'].values 34 | id2 = training['id2'].values 35 | 36 | # placeholder for feature 37 | n = len(id1) 38 | out_neighbors = np.zeros(n) 39 | in_neighbors = np.zeros(n) 40 | popularity = np.zeros(n) 41 | common_predecessors = np.zeros(n) 42 | common_successors = np.zeros(n) 43 | paths_of_length_one = np.zeros(n) 44 | 45 | # computing features for training set 46 | for i in tqdm(range(len(id1))): 47 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 48 | G.remove_edge(id1[i], id2[i]) 49 | 50 | in_neighbors[i] = G.in_degree(id2[i]) 51 | out_neighbors[i] = G.out_degree(id1[i]) 52 | 53 | current_common_successors = 0 54 | current_common_predecessors = 0 55 | current_paths_of_length_one = 0 56 | 57 | predecessors_2 = G.predecessors(id2[i]) 58 | predecessors_1 = G.predecessors(id1[i]) 59 | 60 | pop = 0 61 | for p in predecessors_2: 62 | pop += G.in_degree(p) 63 | if p in predecessors_1: 64 | current_common_predecessors += 1 65 | popularity[i] = pop 66 | 67 | successors_2 = G.successors(id2[i]) 68 | successors_1 = G.successors(id1[i]) 69 | 
70 | for p in successors_1: 71 | if p in successors_2: 72 | current_common_successors += 1 73 | 74 | for p in successors_1: 75 | if p in predecessors_2: 76 | current_paths_of_length_one += 1 77 | 78 | common_successors[i] = current_common_successors 79 | common_predecessors[i] = current_common_predecessors 80 | paths_of_length_one[i] = current_paths_of_length_one 81 | 82 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1: 83 | G.add_edge(id1[i], id2[i]) 84 | 85 | # add feature to data-frame 86 | training["out_neighbors"] = out_neighbors 87 | training["in_neighbors"] = in_neighbors 88 | training["popularity"] = popularity 89 | training["common_successors"] = out_neighbors 90 | training["common_predecessors"] = in_neighbors 91 | training["paths_of_length_one"] = popularity 92 | 93 | # IDs for training set 94 | id1 = testing['id1'].values 95 | id2 = testing['id2'].values 96 | 97 | # placeholder for feature 98 | n = len(id1) 99 | out_neighbors = np.zeros(n) 100 | in_neighbors = np.zeros(n) 101 | popularity = np.zeros(n) 102 | 103 | # computing features for training set 104 | for i in tqdm(range(len(id1))): 105 | 106 | in_neighbors[i] = G.in_degree(id2[i]) 107 | out_neighbors[i] = G.out_degree(id1[i]) 108 | 109 | current_common_successors = 0 110 | current_common_predecessors = 0 111 | current_paths_of_length_one = 0 112 | 113 | predecessors_2 = G.predecessors(id2[i]) 114 | predecessors_1 = G.predecessors(id1[i]) 115 | 116 | pop = 0 117 | for p in predecessors_2: 118 | pop += G.in_degree(p) 119 | if p in predecessors_1: 120 | current_common_predecessors += 1 121 | popularity[i] = pop 122 | 123 | successors_2 = G.successors(id2[i]) 124 | successors_1 = G.successors(id1[i]) 125 | 126 | for p in successors_1: 127 | if p in successors_2: 128 | current_common_successors += 1 129 | 130 | for p in successors_1: 131 | if p in predecessors_2: 132 | current_paths_of_length_one += 1 133 | 134 | common_successors[i] = current_common_successors 135 | 
common_predecessors[i] = current_common_predecessors 136 | paths_of_length_one[i] = current_paths_of_length_one 137 | 138 | popularity[i] = pop 139 | 140 | # add feature to data-frame 141 | testing["out_neighbors"] = out_neighbors 142 | testing["in_neighbors"] = in_neighbors 143 | testing["popularity"] = popularity 144 | testing["common_successors"] = out_neighbors 145 | testing["common_predecessors"] = in_neighbors 146 | testing["paths_of_length_one"] = popularity 147 | 148 | # save data-frame 149 | training.to_csv(path_to_data + "training_features.txt") 150 | testing.to_csv(path_to_data + "testing_features.txt") 151 | -------------------------------------------------------------------------------- /feature_engineering/preprocessing.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | from feature_engineering.tools import \ 6 | text_element_wise_preprocess, \ 7 | authors_element_wise_preprocess, \ 8 | journal_element_wise_preprocess 9 | 10 | # This script reads the data in node_information.csv and training_set and testing_set.csv, and creates the 11 | # files "nodes_preprocessed.csv", "training_new_index.txt" and "testing_new_index.txt". 
12 | 13 | 14 | # pre-processing tools 15 | nltk.download('punkt') # for tokenization 16 | nltk.download('stopwords') 17 | 18 | # progress bar for pandas 19 | tqdm.pandas(tqdm()) 20 | 21 | # path 22 | path_to_data = "data/" 23 | 24 | # pre-processing tools 25 | nltk.download('punkt') # for tokenization 26 | nltk.download('stopwords') 27 | stpwds = set(nltk.corpus.stopwords.words("english")) 28 | stemmer = nltk.stem.PorterStemmer() 29 | 30 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"] 31 | nodes = pd.read_csv(path_to_data + "node_information.csv", names=nodes_header) 32 | nodes.set_index("id", inplace=True) 33 | 34 | # apply to DF 35 | nodes['title'] = nodes['title'].progress_apply(text_element_wise_preprocess) 36 | nodes['abstract'] = nodes['abstract'].progress_apply(text_element_wise_preprocess) 37 | nodes['authors'] = nodes['authors'].progress_apply(authors_element_wise_preprocess) 38 | nodes['journal'] = nodes['journal'].progress_apply(journal_element_wise_preprocess) 39 | 40 | # loading train 41 | names = ["id1", "id2", "target"] 42 | training = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ") 43 | 44 | # indexing consistent throughout project 45 | training["my_index"] = training["id1"].astype(str) + "|" + training["id2"].astype(str) 46 | training.set_index("my_index", inplace=True) 47 | 48 | # same process for testing set 49 | names = ["id1", "id2"] 50 | testing = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ") 51 | testing["my_index"] = testing["id1"].astype(str) + "|" + testing["id2"].astype(str) 52 | testing.set_index("my_index", inplace=True) 53 | 54 | # save preprocessed data sets 55 | nodes.to_csv(path_to_data + "nodes_preprocessed.csv") 56 | training.to_csv(path_to_data + "training_new_index.txt") 57 | testing.to_csv(path_to_data + "testing_new_index.txt") 58 | -------------------------------------------------------------------------------- /feature_engineering/tools.py: 
-------------------------------------------------------------------------------- 1 | import ast 2 | 3 | import nltk 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | # journal similarity feature 9 | def compare_journals(journal1, journal2): 10 | if len(journal1) == 0 or len(journal2) == 0: 11 | return 0 12 | if journal1[0] == journal2[0]: 13 | return 1 + compare_journals(journal1[1:], journal2[1:]) 14 | else: 15 | return 0 16 | 17 | 18 | # nan-proof string converter wrapper 19 | def lit_eval_nan_proof(string): 20 | if len(string) == 0: 21 | return np.nan 22 | else: 23 | return ast.literal_eval(string) 24 | 25 | 26 | # element-wise stemmed tokenization and stopwords removal for titles and abstracts 27 | def text_element_wise_preprocess(string): 28 | stpwds = set(nltk.corpus.stopwords.words("english")) 29 | stemmer = nltk.stem.PorterStemmer() 30 | tokens = string.lower().split(" ") 31 | tokens_wo_stpwds = [stemmer.stem(token) for token in tokens if token not in stpwds] 32 | return tokens_wo_stpwds 33 | 34 | 35 | # element-wise lower case tokenization for authors 36 | def authors_element_wise_preprocess(string): 37 | if pd.isna(string): 38 | return string 39 | tokens = string.lower().split(", ") 40 | for i in range(len(tokens)): 41 | tokens[i] = tokens[i].split('(', 1)[0].strip(' ') 42 | return tokens 43 | 44 | 45 | # element-wise lower case tokenization for journals 46 | def journal_element_wise_preprocess(string): 47 | if pd.isna(string): 48 | return string 49 | tokens = string.lower().rstrip(".").split(".") 50 | return tokens 51 | -------------------------------------------------------------------------------- /link-prediction-report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/link-prediction-report.pdf -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | # feature engineering 2 | 3 | import time 4 | 5 | start = time.time() 6 | print("preprocessing:") 7 | import feature_engineering.preprocessing 8 | 9 | end = time.time() 10 | print("done in: " + str(end - start)) 11 | 12 | start = time.time() 13 | print("baseline_feature_engineering:") 14 | import feature_engineering.baseline_feature_engineering 15 | 16 | end = time.time() 17 | print("done in: " + str(end - start)) 18 | 19 | start = time.time() 20 | print("basic_features:") 21 | import feature_engineering.basic_features 22 | 23 | end = time.time() 24 | print("done in: " + str(end - start)) 25 | 26 | start = time.time() 27 | print("cosine_distance:") 28 | import feature_engineering.cosine_distance 29 | 30 | end = time.time() 31 | print("done in: " + str(end - start)) 32 | 33 | start = time.time() 34 | print("networkx_bigraph:") 35 | import feature_engineering.networkx_bigraph 36 | 37 | end = time.time() 38 | print("done in: " + str(end - start)) 39 | 40 | start = time.time() 41 | print("networkx_digraph:") 42 | import feature_engineering.networkx_digraph 43 | 44 | end = time.time() 45 | print("done in: " + str(end - start)) 46 | 47 | start = time.time() 48 | print("author's features:") 49 | import feature_engineering.authors 50 | 51 | end = time.time() 52 | print("done in: " + str(end - start)) 53 | 54 | start = time.time() 55 | print("author's features:") 56 | import feature_engineering.authors_2 57 | 58 | end = time.time() 59 | print("done in: " + str(end - start)) 60 | 61 | # models : train them and store the output probits for stacking purposes 62 | 63 | start = time.time() 64 | print("SVM:") 65 | import models.svm 66 | 67 | end = time.time() 68 | print("done in: " + str(end - start)) 69 | 70 | start = time.time() 71 | print("Random Forest:") 72 | import models.random_forest 73 | 74 | end = time.time() 75 | print("done in: " + str(end - start)) 76 | 77 | start = time.time() 78 | 
print("LightGBM:") 79 | import models.lgbm 80 | 81 | end = time.time() 82 | print("done in: " + str(end - start)) 83 | 84 | start = time.time() 85 | print("shallow NN:") 86 | import models.nn 87 | 88 | end = time.time() 89 | print("done in: " + str(end - start)) 90 | 91 | start = time.time() 92 | print("deep NN:") 93 | import models.nn_deep 94 | 95 | end = time.time() 96 | print("done in: " + str(end - start)) 97 | 98 | # train the model stack and generate final submission "stack_sub_rf.csv" 99 | 100 | start = time.time() 101 | print("stack :") 102 | import stacking.stacking 103 | 104 | end = time.time() 105 | print("done in: " + str(end - start)) 106 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/__init__.py -------------------------------------------------------------------------------- /models/camboui.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "date: 2018-02-16 16:16:34.322166\n", 13 | "features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance', 'shortest_path', 'jaccard', 'adar', 'preferential_attachment', 'resource_allocation_index', 'out_neighbors', 'in_neighbors', 'popularity', 'common_neighbors']\n", 14 | "model: Random Forest\n", 15 | "parameters:\n", 16 | "{'n_estimators': 10}\n", 17 | "cross validation:\n", 18 | "train: 0.9966042778250185\n", 19 | "test: 0.9720086406139066\n", 20 | "train: 0.9967559756127237\n", 21 | "test: 0.9717386898461955\n", 22 | "train: 0.9965911639381028\n", 23 | "test: 
0.9717295528568946\n", 24 | "train: 0.9965775073031881\n", 25 | "test: 0.9722326963394218\n", 26 | "train: 0.9965775816654026\n", 27 | "test: 0.9718838998969885\n", 28 | "kaggle score: \n", 29 | "overlap_title: 0.01665081496613972\n", 30 | "date_diff: 0.02190514883983991\n", 31 | "common_author: 0.005109300600450039\n", 32 | "journal_similarity: 0.002403034365747304\n", 33 | "shortest_path: 0.019781629572646377\n", 34 | "overlapping_words_abstract: 0.01535330054775155\n", 35 | "jaccard: 0.19108201273444772\n", 36 | "adar: 0.006316136251304461\n", 37 | "preferential_attachment: 0.052909861150268744\n", 38 | "resource_allocation_index: 0.43101242342404056\n", 39 | "out_neighbors: 0.015096505980321603\n", 40 | "in_neighbors: 0.015219239486567731\n", 41 | "popularity: 0.018873794971630692\n", 42 | "common_neighbors: 0.18828679710884363\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import datetime\n", 48 | "from sklearn.model_selection import KFold\n", 49 | "from sklearn.ensemble import RandomForestClassifier\n", 50 | "from sklearn.metrics import accuracy_score\n", 51 | "import pandas as pd\n", 52 | "import numpy as np\n", 53 | "\n", 54 | "from tools import f1_score\n", 55 | "\n", 56 | "# path\n", 57 | "path_to_data = \"../../data/\"\n", 58 | "path_to_submissions = \"../../submissions/\"\n", 59 | "\n", 60 | "parameters = {\n", 61 | " \"n_estimators\": 10\n", 62 | "}\n", 63 | "# parameters\n", 64 | "\n", 65 | "# load data\n", 66 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 67 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 68 | "del training[\"my_index\"]\n", 69 | "del testing[\"my_index\"]\n", 70 | "\n", 71 | "# replace inf in shortest_path with -1\n", 72 | "training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1])\n", 73 | "testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1])\n", 74 | "\n", 75 | "my_features_string = [\n", 76 | " \"overlap_title\",\n", 77 | " 
\"date_diff\",\n", 78 | " \"common_author\",\n", 79 | " \"journal_similarity\",\n", 80 | " \"overlapping_words_abstract\",\n", 81 | " \"cosine_distance\",\n", 82 | " \"shortest_path\",\n", 83 | " \"jaccard\",\n", 84 | " \"adar\",\n", 85 | " \"preferential_attachment\",\n", 86 | " \"resource_allocation_index\",\n", 87 | " \"out_neighbors\",\n", 88 | " \"in_neighbors\",\n", 89 | " \"popularity\",\n", 90 | " \"common_neighbors\"\n", 91 | "]\n", 92 | "my_features_index = []\n", 93 | "my_features_dic = {}\n", 94 | "\n", 95 | "target = 0\n", 96 | "for i in range(len(training.columns)):\n", 97 | " if training.columns[i] == \"target\":\n", 98 | " target = i\n", 99 | " elif training.columns[i] in my_features_string:\n", 100 | " my_features_dic.update({len(my_features_index): training.columns[i]})\n", 101 | " my_features_index.append(i)\n", 102 | "\n", 103 | "# separating features and labels\n", 104 | "training_val = training.values\n", 105 | "testing_val = testing.values\n", 106 | "X_train, Y_train = training_val[:, my_features_index].astype(float), training_val[:, target].astype(int)\n", 107 | "X_test = testing_val[:, my_features_index]\n", 108 | "\n", 109 | "now = datetime.datetime.now()\n", 110 | "print(\"date: \"+str(now))\n", 111 | "print(\"features: \"+str(my_features_string))\n", 112 | "print(\"model: Random Forest\")\n", 113 | "print(\"parameters:\")\n", 114 | "print(parameters)\n", 115 | "print(\"cross validation:\")\n", 116 | "\n", 117 | "RF = RandomForestClassifier(n_estimators=parameters[\"n_estimators\"])\n", 118 | "k = 5\n", 119 | "kf = KFold(k)\n", 120 | "predictions = np.zeros((X_test.shape[0], k))\n", 121 | "i = 0\n", 122 | "\n", 123 | "for train_index, test_index in kf.split(X_train, Y_train):\n", 124 | " RF.fit(X_train[train_index], Y_train[train_index])\n", 125 | " Y_pred = RF.predict(X_train[test_index])\n", 126 | " Y_pred_train = RF.predict(X_train[train_index])\n", 127 | " predictions[:, i] = RF.predict(X_test)\n", 128 | " print(\"train: 
\"+str(f1_score(Y_train[train_index], Y_pred_train)))\n", 129 | " print(\"test: \"+str(f1_score(Y_train[test_index], Y_pred)))\n", 130 | " i += 1\n", 131 | "\n", 132 | "Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)\n", 133 | "\n", 134 | "submission = pd.DataFrame(Y_test)\n", 135 | "submission.to_csv(\n", 136 | " path_or_buf=path_to_submissions+\"-\".join(my_features_string)+\".csv\",\n", 137 | " index=True,\n", 138 | " index_label=\"id\",\n", 139 | " header=[\"category\"]\n", 140 | ")\n", 141 | "print(\"kaggle score: \")\n", 142 | "\n", 143 | "for i in range(len(RF.feature_importances_)):\n", 144 | " print(str(my_features_dic[i]) + \": \" + str(RF.feature_importances_[i]))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
id1id2targetoverlap_titledate_diffcommon_authorjournal_similarityshortest_pathoverlapping_words_abstractjaccardadarpreferential_attachmentresource_allocation_indexout_neighborsin_neighborspopularity
09510123950211412002-1.040.0666670.51389855.00.1428572.07.076.0
197070759604178111002.070.0980394.32036611388.00.22640167.0123.04019.0
29312155950614200-200-1.060.0000000.0000005.00.0000000.02.08.0
3991125530216500-400-1.080.0000000.000000280.00.00000016.02.03.0
4970103320907600-500-1.080.0000000.000000168.00.0000000.02.01.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " id1 id2 target overlap_title date_diff common_author \\\n", 293 | "0 9510123 9502114 1 2 0 0 \n", 294 | "1 9707075 9604178 1 1 1 0 \n", 295 | "2 9312155 9506142 0 0 -2 0 \n", 296 | "3 9911255 302165 0 0 -4 0 \n", 297 | "4 9701033 209076 0 0 -5 0 \n", 298 | "\n", 299 | " journal_similarity shortest_path overlapping_words_abstract jaccard \\\n", 300 | "0 2 -1.0 4 0.066667 \n", 301 | "1 0 2.0 7 0.098039 \n", 302 | "2 0 -1.0 6 0.000000 \n", 303 | "3 0 -1.0 8 0.000000 \n", 304 | "4 0 -1.0 8 0.000000 \n", 305 | "\n", 306 | " adar preferential_attachment resource_allocation_index \\\n", 307 | "0 0.513898 55.0 0.142857 \n", 308 | "1 4.320366 11388.0 0.226401 \n", 309 | "2 0.000000 5.0 0.000000 \n", 310 | "3 0.000000 280.0 0.000000 \n", 311 | "4 0.000000 168.0 0.000000 \n", 312 | "\n", 313 | " out_neighbors in_neighbors popularity \n", 314 | "0 2.0 7.0 76.0 \n", 315 | "1 67.0 123.0 4019.0 \n", 316 | "2 0.0 2.0 8.0 \n", 317 | "3 16.0 2.0 3.0 \n", 318 | "4 0.0 2.0 1.0 " 319 | ] 320 | }, 321 | "execution_count": 3, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "training.head()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 3", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.5.2" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 2 359 | } 360 | -------------------------------------------------------------------------------- /models/feature_selection.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime\n", 10 | "from sklearn.model_selection import KFold\n", 11 | "from sklearn.ensemble import RandomForestClassifier\n", 12 | "from sklearn.metrics import accuracy_score\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "\n", 16 | "from tools import f1_score\n", 17 | "\n", 18 | "# path\n", 19 | "path_to_data = \"../../data/\"\n", 20 | "path_to_submissions = \"../../submissions/\"\n", 21 | "\n", 22 | "parameters = {\n", 23 | " \"n_estimators\": 10,\n", 24 | " \"criterion\": \"entropy\", # default = gini\n", 25 | " \"bootstrap\": True\n", 26 | "}\n", 27 | "# parameters\n", 28 | "\n", 29 | "# load data\n", 30 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 31 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 32 | "del training[\"my_index\"]\n", 33 | "del testing[\"my_index\"]\n", 34 | "\n", 35 | "\n", 36 | "import pandas\n", 37 | "import numpy\n", 38 | "from sklearn.feature_selection import SelectKBest\n", 39 | "from sklearn.feature_selection import chi2\n", 40 | "# load data\n", 41 | "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data\"\n", 42 | "names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n", 43 | "dataframe = pandas.read_csv(url, names=names)\n", 44 | "array = dataframe.values\n", 45 | "X = array[:,0:8]\n", 46 | "Y = array[:,8]\n", 47 | "# feature extraction\n", 48 | "test = SelectKBest(score_func=chi2, k=4)\n", 49 | "fit = test.fit(X, Y)\n", 50 | "# summarize scores\n", 51 | "numpy.set_printoptions(precision=3)\n", 52 | "print(fit.scores_)\n", 53 | "features = fit.transform(X)\n", 54 | "# summarize selected features\n", 55 | "print(features[0:5,:])" 56 | ] 57 | } 58 | ], 59 
| "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.5.2" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 0 80 | } 81 | -------------------------------------------------------------------------------- /models/feature_selection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.ensemble import RandomForestClassifier 3 | from sklearn.model_selection import KFold 4 | 5 | from models.tools import f1_score 6 | 7 | # path 8 | path_to_data = "data/" 9 | path_to_submissions = "submissions/" 10 | path_to_stacking = "stacking" 11 | path_to_plots = "plots" 12 | 13 | # tuned hyper-parameters 14 | 15 | parameters = { 16 | "n_estimators": 100, 17 | "criterion": "entropy", # default = gini 18 | "max_depth": 20, 19 | "min_samples_leaf": 10, 20 | "bootstrap": True, 21 | "n_jobs": -1 22 | } 23 | 24 | # load data 25 | training = pd.read_csv(path_to_data + "training_features.txt") 26 | del training["my_index"] 27 | 28 | # replace inf in shortest_path with -1 29 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1]) 30 | 31 | my_features_string = [ 32 | "date_diff", 33 | "overlap_title", 34 | "common_author", 35 | "score_1_2", 36 | "score_2_1", 37 | "cosine_distance", 38 | "journal_similarity", 39 | "overlapping_words_abstract", 40 | "jaccard", 41 | "adar", 42 | "preferential_attachment", 43 | "resource_allocation_index", 44 | "out_neighbors", 45 | "in_neighbors", 46 | "common_neighbors", 47 | "shortest_path", 48 | "popularity", 49 | "common_successors", 50 | "common_predecessors", 51 | "paths_of_length_one", 52 | 
"authors_citation", 53 | "normalized_authors_citation", 54 | "best_authors_citation", 55 | "coauthor_score", 56 | "normalized_coauthor_score", 57 | "best_coauthor_score", 58 | "authors_in_neighbors", 59 | "normalized_authors_in_neighbors", 60 | "best_authors_in_neighbors" 61 | ] 62 | 63 | my_features_index = [] 64 | my_features_dic = {} 65 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 66 | 67 | target = 0 68 | for i in range(len(training.columns)): 69 | if training.columns[i] == "target": 70 | target = i 71 | 72 | Y_train = training.values[:, target].astype(int) 73 | 74 | del training["target"] 75 | 76 | already_computed_names = [] 77 | already_computed = [] 78 | 79 | for i in range(len(training.columns)): 80 | if training.columns[i] in my_features_string: 81 | my_features_dic.update({i: training.columns[i]}) 82 | my_features_index.append(i) 83 | if training.columns[i] in already_computed_names: 84 | already_computed.append(i) 85 | 86 | features_to_keep = [] 87 | for u in range(len(my_features_index)): 88 | 89 | try: 90 | features_to_keep.append(already_computed[u]) 91 | print("added already computed feature " + str(my_features_dic[already_computed[u]])) 92 | except: 93 | 94 | features_to_keep_names = [my_features_dic[i] for i in features_to_keep] 95 | print("new round !") 96 | print("u = " + str(u) + ", current features are: " + str(features_to_keep_names)) 97 | best_test_score = 0.0 98 | best_train_score = 0.0 99 | best_index = 0 100 | for i, f in my_features_dic.items(): 101 | if i not in features_to_keep: 102 | # separating features and labels 103 | print("testing additional feature: " + f) 104 | current_features = features_to_keep + [i] 105 | 106 | X_train = training.values[:, current_features] 107 | 108 | RF = RandomForestClassifier( 109 | n_estimators=parameters["n_estimators"], 110 | criterion=parameters["criterion"], 111 | max_depth=parameters["max_depth"], 112 | 
min_samples_leaf=parameters["min_samples_leaf"], 113 | bootstrap=parameters["bootstrap"], 114 | n_jobs=parameters["n_jobs"] 115 | ) 116 | k = 2 117 | kf = KFold(k) 118 | train_score = 0.0 119 | test_score = 0.0 120 | 121 | for train_index, test_index in kf.split(X_train, Y_train): 122 | RF.fit(X_train[train_index], Y_train[train_index]) 123 | Y_pred = RF.predict(X_train[test_index]) 124 | Y_pred_train = RF.predict(X_train[train_index]) 125 | train_score += f1_score(Y_train[train_index], Y_pred_train) 126 | test_score += f1_score(Y_train[test_index], Y_pred) 127 | 128 | train_score /= k 129 | test_score /= k 130 | 131 | if test_score > best_test_score: 132 | best_index = i 133 | best_train_score = train_score 134 | best_test_score = test_score 135 | 136 | print("train score: " + str(train_score)) 137 | print("test score: " + str(test_score)) 138 | print("") 139 | 140 | print("for this round, the best feature was " + my_features_dic[best_index]) 141 | features_to_keep.append(best_index) 142 | print("the scores obtained were: ") 143 | print("train score: " + str(best_train_score)) 144 | print("test score: " + str(best_test_score)) 145 | print("\n\n\n\n") 146 | 147 | # # print feature importances 148 | # for i in range(len(RF.feature_importances_)): 149 | # print(str(my_features_dic[i]) + ": " + str(RF.feature_importances_[i])) 150 | -------------------------------------------------------------------------------- /models/lgbm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection import KFold 7 | 8 | from models.tools import f1_score, f1_score_lgbm, load_data 9 | 10 | # path 11 | path_to_data = "data/" 12 | path_to_submissions = "submissions/" 13 | path_to_stacking = "stacking/" 14 | path_to_plots = "models/plots/" 15 | 16 | # tuned hyper-parameters 17 | parameters = { 18 | 'task': 'train', 19 | 
'boosting_type': 'gbdt', 20 | 'objective': 'binary', 21 | # 'metric': {}, 22 | 'num_leaves': 200, 23 | 'learning_rate': 0.1, 24 | 'feature_fraction': 0.5, 25 | 'bagging_fraction': 0.6, 26 | 'bagging_freq': 5, 27 | 'verbose': 0, 28 | "min_data_in_leaf": 3, 29 | "max_depth": 150 30 | } 31 | # used features 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | "overlap_title", 36 | "common_author", 37 | # "score_1_2", 38 | # "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | # "jaccard", 43 | # "adar", 44 | "preferential_attachment", 45 | # "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | "shortest_path", 50 | "popularity", 51 | "common_successors", 52 | "common_predecessors", 53 | "paths_of_length_one", 54 | "authors_citation", 55 | "coauthor_score" 56 | # "katz" 57 | # "katz_2" 58 | ] 59 | 60 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 61 | 62 | # load data 63 | 64 | (X_train, 65 | X_test, 66 | Y_train, 67 | my_features_index, 68 | my_features_dic) = load_data(my_features_string) 69 | 70 | # print user info 71 | now = datetime.datetime.now() 72 | print("date: " + str(now)) 73 | print("features: " + str(my_features_string)) 74 | print("model: LGBM") 75 | print("parameters:") 76 | print(parameters) 77 | print("cross validation:") 78 | 79 | # instantiate Kfold and predictions placeholder 80 | k = 5 81 | kf = KFold(k) 82 | predictions = np.zeros((X_test.shape[0], k)) 83 | predictions_train = np.zeros(X_train.shape[0]) 84 | i = 0 85 | 86 | # for each fold store predictions on test set and print validation results 87 | results = [] 88 | print('Start training...') 89 | for train_index, test_index in kf.split(X_train): 90 | lgb_train = lgb.Dataset(X_train[train_index], Y_train[train_index]) 91 | lgb_eval = lgb.Dataset(X_train[test_index], Y_train[test_index], reference=lgb_train) 92 | gbm = 
lgb.train(parameters, 93 | train_set=lgb_train, 94 | num_boost_round=100, 95 | valid_sets=lgb_eval, 96 | verbose_eval=40, 97 | feval=f1_score_lgbm 98 | ) 99 | res = gbm.predict(X_test) 100 | Y_pred = gbm.predict(X_train[test_index]) 101 | Y_pred_train = gbm.predict(X_train[train_index]) 102 | predictions[:, i] = res 103 | predictions_train[test_index] = Y_pred 104 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train.round()))) 105 | print("test: " + str(f1_score(Y_train[test_index], Y_pred.round()))) 106 | i += 1 107 | 108 | # save submission file 109 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 110 | submission = pd.DataFrame(Y_test) 111 | submission.to_csv( 112 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "lgbm" + ".csv", 113 | index=True, 114 | index_label="id", 115 | header=["category"] 116 | ) 117 | 118 | # save probabilities for stacking 119 | stacking_logits_test = np.sum(predictions, axis=1) 120 | stacking_test = pd.DataFrame(stacking_logits_test) 121 | stacking_test.to_csv( 122 | path_or_buf=path_to_stacking + "lgbm_test" + ".csv", 123 | index=True, 124 | index_label="id", 125 | header=["category"] 126 | ) 127 | 128 | stacking_train = pd.DataFrame(predictions_train) 129 | stacking_train.to_csv( 130 | path_or_buf=path_to_stacking + "lgbm_train" + ".csv", 131 | index=True, 132 | index_label="id", 133 | header=["category"] 134 | ) 135 | -------------------------------------------------------------------------------- /models/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LogisticRegressionCV 6 | from sklearn.model_selection import KFold 7 | 8 | from models.tools import f1_score 9 | 10 | # path 11 | path_to_data = "data/" 12 | path_to_submissions = "submissions/" 13 | 14 | # parameters 15 | parameters = { 16 | "max_iter": 100, 17 | "tol": 1e-6, 18 
| "penalty": "l2" 19 | } 20 | 21 | # load data 22 | training = pd.read_csv(path_to_data + "training_features.txt") 23 | testing = pd.read_csv(path_to_data + "testing_features.txt") 24 | del training["my_index"] 25 | del testing["my_index"] 26 | 27 | # replace inf in shortest_path with -1 28 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1]) 29 | testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1]) 30 | 31 | my_features_string = [ 32 | "date_diff", 33 | "overlap_title", 34 | "common_author", 35 | "score_1_2", 36 | # "score_2_1", 37 | # "cosine_distance", 38 | # "journal_similarity", 39 | # "overlapping_words_abstract", 40 | # "jaccard", 41 | # "adar", 42 | # "preferential_attachment", 43 | # "resource_allocation_index", 44 | # "out_neighbors", 45 | # "in_neighbors", 46 | # "common_neighbors", 47 | "shortest_path", 48 | "popularity" 49 | ] 50 | my_features_index = [] 51 | my_features_dic = {} 52 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 53 | 54 | target = 0 55 | for i in range(len(training.columns)): 56 | if training.columns[i] == "target": 57 | target = i 58 | 59 | Y_train = training.values[:, target].astype(int) 60 | 61 | del training["target"] 62 | 63 | for i in range(len(training.columns)): 64 | if training.columns[i] in my_features_string: 65 | my_features_dic.update({i: training.columns[i]}) 66 | my_features_index.append(i) 67 | 68 | # separating features and labels 69 | training_val = training.values 70 | testing_val = testing.values 71 | X_train = training_val[:, my_features_index].astype(float) 72 | X_test = testing_val[:, my_features_index] 73 | 74 | now = datetime.datetime.now() 75 | print("date: " + str(now)) 76 | print("features: " + str(my_features_string)) 77 | print("model: Random Forest") 78 | print("parameters:") 79 | print(parameters) 80 | print("cross validation:") 81 | 82 | LogReg = 
LogisticRegressionCV(max_iter=parameters['max_iter'], 83 | tol=parameters['tol'], 84 | penalty=parameters['penalty']) 85 | k = 5 86 | kf = KFold(k) 87 | predictions = np.zeros((X_test.shape[0], k)) 88 | i = 0 89 | 90 | for train_index, test_index in kf.split(X_train, Y_train): 91 | LogReg.fit(X_train[train_index], Y_train[train_index]) 92 | Y_pred = LogReg.predict(X_train[test_index]) 93 | Y_pred_train = LogReg.predict(X_train[train_index]) 94 | predictions[:, i] = LogReg.predict(X_test) 95 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train))) 96 | print("test: " + str(f1_score(Y_train[test_index], Y_pred))) 97 | i += 1 98 | 99 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 100 | 101 | # submission = pd.DataFrame(Y_test) 102 | # submission.to_csv( 103 | # path_or_buf=path_to_submissions+"-".join(my_features_string)+"LogReg.csv", 104 | # index=True, 105 | # index_label="id", 106 | # header=["category"] 107 | # ) 108 | -------------------------------------------------------------------------------- /models/nn.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy 4 | import numpy as np 5 | import pandas as pd 6 | from keras.layers import Dense, Dropout 7 | from keras.models import Sequential 8 | from keras.wrappers.scikit_learn import KerasClassifier 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from models.tools import load_data 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # tuned hyper-parameters 21 | 22 | parameters = { 23 | "n_estimators": 150, 24 | "criterion": "entropy", # default = gini 25 | "max_depth": 15, # 9 26 | "min_samples_leaf": 4, # 10 27 | "bootstrap": True, 28 | "n_jobs": -1 29 | } 30 | 31 | # features used 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | 
"overlap_title", 36 | "common_author", 37 | "score_1_2", 38 | "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | "jaccard", 43 | "adar", 44 | "preferential_attachment", 45 | "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | # "shortest_path", 50 | "popularity", 51 | "authors_citation", 52 | "coauthor_score" 53 | # "paths_of_length_one" 54 | # "katz" 55 | # "katz_2" 56 | ] 57 | 58 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 59 | 60 | (X_train, 61 | X_test, 62 | Y_train, 63 | my_features_index, 64 | my_features_dic) = load_data(my_features_string) 65 | 66 | # normalize data 67 | scaler = StandardScaler() 68 | X_train = scaler.fit_transform(X_train) 69 | X_test = scaler.transform(X_test) 70 | 71 | # Function to create model, required for KerasClassifier 72 | nb_input = len(my_features_string) 73 | 74 | 75 | def create_model(neurons=1, dropout_rate=0.1, activation='relu'): 76 | # create model 77 | model = Sequential() 78 | model.add(Dense(neurons, input_dim=nb_input, activation=activation)) 79 | model.add(Dropout(dropout_rate)) 80 | model.add(Dense(1, input_dim=nb_input, activation='sigmoid')) 81 | # Compile model 82 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 83 | return model 84 | 85 | 86 | # parameters 87 | epochs = 30 88 | batch_size = 128 89 | 90 | # tuned parameters 91 | dropout_rate = 0.2 92 | neurons = 75 93 | 94 | # fix random seed for reproducibility 95 | seed = 7 96 | numpy.random.seed(seed) 97 | 98 | # instantiate classifier 99 | nn = KerasClassifier(build_fn=create_model, 100 | epochs=epochs, 101 | batch_size=batch_size, 102 | dropout_rate=dropout_rate, 103 | neurons=neurons, 104 | verbose=1 105 | ) 106 | 107 | # print user info 108 | now = datetime.datetime.now() 109 | print("date: " + str(now)) 110 | print("features: " + 
str(my_features_string)) 111 | print("model: Neural Network") 112 | print("parameters:") 113 | print(parameters) 114 | print("cross validation:") 115 | 116 | # instantiate Kfold and predictions placeholder 117 | k = 5 118 | kf = StratifiedKFold(k) 119 | predictions = np.zeros((X_test.shape[0], k)) 120 | predictions_test = np.zeros((X_test.shape[0], k)) 121 | predictions_train = np.zeros(X_train.shape[0]) 122 | i = 0 123 | 124 | # for each fold store predictions on test set and print validation results 125 | test_score = 0.0 126 | for train_index, test_index in kf.split(X_train, Y_train): 127 | nn.fit(X_train[train_index], Y_train[train_index]) 128 | Y_pred = nn.predict(X_train[test_index])[:, 0] 129 | Y_pred_train = nn.predict(X_train[train_index])[:, 0] 130 | predictions[:, i] = nn.predict(X_test)[:, 0] 131 | predictions_test[:, i] = nn.predict_proba(X_test)[:, 1] 132 | predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1] 133 | # current_test_score = f1_score(Y_train[test_index], Y_pred)[:, 0] 134 | # test_score += current_test_score 135 | # print("train: " + str(f1_score(Y_train[train_index], Y_pred_train))) 136 | # print("test: " + str(current_test_score)) 137 | i += 1 138 | # print("CV test score: "+str(test_score/k)) 139 | 140 | # save submission file 141 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int) 142 | submission = pd.DataFrame(Y_test) 143 | submission.to_csv( 144 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn.csv", 145 | index=True, 146 | index_label="id", 147 | header=["category"] 148 | ) 149 | 150 | # save probabilities for stacking 151 | stacking_logits_test = np.sum(predictions_test, axis=1) 152 | stacking_test = pd.DataFrame(stacking_logits_test) 153 | stacking_test.to_csv( 154 | path_or_buf=path_to_stacking + "nn_test" + ".csv", 155 | index=True, 156 | index_label="id", 157 | header=["category"] 158 | ) 159 | 160 | stacking_train = pd.DataFrame(predictions_train) 161 | 
stacking_train.to_csv( 162 | path_or_buf=path_to_stacking + "nn_train" + ".csv", 163 | index=True, 164 | index_label="id", 165 | header=["category"] 166 | ) 167 | -------------------------------------------------------------------------------- /models/nn_deep.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy 4 | import numpy as np 5 | import pandas as pd 6 | from keras.layers import Dense, Dropout 7 | from keras.models import Sequential 8 | from keras.wrappers.scikit_learn import KerasClassifier 9 | from sklearn.model_selection import StratifiedKFold 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from models.tools import load_data 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # tuned hyper-parameters 21 | 22 | parameters = { 23 | "n_estimators": 150, 24 | "criterion": "entropy", # default = gini 25 | "max_depth": 15, # 9 26 | "min_samples_leaf": 4, # 10 27 | "bootstrap": True, 28 | "n_jobs": -1 29 | } 30 | 31 | # used features 32 | 33 | my_features_string = [ 34 | "date_diff", 35 | "overlap_title", 36 | "common_author", 37 | "score_1_2", 38 | "score_2_1", 39 | "cosine_distance", 40 | "journal_similarity", 41 | # "overlapping_words_abstract", 42 | "jaccard", 43 | "adar", 44 | "preferential_attachment", 45 | "resource_allocation_index", 46 | "out_neighbors", 47 | "in_neighbors", 48 | "common_neighbors", 49 | # "shortest_path", 50 | "popularity", 51 | # "paths_of_length_one" 52 | # "katz" 53 | # "katz_2" 54 | ] 55 | 56 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 57 | 58 | (X_train, 59 | X_test, 60 | Y_train, 61 | my_features_index, 62 | my_features_dic) = load_data(my_features_string) 63 | 64 | # normalize data 65 | scaler = StandardScaler() 66 | X_train = scaler.fit_transform(X_train) 67 | 
X_test = scaler.transform(X_test)

# Function to create model, required for KerasClassifier.
# FIX: size the input layer from the data actually loaded; load_data silently
# drops any requested feature missing from the feature files, in which case
# len(my_features_string) would not match the number of columns and the
# input layer would be mis-sized.
nb_input = X_train.shape[1]


def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
    """Build a two-hidden-layer binary classifier with a sigmoid output."""
    model = Sequential()
    model.add(Dense(neurons, input_dim=nb_input, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2 * neurons, activation=activation))
    model.add(Dropout(dropout_rate))
    # input_dim on a non-first layer is ignored by Keras, so it is omitted here
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# training parameters
epochs = 30
batch_size = 128

# tuned parameters
dropout_rate = 0.2
neurons = 75

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# instantiate classifier
nn = KerasClassifier(build_fn=create_model,
                     epochs=epochs,
                     batch_size=batch_size,
                     dropout_rate=dropout_rate,
                     neurons=neurons,
                     verbose=1
                     )

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: Neural Network")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate Kfold and predictions placeholder
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold: store hard votes (submission), fold probabilities on the
# test set and out-of-fold probabilities on the train set (stacking)
for train_index, test_index in kf.split(X_train, Y_train):
    nn.fit(X_train[train_index], Y_train[train_index])
    predictions[:, i] = nn.predict(X_test)[:, 0]
    predictions_test[:, i] = nn.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1]
    i += 1

# save submission file: positive when at least 3 of the 5 fold models vote 1
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn_deep.csv",
    index=True,
    index_label="id",
    header=["category"]
)

# save probabilities for stacking
stacking_logits_test = np.sum(predictions_test, axis=1)
stacking_test = pd.DataFrame(stacking_logits_test)
stacking_test.to_csv(
    path_or_buf=path_to_stacking + "nn_deep_test" + ".csv",
    index=True,
    index_label="id",
    header=["category"]
)

stacking_train = pd.DataFrame(predictions_train)
stacking_train.to_csv(
    path_or_buf=path_to_stacking + "nn_deep_train" + ".csv",
    index=True,
    index_label="id",
    header=["category"]
)
"""Random-forest model: 2-fold CV, majority-vote submission and out-of-fold
probabilities for stacking; downstream lines write the CSVs and plot feature
importances."""
import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

from models.tools import f1_score, plot_importance, load_data

# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"
path_to_plots = "plots/"

# tuned hyper-parameters
parameters = {
    "n_estimators": 150,
    "criterion": "entropy",  # default = gini
    "max_depth": 20,
    "min_samples_leaf": 10,
    "bootstrap": True,
    "n_jobs": -1
}

# used features
my_features_string = [
    "date_diff",
    "common_author",
    "cosine_distance",
    "preferential_attachment",
    "in_neighbors",
    "common_neighbors",
    "authors_citation",
]

# acronym of each feature: first letter of every underscore-separated word
my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]

# load data
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(my_features_string)

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: Random Forest")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate classifier
RF = RandomForestClassifier(
    n_estimators=parameters["n_estimators"],
    criterion=parameters["criterion"],
    max_depth=parameters["max_depth"],
    min_samples_leaf=parameters["min_samples_leaf"],
    bootstrap=parameters["bootstrap"],
    n_jobs=parameters["n_jobs"]
)

# instantiate Kfold and predictions placeholder
k = 2
kf = KFold(k)  # KFold ignores the labels passed to split() below
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold store predictions on test set and print validation results
test_score = 0.0
for train_index, test_index in kf.split(X_train, Y_train):
    RF.fit(X_train[train_index], Y_train[train_index])
    Y_pred = RF.predict(X_train[test_index])
    Y_pred_train = RF.predict(X_train[train_index])
    predictions[:, i] = RF.predict(X_test)
    predictions_test[:, i] = RF.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = RF.predict_proba(X_train[test_index])[:, 1]
    current_test_score = f1_score(Y_train[test_index], Y_pred)
    test_score += current_test_score
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(current_test_score))
    i += 1

print("CV test score: " + str(test_score / k))

# save submission file.
# FIX: the majority-vote threshold must scale with the fold count; the
# hard-coded "> 2.5" can never be satisfied by k = 2 binary votes, so the
# submission used to be all zeros.
Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "RF.csv",
    index=True,
    index_label="id",
    header=["category"]
)

# save probabilities for stacking
stacking_logits_test = np.sum(predictions_test, axis=1)
stacking_test = pd.DataFrame(stacking_logits_test)
pd.DataFrame(stacking_logits_test) 122 | stacking_test.to_csv( 123 | path_or_buf=path_to_stacking + "rf_test_2" + ".csv", 124 | index=True, 125 | index_label="id", 126 | header=["category"] 127 | ) 128 | 129 | stacking_train = pd.DataFrame(predictions_train) 130 | stacking_train.to_csv( 131 | path_or_buf=path_to_stacking + "rf_train_2" + ".csv", 132 | index=True, 133 | index_label="id", 134 | header=["category"] 135 | ) 136 | 137 | # plot feature importances 138 | plot_importance(RF, 139 | features_dict=my_features_dic, 140 | features_index=my_features_index, 141 | name='rf_importance') 142 | -------------------------------------------------------------------------------- /models/svm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn import svm 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | from models.tools import f1_score, load_data 10 | 11 | # path 12 | path_to_data = "data/" 13 | path_to_submissions = "submissions/" 14 | path_to_stacking = "stacking/" 15 | path_to_plots = "models/plots" 16 | 17 | # used features 18 | 19 | my_features_string = [ 20 | "date_diff", 21 | "overlap_title", 22 | "common_author", 23 | # "score_1_2", 24 | # "score_2_1", 25 | "cosine_distance", 26 | # "journal_similarity", 27 | # # "overlapping_words_abstract", 28 | # "jaccard", 29 | # "adar", 30 | "preferential_attachment", 31 | # "resource_allocation_index", 32 | "out_neighbors", 33 | "in_neighbors", 34 | "common_neighbors", 35 | # # "shortest_path", 36 | # "popularity", 37 | # # "paths_of_length_one" 38 | # "katz" 39 | # "katz_2" 40 | ] 41 | 42 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string] 43 | 44 | # load data 45 | 46 | (X_train, 47 | X_test, 48 | Y_train, 49 | my_features_index, 50 | my_features_dic) = 
load_data(my_features_string)

# normalize data -- the SVM objective is sensitive to feature scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# tuned hyperparameters
parameters = {
    'C': 0.1,
    'gamma': 0.01,
    'kernel': "linear"
}

# print user info
now = datetime.datetime.now()
print("date: " + str(now))
print("features: " + str(my_features_string))
print("model: SVM")
print("parameters:")
print(parameters)
print("cross validation:")

# instantiate classifier (probability=True enables predict_proba for stacking)
svm_classifier = svm.SVC(C=parameters['C'],
                         gamma=parameters['gamma'],
                         kernel=parameters['kernel'],
                         probability=True,
                         verbose=1)

# instantiate Kfold and predictions placeholder
k = 2
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold store predictions on test set and print validation results
for train_index, test_index in kf.split(X_train, Y_train):
    svm_classifier.fit(X_train[train_index], Y_train[train_index])
    Y_pred = svm_classifier.predict(X_train[test_index])
    Y_pred_train = svm_classifier.predict(X_train[train_index])
    predictions[:, i] = svm_classifier.predict(X_test)
    predictions_test[:, i] = svm_classifier.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = svm_classifier.predict_proba(X_train[test_index])[:, 1]
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# save submission file.
# FIX: the majority-vote threshold must scale with the fold count; the
# hard-coded "> 2.5" is unreachable with k = 2 binary votes, so every test
# pair used to be predicted 0.
Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "SVM.csv",
    index=True,
    index_label="id",
    header=["category"]
)
107 | header=["category"] 108 | ) 109 | 110 | # save probabilities for stacking 111 | stacking_logits_test = np.sum(predictions_test, axis=1) 112 | stacking_test = pd.DataFrame(stacking_logits_test) 113 | stacking_test.to_csv( 114 | path_or_buf=path_to_stacking + "svmlinear_test" + ".csv", 115 | index=True, 116 | index_label="id", 117 | header=["category"] 118 | ) 119 | 120 | stacking_train = pd.DataFrame(predictions_train) 121 | stacking_train.to_csv( 122 | path_or_buf=path_to_stacking + "svmlinear_train" + ".csv", 123 | index=True, 124 | index_label="id", 125 | header=["category"] 126 | ) 127 | -------------------------------------------------------------------------------- /models/tools.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | 8 | def binary_error(preds, train_data): 9 | labels = train_data.get_label() 10 | return 'error', np.mean(labels != (preds > 0.5)), False 11 | 12 | 13 | def f1_score_lgbm(preds, train_data): 14 | labels = train_data.get_label() 15 | tp = np.sum(labels[labels == 1] == (preds[labels == 1] > 0.5)) 16 | tn = np.sum(labels[labels == 0] == (preds[labels == 0] > 0.5)) 17 | fp = np.sum(labels[labels == 1] != (preds[labels == 1] > 0.5)) 18 | fn = np.sum(labels[labels == 0] != (preds[labels == 0] > 0.5)) 19 | p = tp / (tp + fp) 20 | r = tp / (tp + fn) 21 | 22 | return 'f1 score', 2 * p * r / (p + r), False 23 | 24 | 25 | def f1_score(preds, labels): 26 | tp = np.sum(labels[labels == 1] == preds[labels == 1]) 27 | tn = np.sum(labels[labels == 0] == preds[labels == 0]) 28 | fp = np.sum(labels[labels == 1] != preds[labels == 1]) 29 | fn = np.sum(labels[labels == 0] != preds[labels == 0]) 30 | p = tp / (tp + fp) 31 | r = tp / (tp + fn) 32 | 33 | return 2 * p * r / (p + r) 34 | 35 | 36 | def load_data(my_features_string): 37 | # path 38 | path_to_data = "data/" 39 | 
40 | # feature tracking utils 41 | my_features_index = [] 42 | my_features_dic = {} 43 | 44 | # load raw data 45 | training = pd.read_csv(path_to_data + "training_features.txt") 46 | testing = pd.read_csv(path_to_data + "testing_features.txt") 47 | 48 | del training["my_index"] 49 | del testing["my_index"] 50 | 51 | # track features and target 52 | target = 0 53 | for i in range(len(training.columns)): 54 | if training.columns[i] == "target": 55 | target = i 56 | 57 | Y_train = training.values[:, target].astype(int) 58 | 59 | del training["target"] 60 | 61 | for i in range(len(training.columns)): 62 | if training.columns[i] in my_features_string: 63 | my_features_dic.update({i: training.columns[i]}) 64 | my_features_index.append(i) 65 | 66 | # separating features and labels 67 | training_val = training.values 68 | testing_val = testing.values 69 | X_train = training_val[:, my_features_index].astype(float) 70 | X_test = testing_val[:, my_features_index] 71 | 72 | del training_val 73 | del testing_val 74 | 75 | print(training.head()) 76 | print(testing.head()) 77 | 78 | return X_train, X_test, Y_train, my_features_index, my_features_dic 79 | 80 | 81 | # plotting feature importances 82 | def plot_importance(rf, features_dict, features_index, name): 83 | # plot settings 84 | sns.set_style("darkgrid") 85 | mpl.rcParams['figure.dpi'] = 200 86 | # mpl.rcParams['figure.tight_layout'] = True 87 | path_to_plot = "models/plots/" 88 | 89 | # fetch mean importances 90 | importances = rf.feature_importances_ 91 | # compute std using each estimator in the forest 92 | std = np.std([tree.feature_importances_ for tree in rf.estimators_], 93 | axis=0) 94 | # argsort the values 95 | index = list(map(int, np.argsort(importances)[::-1])) 96 | # Plot the feature importances of the rf 97 | plt.figure() 98 | # get axis 99 | fig, ax = plt.subplots(figsize=(6, 3)) 100 | # add space for x labels 101 | plt.subplots_adjust(bottom=0.30) 102 | plt.title("Feature importances") 103 | # get number 
class ObjectiveFunction:
    """Wrap an objective so every evaluation is recorded for later analysis.

    The histories are stored negated, which is the convention the tuning
    plots in this package expect.
    """

    def __init__(self, func):
        self.f = func
        self.history_f = []        # every evaluated value, negated
        self.history_fbest = None  # lowest raw value seen so far
        self.history_bests = []    # negated running best, one entry per call

    def __call__(self, x):
        val = self.f(x)
        self.history_f.append(-val)
        # first evaluation always counts as an improvement
        improved = self.history_fbest is None or val < self.history_fbest
        if improved:
            self.history_fbest = val
        self.history_bests.append(-self.history_fbest)
        return val
# Helpers building the x-axis tick labels for the grid-search plots: one
# label per hyper-parameter combination, in the same nested order that
# GridSearchCV enumerates the grid.
def triple_ticker_grid(tuned_parameters, parameter_1, parameter_2, parameter_3):
    """Return str((p1, p2, p3)) labels for every combination, in grid order."""
    return [
        str((a, b, c))
        for a in tuned_parameters[parameter_1]
        for b in tuned_parameters[parameter_2]
        for c in tuned_parameters[parameter_3]
    ]


def double_ticker_grid(tuned_parameters, parameter_1, parameter_2):
    """Return str((p1, p2)) labels for every combination, in grid order."""
    return [
        str((a, b))
        for a in tuned_parameters[parameter_1]
        for b in tuned_parameters[parameter_2]
    ]
accuracy', color='purple') 44 | ax1.plot(x, list(metrics['mean_test_f1']), '-', 45 | label='mean test f1', color='orange') 46 | y_13 = np.arange(0.9, 1.05, 0.05) 47 | x_13 = np.repeat(index, len(y_13)) 48 | 49 | ax1.plot(x_13, y_13, '-.', color='pink', lw=2.0) 50 | plt.ylim([0.95, 1.0]) 51 | ax1.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=7) 52 | ax1.set_ylabel('Test metrics') 53 | plt.subplots_adjust(left=0.40, bottom=0.15) 54 | plt.title(" ".join(param_names) + " Grid Search") 55 | 56 | # Setting the labels for the x-axis (gridsearch combination) 57 | # x_ticks_labels = double_parameter_cross_validation(params, 58 | # 'max_depth', 59 | # 'min_samples_leaf', 60 | # 'n_estimators') 61 | # Set number of ticks for x-axis 62 | ax1.set_xticks([]) 63 | 64 | # Set ticks labels for x-axis 65 | # ax1.set_xticklabels(x_ticks_labels, rotation=70, fontsize=6); 66 | 67 | # For the train set 68 | ax2.plot(x, list(metrics['mean_train_precision']), '--', 69 | label='mean train precision', color='c') 70 | ax2.plot(x, list(metrics['mean_train_recall']), '-.', 71 | label='mean train recall', color='m') 72 | ax2.plot(x, list(metrics['mean_train_roc_auc']), '-o', 73 | label='mean train roc auc', color='y') 74 | ax2.plot(x, list(metrics['mean_train_accuracy']), '-*', 75 | label='mean train accuracy', color='k') 76 | ax2.plot(x, list(metrics['mean_train_f1']), '-', 77 | label='mean train f1', color='orange') 78 | ax2.plot(x_13, y_13, '-.', color='pink', lw=2.0) 79 | ax2.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=6) 80 | plt.ylim([0.95, 1.0]) 81 | if len(param_names) == 2: 82 | x_ticks_labels = double_ticker_grid(params, 83 | param_names[0], 84 | param_names[1]) 85 | if len(param_names) == 3: 86 | x_ticks_labels = triple_ticker_grid(params, 87 | param_names[0], 88 | param_names[1], 89 | param_names[2]) 90 | # Set number of ticks for x-axis 91 | ax2.set_xticks(x) 92 | # Set ticks labels for x-axis 93 | ax2.set_xticklabels(x_ticks_labels, 
rotation=70, fontsize=7, ha='right') 94 | ax2.set_ylabel('Train metrics') 95 | plt.savefig(path_to_plot + name) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /models/tuning/tuning_lgbm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from lightgbm import LGBMClassifier 4 | from sklearn.model_selection import GridSearchCV 5 | 6 | from models.tools import load_data 7 | from models.tuning.tools import plot_grid 8 | 9 | # deactivate deprecation warnings 10 | warnings.simplefilter("ignore", DeprecationWarning) 11 | 12 | n_jobs = 2 13 | 14 | # path 15 | path_to_data = "data/" 16 | path_to_submissions = "submissions/" 17 | path_to_stacking = "stacking/" 18 | path_to_plots = "plots/" 19 | 20 | # used features 21 | 22 | my_features_string = [ 23 | "date_diff", 24 | "overlap_title", 25 | "common_author", 26 | "score_1_2", 27 | "score_2_1", 28 | "cosine_distance", 29 | "journal_similarity", 30 | "overlapping_words_abstract", 31 | "jaccard", 32 | "adar", 33 | "preferential_attachment", 34 | "resource_allocation_index", 35 | "out_neighbors", 36 | "in_neighbors", 37 | "common_neighbors", 38 | # "shortest_path", 39 | "popularity", 40 | "common_successors", 41 | "common_predecessors", 42 | # "paths_of_length_one", 43 | "authors_citation" 44 | "coauthor_score" 45 | # # "katz" 46 | # # "katz_2" 47 | ] 48 | 49 | # load data 50 | 51 | (X_train, 52 | X_test, 53 | Y_train, 54 | my_features_index, 55 | my_features_dic) = load_data(my_features_string) 56 | 57 | # GridSearchCV 58 | 59 | # param grid 60 | 61 | tuned_parameters = { 62 | # 'metric': {}, 63 | 'num_leaves': [150, 200, 250], 64 | "min_data_in_leaf": [2, 4, 6], 65 | "max_depth": [150, 200, 250] 66 | } 67 | 68 | # tuning 69 | gbm = LGBMClassifier( 70 | boosting_type='gbdt', 71 | objective='binary', 72 | # 'metric': {}, 73 | learning_rate=0.1, 74 | feature_fraction=0.4, 75 | bagging_fraction=0.6, 76 | 
# LightGBM classifier with the hyper-parameters that stay fixed during the
# search (the searched ones come from `tuned_parameters`).
gbm = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     learning_rate=0.1,
                     feature_fraction=0.4,
                     bagging_fraction=0.6,
                     bagging_freq=5,
                     silent=True)

# 5-fold grid search scored on several metrics; the refitted estimator is the
# one maximising f1.
metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
grid_lgbm = GridSearchCV(gbm,
                         param_grid=tuned_parameters,
                         scoring=metrics,
                         refit='f1',
                         cv=5,
                         n_jobs=n_jobs)
grid_lgbm.fit(X_train, Y_train, verbose=-1)
print("GridSearch best parameters", grid_lgbm.best_params_)

# Visualise the cross-validated metrics across the whole grid.
best_params = grid_lgbm.best_params_
results = grid_lgbm.cv_results_
index = grid_lgbm.best_index_
plot_grid(metrics=results,
          params=tuned_parameters,
          index=index,
          param_names=list(tuned_parameters),
          name="grid_lgbm")
def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
    """Build a one-hidden-layer binary classifier for KerasClassifier.

    Parameters
    ----------
    neurons : int
        Width of the hidden layer.
    dropout_rate : float
        Dropout applied after the hidden layer.
    activation : str
        Activation of the hidden layer.

    Returns
    -------
    A compiled keras Sequential model with a sigmoid output for binary
    cross-entropy training.
    """
    model = Sequential()
    # Hidden layer; input_dim ties the network to the number of features
    # (module-level nb_input).
    model.add(Dense(neurons, input_dim=nb_input, activation=activation))
    model.add(Dropout(dropout_rate))
    # Output layer.  FIX: dropped the redundant input_dim argument — Keras
    # only honours input_dim on the first layer, so passing it here was
    # ignored and merely misleading.
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
"""Grid-search tuning of a RandomForestClassifier on the link-prediction features."""
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from models.tools import load_data

# output locations
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"
path_to_plots = "plots/"

# hyper-parameters kept fixed during the search
parameters = {
    "criterion": "entropy",  # default = gini
    "bootstrap": True,
    "n_jobs": -1
}

# features fed to the model; commented entries are deliberately disabled
my_features_string = [
    "date_diff",
    "overlap_title",
    "common_author",
    # "score_1_2",
    # "score_2_1",
    "cosine_distance",
    # "journal_similarity",
    # "overlapping_words_abstract",
    # "jaccard",
    # "adar",
    "preferential_attachment",
    # "resource_allocation_index",
    "out_neighbors",
    "in_neighbors",
    "common_neighbors",
    "shortest_path",
    "popularity",
    "common_successors",
    "common_predecessors",
    "paths_of_length_one"
    # "katz"
    # "katz_2"
]

# load the engineered feature matrices
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(my_features_string)

# hyper-parameter grid explored by the search
tuned_parameters = {
    "n_estimators": [150],
    "max_depth": [3, 6, 9, 12, 15, 20],
    "min_samples_leaf": [3, 5, 10, 20]
}

# base estimator built from the fixed parameters
rf = RandomForestClassifier(**parameters)

# 5-fold search over several metrics, refitting on the best f1 configuration
metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
grid_RF = GridSearchCV(rf,
                       param_grid=tuned_parameters,
                       scoring=metrics,
                       refit='f1',
                       cv=5,
                       n_jobs=-1,
                       verbose=10)
grid_RF.fit(X_train, Y_train)
print("GridSearch best parameters", grid_RF.best_params_)
# function to optimize (too costly --> feature selection, subsampling)
def objective_svm(x):
    """Summed 5-fold cross-validated f1 of an RBF SVM at x = [sqrt(C), sqrt(gamma)].

    The two search coordinates are squared so that C and gamma stay positive
    whatever the optimiser proposes.

    NOTE(review): skopt's gp_minimize minimises its objective while a larger
    f1 is better; this function returns a positive sum, so it assumes the
    ObjectiveFunction wrapper flips the sign (the caller prints -res.fun) —
    confirm against objective_function.py.  Also note the folds are summed,
    not averaged.
    """
    c_val = x[0] ** 2
    gamma_val = x[1] ** 2
    classifier = SVC(C=c_val, cache_size=200,
                     class_weight=None,
                     coef0=0.0,
                     decision_function_shape='ovr',
                     degree=3, gamma=gamma_val,
                     kernel='rbf',
                     max_iter=-1,
                     probability=False,
                     random_state=None,
                     shrinking=True,
                     tol=0.001,
                     verbose=False)
    # accumulate f1 over 5 stratified folds
    total = 0
    for fit_idx, val_idx in StratifiedKFold(5).split(X_train, Y_train):
        classifier.fit(X_train[fit_idx], Y_train[fit_idx])
        total += f1_score(Y_train[val_idx], classifier.predict(X_train[val_idx]))
    return total
"""Grid search over dimensionality reduction + SVM on a subsample of the data."""
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.utils import resample

from models.tools import load_data

# paths
path_to_data = "data/"
path_to_plots = "models/tuning/plots/"

# features fed to the model; commented entries are deliberately disabled
my_features_string = [
    "date_diff",
    # "overlap_title",
    "common_author",
    # # "score_1_2",
    # # "score_2_1",
    "cosine_distance",
    # "journal_similarity",
    # # "overlapping_words_abstract",
    # # "jaccard",
    # # "adar",
    "preferential_attachment",
    # # "resource_allocation_index",
    # "out_neighbors",
    "in_neighbors",
    "common_neighbors",
    # "shortest_path",
    # "popularity",
    # "common_successors",
    # "common_predecessors",
    # "paths_of_length_one",
    "authors_citation"
    # "coauthor_score"
    # # "katz"
    # # "katz_2"
]

# load the engineered feature matrices
(X_train,
 X_test,
 Y_train,
 my_features_index,
 my_features_dic) = load_data(X_train_sub_features := my_features_string) if False else load_data(my_features_string)

# the full SVM fit is too costly: tune on a 500-row subsample
X_train_sub, Y_train_sub = resample(X_train, Y_train, n_samples=500, random_state=42)
print(X_train_sub.shape, Y_train_sub.shape)

# pipeline: dimensionality reduction followed by an SVM classifier
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classif', SVC(gamma=0.01))
])

# candidate values
nb_features = [2, 4]
Cs = [0.001, 0.01, 0.1]
kernels = ['linear', 'rbf']

# one sub-grid per reduction strategy (PCA vs mutual-information KBest)
param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': nb_features,
        'classif__C': Cs,
        'classif__kernel': kernels
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_classif)],
        'reduce_dim__k': nb_features,
        'classif__C': Cs,
        'classif__kernel': kernels
    }
]

# 4-fold grid search on the subsample
grid = GridSearchCV(pipe, cv=4, n_jobs=2, param_grid=param_grid, verbose=10)
grid.fit(X_train_sub, Y_train_sub)

# report the winning configuration
print(grid.best_params_)
Par exemple les 10 auteurs les plus cités par les auteurs du texte: à faire 18 | - représentation word2vec des abstracts ? 19 | - représentations sous formes de graphes des textes et essayer d'en extraire des features 20 | 21 | 22 | à faire: 23 | relire les cours pour trouver tout ce qui pourrait nous servir. 24 | 25 | brainstorming data exploration: 26 | - nombre d'auteurs différents: check 27 | - nombre d'apparitions d'un auteur dans la base de donnée: check 28 | - les mots les plus fréquents dans les abstracts: pas encore 29 | - distribution du nombre d'overlapping words chez les textes qui ne se citent pas et chez les textes qui se citent: check 30 | - faire un tf-idf sur la base de donnée entière et voir les résultats de ça: à faire aussi 31 | - combien de journaux différents ? 32 | - combien d'auteurs manquants ? 33 | 34 | subsampler pour l'exploration, les premiers tests ? 35 | 36 | brainstorming recherche d'articles: 37 | - checker dans les cours les articles qui sont cités 38 | - demander à des gens ? 39 | - faire des recherches en ligne 40 | 41 | 42 | 43 | Références à aller checker: 44 | • Christopher D. Manning, Prabhakar Raghavan and Hinrich 45 | Schütze, Introduction to Information Retrieval, Cambridge 46 | University Press. 2008. http://www-nlp.stanford.edu/IR-book/ 47 | • “Indexing by Latent Semantic Analysis”, S.Deerwester, 48 | S.Dumais, T.Landauer, G.Fumas, R.Harshman, Journal of the 49 | Society for Information Science, 1990 50 | • “Mining the Web: Discovering Knowledge from Hypertext 51 | Data”, Soumen Chakrabarti 52 | 53 | 54 | 55 | Résultats: 56 | Random Forest avec les paramètres de base: 57 | Avec 10 estimateurs j'ai peu d'overfitting et un score de 0.85. Avec 30 estimateurs j'ai pas franchement plus d'overfitting mais les résultats ne s'améliorent pas. 58 | Si je rajoute cosine distance avec 30 estimateurs j'ai masse overfitting et les résultats qui baissent. Avec 10 estimateurs ça overfitte encore pas mal. 
Donc faudrait réussir à réduire cet overfitting 59 | Si je rajoute les deux score en plus, on reste sur de l'overfitting de gros porc. 60 | 61 | Light GBM: 62 | pas d'overfitting sur les features de base. Léger overfitting avec la cosine distance mais les résultats sont meilleurs, genre 0.87. Ensuite rajouter les deux scores n'améliore pas vraiment les résultats. 63 | 64 | Avec shortest path on arrive à 94.5/93.8 (train/test). 65 | En LGBM on a des résultats un peu meilleurs et avec moins d'overfitting. 66 | 67 | On passe à 94.1/94.1 si on rajoute cosine distance et l'ajout de cosine distance a un intérêt très limité... 68 | 69 | LGBM avec les basics et shortest path: 92.9/92.8 70 | LGBM avec shortest path et overlapping: 94.2/93.9 71 | RF avec shortest path et overlapping: 94.5/93.7 72 | LGBM avec shortest path et cosine distance: 94.2/93.9 73 | 74 | 75 | Un papier qui fait de la link prediction (coauthorship) 76 | http://www.cs.rpi.edu/~zaki/PaperDir/LINK06.pdf 77 | Un article qui donne des bonnées idées sur la théorie des graphes: 78 | http://be.amazd.com/link-prediction/ 79 | 80 | Une thèse sur les graphes dirigés: 81 | https://www.cs.upc.edu/~dariog/PhD-Thesis-Link-Prediction-DGG.pdf 82 | 83 | 84 | relecture de code le 1er Mars: 85 | _ je change un peu le format du code. Maintenant c'est en mode projet donc tu dois ouvrir tout le bordel sous pycharm et changer les paramètres pour que le working directory ça soit toujours la source du projet 86 | - preprocessing. done. 87 | - pour moi dans author_graph_features il y a un soucis. Le même que ce qu'on avait déjà eu avant, il faudrait supprimer les arrêtes qui existent si target == 1. à corriger ou à jeter... Si tu le corriges je peux le faire tourner sur Compute Engine. 
88 | - baseline: ok 89 | - citation_graph_features: ok 90 | - network_x bigraph: ok 91 | - network_x digraph: ok 92 | - en train de faire le network_x bigraph_long pour calculer katz 93 | 94 | nohup python3 -u task_manager.py > log.txt 2>&1 & 95 | 96 | https://drive.google.com/file/d/1RetpAekytXLNwQLUfJhHxamGHcOd_7j8/view?usp=sharing 97 | 98 | 17663 sur le cloud 99 | 100 | 371 460 dans le dernier tail log.txt à 22H35. 101 | 102 | 103 | -------------------------------------------------------------------------------- /ressources/data_challenge_description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/ressources/data_challenge_description.pdf -------------------------------------------------------------------------------- /results/results: -------------------------------------------------------------------------------- 1 | date: 2018-02-15 00:26:26.153189 2 | features: ['overlap_title', 'date_diff', 'common_author'] 3 | model: Random Forest 4 | parameters: default 5 | cross validation: 6 | 0.7780963908271935 7 | 0.7784294452612852 8 | 0.776469919253952 9 | 0.7771685269126416 10 | 0.7766730028756641 11 | kaggle score: 0.77710 12 | 13 | 14 | date: 2018-02-15 00:26:26.153189 15 | features: ['overlap_title', 'date_diff', 'common_author', "journal_similarity"] 16 | model: Random Forest 17 | parameters: default 18 | cross validation: 19 | 0.7797616629976524 20 | 0.7810288945029772 21 | 0.778419522022388 22 | 0.7788338126106805 23 | 0.779093759646472 24 | kaggle score: 0.77904 25 | 26 | 27 | date: 2018-02-15 00:30:45.924858 28 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract'] 29 | model: Random Forest 30 | parameters: default 31 | cross validation: 32 | 0.8342363711688586 33 | 0.8365596289286208 34 | 0.8348442754788712 35 | 0.8360140371399327 36 | 0.8359328036912479 37 | kaggle 
score: 0.83755 38 | 39 | 40 | date: 2018-02-15 15:09:52.437552 41 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance'] 42 | model: Random Forest 43 | parameters: 44 | {'min_data_in_leaf': 2, 'max_depth': 200, 'boosting_type': 'gbdt', 'objective': 'binary', 'task': 'train', 'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'learning_rate': 0.1, 'bagging_freq': 5, 'num_leaves': 200} 45 | cross validation: 46 | Start training... 47 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 48 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 49 | [40] valid_0's f1 score: 0.870444 50 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 51 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 52 | [80] valid_0's f1 score: 0.871552 53 | train: 0.874026271431 54 | test: 0.871902069297 55 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 56 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 57 | [40] valid_0's f1 score: 0.872282 58 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 59 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 60 | [80] valid_0's f1 score: 0.872716 61 | train: 0.873581441765 62 | test: 0.872562595906 63 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 64 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 65 | [40] valid_0's f1 score: 0.87105 66 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 67 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 68 | [80] valid_0's f1 score: 0.872016 69 | train: 0.873792318185 70 | test: 0.872204161004 71 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 72 | [LightGBM] [Warning] No further splits 
with positive gain, best gain: -inf 73 | [40] valid_0's f1 score: 0.871357 74 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 75 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 76 | [80] valid_0's f1 score: 0.872234 77 | train: 0.873583781681 78 | test: 0.872538922426 79 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 80 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 81 | [40] valid_0's f1 score: 0.869521 82 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 83 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf 84 | [80] valid_0's f1 score: 0.870304 85 | train: 0.874169648139 86 | test: 0.870220013444 87 | kaggle score: 88 | 89 | -------------------------------------------------------------------------------- /sampling/sampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | path_to_data = "~/Documents/polytechnique/3A/nlp/link-prediction/data/" 4 | 5 | divide_by = 100 6 | sample_size_string = str(divide_by) 7 | 8 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"] 9 | nodes = pd.read_csv(path_to_data+"node_information.csv", names=nodes_header) 10 | 11 | names = ["id1", "id2", "target"] 12 | training = pd.read_csv(path_to_data+"training_set.txt", names=names, delimiter=" ") 13 | 14 | sample_1 = training.sample(frac=1.0/divide_by, replace=False) 15 | sample_2 = sample_1.copy() 16 | sample_2.columns = ["id2", "id1", "target"] 17 | 18 | names = ["id1", "id2"] 19 | testing = pd.read_csv(path_to_data+"testing_set.txt", names=names, delimiter=" ") 20 | 21 | sample_1_testing = testing.sample(frac=1.0/divide_by, replace=False) 22 | 23 | sample_2_testing = sample_1_testing.copy() 24 | sample_2_testing.columns = ["id2", "id1"] 25 | 26 | all_ids = pd.concat([sample_1, sample_2, sample_1_testing, sample_2_testing]) 27 | 
"""Stack the base-model CV predictions with a random forest and write a submission."""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"

# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: the comparison used 'svm_linear' (with an underscore), which
    # never matches the 'svmlinear' entry above — the SVM test predictions
    # were therefore scaled by the 5-fold weight 0.2 instead of the intended
    # 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# second-level model fitted on top of the base-model predictions
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    min_samples_leaf=6,
    max_depth=7,
    bootstrap=True,
    n_jobs=-1
)

# cross validated predictions: one test-set prediction column per fold
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
i = 0

for train_index, test_index in kf.split(X_train, Y_train):
    model.fit(X_train[train_index], Y_train[train_index])
    Y_pred = model.predict(X_train[test_index])
    Y_pred_train = model.predict(X_train[train_index])
    predictions[:, i] = model.predict(X_test)
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# majority vote over the k fold predictions (>2.5 of 5 folds predict 1)
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "stack_sub_rf.csv",
    index=True,
    index_label="id",
    header=["category"]
)
# path
path_to_data = "data/"
path_to_submissions = "submissions/"
path_to_stacking = "stacking/"

# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: 'svm_linear' (underscore) never matched the 'svmlinear' entry
    # above, so the SVM test predictions always received the 5-fold weight
    # 0.2 instead of the intended 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# fit a grid searched model on top of the base models
# (the earlier logistic-regression-only param_grid dict was dead code —
# immediately overwritten by the list below — and has been removed)

# pipeline architecture
pipe = Pipeline([
    ('reduce_dim', SelectKBest(chi2)),
    ('classif', LogisticRegression())
])

# candidate parameter values
nb_features = [2, 3, 4, 5]
C = [0.001, 0.01, 0.1]
n_estimators = [100, 200]
max_depth = [10, 20]
min_samples_leaf = [20]
penalty = ["l2", "l1"]

# parameter grid: one sub-grid per candidate classifier
param_grid = [
    {
        'reduce_dim__k': [5],
        'classif': [RandomForestClassifier(bootstrap=True, n_jobs=-1)],
        'classif__n_estimators': n_estimators,
        'classif__max_depth': max_depth,
        'classif__min_samples_leaf': min_samples_leaf,
    },
    {
        'reduce_dim__k': nb_features,
        'classif': [LogisticRegression(n_jobs=-1)],
        'classif__C': C,
        'classif__penalty': penalty
    }
]

# cross validation grid search instance
grid = GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=param_grid, verbose=10)

# fit grid
grid.fit(X_train, Y_train)

# print and keep the best params
print(grid.best_params_)
parameters = grid.best_params_

# model instance for prediction
model = grid.best_estimator_

# cross validated predictions: one test-set prediction column per fold
k = 5
kf = StratifiedKFold(k)
predictions = np.zeros((X_test.shape[0], k))
i = 0

for train_index, test_index in kf.split(X_train, Y_train):
    model.fit(X_train[train_index], Y_train[train_index])
    Y_pred = model.predict(X_train[test_index])
    Y_pred_train = model.predict(X_train[train_index])
    predictions[:, i] = model.predict(X_test)
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

# majority vote over the k fold predictions
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "stack_sub2.csv",
    index=True,
    index_label="id",
    header=["category"]
)
# get labels
names = ["id1", "id2", "target"]
Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
Y_train = Y_train['target'].values
Y_test = Y_test['target'].values

# group model predictions as features: one column per base model
model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
X_train = pd.DataFrame(columns=model_strings)
X_test = pd.DataFrame(columns=model_strings)
for model in model_strings:
    X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
    # take the mean of the test set probs of each cv fold
    # BUG FIX: 'svm_linear' (underscore) never matched the 'svmlinear' entry
    # above, so the SVM test predictions always received the 5-fold weight
    # 0.2 instead of the intended 2-fold weight 0.5.
    if model == 'svmlinear':
        X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
    else:
        X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
print(X_train.head(), X_test.head())
X_train = X_train.values
X_test = X_test.values

# GridSearchCV to fine tune the stacking random forest

# hyper-parameter grid
tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3, 7, 10],
    "min_samples_leaf": [6],
    "criterion": ["entropy"]
}

# fit a GridSearchCV instance and print optimal parameters
rf = RandomForestClassifier(
    bootstrap=True,
    n_jobs=-1
)

metrics = ["f1"]
grid_RF = GridSearchCV(rf,
                       param_grid=tuned_parameters,
                       scoring=metrics,
                       refit='f1',
                       cv=4,
                       n_jobs=-1,
                       verbose=10
                       )
grid_RF.fit(X_train, Y_train)
print("GridSearch best parameters", grid_RF.best_params_)
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_author_graph_features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ImportError", 10 | "evalue": "No module named 'code.feature_engineering'; 'code' is not a package", 11 | "traceback": [ 12 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 13 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 14 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0migraph\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_engineering\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtools\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mlit_eval_nan_proof\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# progress bar for pandas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 15 | "\u001b[0;31mImportError\u001b[0m: No module named 'code.feature_engineering'; 'code' is not a package" 16 | ], 17 | "output_type": "error" 18 | } 19 | ], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "from tqdm import tqdm\n", 24 | "from itertools import permutations\n", 25 | "import igraph\n", 26 | "\n", 27 | "from code.feature_engineering.tools import lit_eval_nan_proof\n", 28 | "\n", 29 | "# progress bar for pandas\n", 30 | "tqdm.pandas(tqdm())\n", 31 | "\n", 32 | "# path\n", 33 | 
"path_to_data = \"../../data/\"\n", 34 | "\n", 35 | "# loading data\n", 36 | "converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,\n", 37 | " 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}\n", 38 | "nodes = pd.read_csv(path_to_data + \"nodes_preprocessed.csv\", converters=converter_dict)\n", 39 | "nodes.set_index(\"id\", inplace=True)\n", 40 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n", 41 | "training.set_index(\"my_index\", inplace=True)\n", 42 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n", 43 | "testing.set_index(\"my_index\", inplace=True)\n", 44 | "\n", 45 | "# create author graph\n", 46 | "# vertices are authors\n", 47 | "# edge of weight 1 if they co-wrote a paper, 2 if they only cite each other\n", 48 | "\n", 49 | "# create empty directed graph\n", 50 | "g = igraph.Graph(directed=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# add vertices\n", 60 | "authors = nodes['authors']\n", 61 | "authors_set = list(set(authors.dropna().sum()))\n", 62 | "g.add_vertices(authors_set)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.5.2" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /tests/test_baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [ 
8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
my_indexid1id2targetoverlap_titledate_diffcommon_authorscore_1_2score_2_1cosine_distancejaccardadarpreferential_attachmentresource_allocation_indexcommon_neighbors
09510123|950211495101239502114120017.84439214.5359350.0757910.0666670.51389855.00.1428570.142857
19707075|960417897070759604178111019.41518424.2968500.0824500.0980394.32036611388.00.2264010.226401
29312155|95061429312155950614200-2015.11603710.0801940.0184020.0000000.0000005.00.0000000.000000
39911255|302165991125530216500-4016.76577020.2959040.0582450.0000000.000000280.00.0000000.000000
49701033|209076970103320907600-5021.45780925.2408190.0690250.0000000.000000168.00.0000000.000000
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " my_index id1 id2 target overlap_title date_diff \\\n", 143 | "0 9510123|9502114 9510123 9502114 1 2 0 \n", 144 | "1 9707075|9604178 9707075 9604178 1 1 1 \n", 145 | "2 9312155|9506142 9312155 9506142 0 0 -2 \n", 146 | "3 9911255|302165 9911255 302165 0 0 -4 \n", 147 | "4 9701033|209076 9701033 209076 0 0 -5 \n", 148 | "\n", 149 | " common_author score_1_2 score_2_1 cosine_distance jaccard adar \\\n", 150 | "0 0 17.844392 14.535935 0.075791 0.066667 0.513898 \n", 151 | "1 0 19.415184 24.296850 0.082450 0.098039 4.320366 \n", 152 | "2 0 15.116037 10.080194 0.018402 0.000000 0.000000 \n", 153 | "3 0 16.765770 20.295904 0.058245 0.000000 0.000000 \n", 154 | "4 0 21.457809 25.240819 0.069025 0.000000 0.000000 \n", 155 | "\n", 156 | " preferential_attachment resource_allocation_index common_neighbors \n", 157 | "0 55.0 0.142857 0.142857 \n", 158 | "1 11388.0 0.226401 0.226401 \n", 159 | "2 5.0 0.000000 0.000000 \n", 160 | "3 280.0 0.000000 0.000000 \n", 161 | "4 168.0 0.000000 0.000000 " 162 | ] 163 | }, 164 | "execution_count": 9, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "import numpy as np\n", 171 | "import pandas as pd\n", 172 | "from tqdm import tqdm\n", 173 | "\n", 174 | "path_to_data = \"../data/\"\n", 175 | "training = pd.read_csv(path_to_data+\"training_features.txt\")\n", 176 | "training.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/html": [ 187 | "
\n", 188 | "\n", 201 | "\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | "
id1id2targetoverlap_titledate_diffcommon_author
count6.155120e+056.155120e+05615512.000000615512.000000615512.000000615512.000000
mean5.317422e+066.798460e+060.5444740.5184161.1566810.079396
std4.749198e+064.343138e+060.4980190.9071133.5216910.372206
min1.001000e+031.001000e+030.0000000.000000-11.0000000.000000
25%1.112660e+052.080790e+050.0000000.0000000.0000000.000000
50%9.310036e+069.505058e+061.0000000.0000001.0000000.000000
75%9.708050e+069.709097e+061.0000001.0000003.0000000.000000
max9.912293e+069.912293e+061.00000010.00000011.0000008.000000
\n", 288 | "
" 289 | ], 290 | "text/plain": [ 291 | " id1 id2 target overlap_title \\\n", 292 | "count 6.155120e+05 6.155120e+05 615512.000000 615512.000000 \n", 293 | "mean 5.317422e+06 6.798460e+06 0.544474 0.518416 \n", 294 | "std 4.749198e+06 4.343138e+06 0.498019 0.907113 \n", 295 | "min 1.001000e+03 1.001000e+03 0.000000 0.000000 \n", 296 | "25% 1.112660e+05 2.080790e+05 0.000000 0.000000 \n", 297 | "50% 9.310036e+06 9.505058e+06 1.000000 0.000000 \n", 298 | "75% 9.708050e+06 9.709097e+06 1.000000 1.000000 \n", 299 | "max 9.912293e+06 9.912293e+06 1.000000 10.000000 \n", 300 | "\n", 301 | " date_diff common_author \n", 302 | "count 615512.000000 615512.000000 \n", 303 | "mean 1.156681 0.079396 \n", 304 | "std 3.521691 0.372206 \n", 305 | "min -11.000000 0.000000 \n", 306 | "25% 0.000000 0.000000 \n", 307 | "50% 1.000000 0.000000 \n", 308 | "75% 3.000000 0.000000 \n", 309 | "max 11.000000 8.000000 " 310 | ] 311 | }, 312 | "execution_count": 7, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "training.describe()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.5.2" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /tests/test_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | "
idyeartitleauthorsjournalabstract
010012000compactification geometry and dualityPaul S. AspinwallNaNthese are notes based on lectures given at tas...
110022000domain walls and massive gauged supergravity p...M. Cvetic, H. Lu, C.N. PopeClass.Quant.Grav.we point out that massive gauged supergravity ...
210032000comment on metric fluctuations in brane worldsY.S. Myung, Gungwon KangNaNrecently ivanov and volovich hep-th 9912242 cl...
310042000moving mirrors and thermodynamic paradoxesAdam D. HelferPhys.Rev.quantum fields responding to moving mirrors ha...
410052000bundles of chiral blocks and boundary conditio...J. Fuchs, C. SchweigertNaNproceedings of lie iii clausthal july 1999 var...
\n", 85 | "
" 86 | ], 87 | "text/plain": [ 88 | " id year title \\\n", 89 | "0 1001 2000 compactification geometry and duality \n", 90 | "1 1002 2000 domain walls and massive gauged supergravity p... \n", 91 | "2 1003 2000 comment on metric fluctuations in brane worlds \n", 92 | "3 1004 2000 moving mirrors and thermodynamic paradoxes \n", 93 | "4 1005 2000 bundles of chiral blocks and boundary conditio... \n", 94 | "\n", 95 | " authors journal \\\n", 96 | "0 Paul S. Aspinwall NaN \n", 97 | "1 M. Cvetic, H. Lu, C.N. Pope Class.Quant.Grav. \n", 98 | "2 Y.S. Myung, Gungwon Kang NaN \n", 99 | "3 Adam D. Helfer Phys.Rev. \n", 100 | "4 J. Fuchs, C. Schweigert NaN \n", 101 | "\n", 102 | " abstract \n", 103 | "0 these are notes based on lectures given at tas... \n", 104 | "1 we point out that massive gauged supergravity ... \n", 105 | "2 recently ivanov and volovich hep-th 9912242 cl... \n", 106 | "3 quantum fields responding to moving mirrors ha... \n", 107 | "4 proceedings of lie iii clausthal july 1999 var... " 108 | ] 109 | }, 110 | "execution_count": 1, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "import numpy as np\n", 117 | "import pandas as pd\n", 118 | "from tqdm import tqdm\n", 119 | "\n", 120 | "path_to_data = \"../../data/\"\n", 121 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n", 122 | "nodes = pd.read_csv(path_to_data+\"node_information.csv\", names=nodes_header)\n", 123 | "nodes.head()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/html": [ 134 | "
\n", 135 | "\n", 148 | "\n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
idyear
count2.777000e+0427770.000000
mean6.096134e+061998.009039
std4.581677e+063.124684
min1.001000e+031992.000000
25%2.041122e+051995.000000
50%9.405182e+061998.000000
75%9.705204e+062001.000000
max9.912293e+062003.000000
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " id year\n", 203 | "count 2.777000e+04 27770.000000\n", 204 | "mean 6.096134e+06 1998.009039\n", 205 | "std 4.581677e+06 3.124684\n", 206 | "min 1.001000e+03 1992.000000\n", 207 | "25% 2.041122e+05 1995.000000\n", 208 | "50% 9.405182e+06 1998.000000\n", 209 | "75% 9.705204e+06 2001.000000\n", 210 | "max 9.912293e+06 2003.000000" 211 | ] 212 | }, 213 | "execution_count": 2, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "nodes.describe()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 5, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/html": [ 230 | "
\n", 231 | "\n", 244 | "\n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
idyeartitleauthorsjournalabstract
0idyeartitleauthorsjournalabstract
110012000['compactif', 'geometri', 'dualiti']['paul s. aspinwall']NaN['note', 'base', 'lectur', 'given', 'tasi99', ...
210022000['domain', 'wall', 'massiv', 'gaug', 'supergra...['m. cvetic', 'h. lu', 'c.n. pope']['class', 'quant', 'grav']['point', 'massiv', 'gaug', 'supergrav', 'pote...
310032000['comment', 'metric', 'fluctuat', 'brane', 'wo...['y.s. myung', 'gungwon kang']NaN['recent', 'ivanov', 'volovich', 'hep-th', '99...
410042000['move', 'mirror', 'thermodynam', 'paradox']['adam d. helfer']['phys', 'rev']['quantum', 'field', 'respond', 'move', 'mirro...
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " id year title \\\n", 308 | "0 id year title \n", 309 | "1 1001 2000 ['compactif', 'geometri', 'dualiti'] \n", 310 | "2 1002 2000 ['domain', 'wall', 'massiv', 'gaug', 'supergra... \n", 311 | "3 1003 2000 ['comment', 'metric', 'fluctuat', 'brane', 'wo... \n", 312 | "4 1004 2000 ['move', 'mirror', 'thermodynam', 'paradox'] \n", 313 | "\n", 314 | " authors journal \\\n", 315 | "0 authors journal \n", 316 | "1 ['paul s. aspinwall'] NaN \n", 317 | "2 ['m. cvetic', 'h. lu', 'c.n. pope'] ['class', 'quant', 'grav'] \n", 318 | "3 ['y.s. myung', 'gungwon kang'] NaN \n", 319 | "4 ['adam d. helfer'] ['phys', 'rev'] \n", 320 | "\n", 321 | " abstract \n", 322 | "0 abstract \n", 323 | "1 ['note', 'base', 'lectur', 'given', 'tasi99', ... \n", 324 | "2 ['point', 'massiv', 'gaug', 'supergrav', 'pote... \n", 325 | "3 ['recent', 'ivanov', 'volovich', 'hep-th', '99... \n", 326 | "4 ['quantum', 'field', 'respond', 'move', 'mirro... " 327 | ] 328 | }, 329 | "execution_count": 5, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n", 336 | "nodes_preprocessed = pd.read_csv(path_to_data+\"nodes_preprocessed.csv\", names=nodes_header)\n", 337 | "nodes_preprocessed.head()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 6, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/html": [ 348 | "
\n", 349 | "\n", 362 | "\n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | "
idyeartitleauthorsjournalabstract
count277712777127771237382029927771
unique2777113274961583428027765
top97020942002['black', 'hole', 'entropi'][\"shin'ichi nojiri\", 'sergei d. odintsov']['phys', 'lett']['comment', 'start', 'paper', 'hep-th', '01060...
freq1333573835753
\n", 413 | "
" 414 | ], 415 | "text/plain": [ 416 | " id year title \\\n", 417 | "count 27771 27771 27771 \n", 418 | "unique 27771 13 27496 \n", 419 | "top 9702094 2002 ['black', 'hole', 'entropi'] \n", 420 | "freq 1 3335 7 \n", 421 | "\n", 422 | " authors journal \\\n", 423 | "count 23738 20299 \n", 424 | "unique 15834 280 \n", 425 | "top [\"shin'ichi nojiri\", 'sergei d. odintsov'] ['phys', 'lett'] \n", 426 | "freq 38 3575 \n", 427 | "\n", 428 | " abstract \n", 429 | "count 27771 \n", 430 | "unique 27765 \n", 431 | "top ['comment', 'start', 'paper', 'hep-th', '01060... \n", 432 | "freq 3 " 433 | ] 434 | }, 435 | "execution_count": 6, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "nodes_preprocessed.describe()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [] 450 | } 451 | ], 452 | "metadata": { 453 | "kernelspec": { 454 | "display_name": "Python 3", 455 | "language": "python", 456 | "name": "python3" 457 | }, 458 | "language_info": { 459 | "codemirror_mode": { 460 | "name": "ipython", 461 | "version": 3 462 | }, 463 | "file_extension": ".py", 464 | "mimetype": "text/x-python", 465 | "name": "python", 466 | "nbconvert_exporter": "python", 467 | "pygments_lexer": "ipython3", 468 | "version": "3.5.2" 469 | } 470 | }, 471 | "nbformat": 4, 472 | "nbformat_minor": 2 473 | } 474 | --------------------------------------------------------------------------------