├── .gitignore
├── .idea
│   └── vcs.xml
├── LICENSE
├── README.md
├── __init__.py
├── cloud_setup.py
├── feature_engineering
│   ├── __init__.py
│   ├── author_graph_features.py
│   ├── authors.py
│   ├── authors_2.py
│   ├── baseline_feature_engineering.py
│   ├── basic_features.py
│   ├── citation_graph_features.py
│   ├── cosine_distance.py
│   ├── networkx_bigraph.py
│   ├── networkx_bigraph_long.py
│   ├── networkx_bigraph_long2.py
│   ├── networkx_digraph.py
│   ├── preprocessing.py
│   └── tools.py
├── link-prediction-report.pdf
├── main.py
├── models
│   ├── __init__.py
│   ├── camboui.ipynb
│   ├── feature_selection.ipynb
│   ├── feature_selection.py
│   ├── lgbm.py
│   ├── logistic_regression.py
│   ├── nn.py
│   ├── nn_deep.py
│   ├── plots
│   │   ├── rf_importance.png
│   │   └── rf_importance_full.png
│   ├── random_forest.py
│   ├── svm.py
│   ├── tools.py
│   └── tuning
│       ├── __init__.py
│       ├── console_nn_grid_search_example.txt
│       ├── objective_function.py
│       ├── plots
│       │   └── grid_lgbm.png
│       ├── tools.py
│       ├── tuning_lgbm.py
│       ├── tuning_nn.py
│       ├── tuning_random_forest.py
│       ├── tuning_svm.py
│       └── tuning_svm_feat_selec.py
├── notes
├── ressources
│   └── data_challenge_description.pdf
├── results
│   └── results
├── sampling
│   ├── sampling.ipynb
│   └── sampling.py
├── stacking
│   ├── __init__.py
│   ├── stacking.py
│   ├── stacking_tuning.py
│   └── stacking_tuning_micro.py
├── task_manager.py
└── tests
    ├── __init__.py
    ├── test_author_graph_features.ipynb
    ├── test_baseline.ipynb
    └── test_preprocessing.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 | /data/
103 | /submissions/
104 | /calculated_features/
105 | .idea/
106 | code/data_exploration/camboui.ipynb
107 | /code/data_exploration/camboui.ipynb
108 | /code/data_exploration/data_exploration.ipynb
109 | /code/feature_engineering/camboui.ipynb
110 | /code/models/camboui.ipynb
111 | /code/feature_engineering/camboui.py
112 | /code/feature_engineering/camboui_network_x.ipynb
113 | /tests/multiprocessing_tuto.py
114 | /data_exploration/data_exploration.ipynb
115 | /models/feature_selection_2.ipynb
116 | /results/log.txt
117 | /log.txt
118 | /bigraph_from_root.py
119 | /illustrate_report.ipynb
120 | /camboui.py
121 | *.csv
122 | results/*
123 | /storage.py
124 | Untitled.ipynb
125 | Untitled1.ipynb
126 | requirements.txt
127 |
128 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 raph-m
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # link-prediction
2 |
3 | Predict links in a citation network.
4 | You can find the project instructions in the ressources folder.
5 | Our project report is available in link-prediction-report.pdf.
6 |
7 | ## Feature Engineering
8 |
9 | In the feature engineering folder you can find scripts to compute new features from the dataset. The features computed are described at the beginning of the scripts, and you can find more information in our project report.
10 |
11 | ## Feature Selection
12 |
13 | Running the feature_selection.py script will print the results of a forward selection algorithm. From these results we chose the set of features used for the rest of the project.
14 |
15 | ## Models
16 |
17 | This folder contains several implementations of models fitted to our data. Running the scripts will print the results and create a submission file.
18 |
19 | ## Tuning
20 |
21 | Running the tuning scripts will output the best parameters resulting from a cross-validated grid search on a hand-picked parameter grid.
22 |
23 | ## Main
24 |
25 | The main.py script runs everything needed (feature engineering and machine learning) to create our final submission. The SVM fit might take a substantial amount of time.
26 | You may use the generated "stack_sub_rf.csv" as a reproduction of our best submission. In case of reproducibility issues (runtimes and the like), we left our original submission under the name "stack_sub_rf_reference.csv".
27 |
28 |
--------------------------------------------------------------------------------
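
The Tuning and Feature Selection sections of the README above rely on a cross-validated grid search over hand-picked parameter grids. Below is a minimal, self-contained sketch of that pattern with scikit-learn; the estimator, grid values, toy data and F1 scoring are illustrative assumptions, not the exact settings used by the tuning scripts.

# illustrative sketch only -- not part of the repository files
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# toy data standing in for the engineered link-prediction features
X, y = make_classification(n_samples=500, n_features=10, random_state=0)

# hand-picked parameter grid, searched with cross-validation
param_grid = {"n_estimators": [100, 300], "max_depth": [10, None]}
search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=5, scoring="f1")
search.fit(X, y)
print(search.best_params_, search.best_score_)
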
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/__init__.py
--------------------------------------------------------------------------------
/cloud_setup.py:
--------------------------------------------------------------------------------
1 | # how to set up the environment for cloud computing (install python tools and libraries, download database from
2 | # google drive public link and run python file)
3 |
4 | """
5 | sudo apt update
6 | sudo apt install python python-dev python3 python3-dev
7 | sudo apt-get install python3-setuptools
8 | wget https://bootstrap.pypa.io/get-pip.py
9 | sudo python get-pip.py
10 | sudo pip install --upgrade virtualenv
11 | sudo pip install virtualenvwrapper
12 | echo "export WORKON_HOME=$HOME/.virtualenvs" >> .bashrc
13 | echo "export PROJECT_HOME=$HOME/Devel" >> .bashrc
14 | echo "source /usr/local/bin/virtualenvwrapper.sh" >> .bashrc
15 | echo "source "/usr/bin/virtualenvwrapper.sh"" >> .bashrc
16 | echo "export WORKON_HOME="/opt/virtual_env/"" >> .bashrc
17 | source `which virtualenvwrapper.sh`
18 | mkvirtualenv -p /usr/bin/python3.5 ml1
19 | sudo pip install pandas
20 | sudo pip install requests
21 | sudo pip install dotenv
22 | sudo pip install
23 | git clone https://github.com/raph-m/safe_driver_prediction
24 | cd safe_driver_prediction/proj2
25 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip
26 | cd ..
27 | cd ..
28 | sudo apt-get install unzip
29 | unzip data.zip
30 | cd safe_driver_prediction
31 | git pull origin master
32 | echo "ENV_NAME=vm" > .env
33 | python proj2/feature_engineering.py train ../../churn/ 3000000
34 | """
35 |
36 | # a version that works (without virtualenv):
37 | """
38 | sudo apt update
39 | sudo apt install python python-dev python3 python3-dev
40 | sudo apt-get install python3-setuptools
41 | wget https://bootstrap.pypa.io/get-pip.py
42 | sudo python get-pip.py
43 | alias python=python3
44 | sudo apt-get install python3-setuptools
45 | sudo easy_install3 pip
46 | sudo pip3 install pandas
47 | sudo pip3 install requests
48 | sudo pip3 install dotenv
49 | git clone https://github.com/raph-m/safe_driver_prediction
50 | cd safe_driver_prediction/proj2
51 | python gdrive.py 1EQ0zE_2WLQdNIepWUjroPyGmi-dvN5KK ../../data.zip
52 | cd ..
53 | cd ..
54 | sudo apt-get install unzip
55 | unzip data.zip
56 | cd safe_driver_prediction
57 | echo "ENV_NAME=vm" > .env
58 | cd proj2
59 | python feature_engineering.py
60 | """
61 |
62 | # another way to do it is with `alias python=python3`
63 |
64 | # to automate these commands, you would put them in this bashCommand string and run a script like this:
65 | # bashCommand = "cwm --rdf test.rdf --ntriples > test.nt"
66 | # process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
67 | # output, error = process.communicate()
68 |
69 | """
70 | git clone https://github.com/raph-m/link-prediction
71 | cd link-prediction/
72 | # get an API token from kaggle (kaggle.json)
73 | sudo pip install kaggle
74 | mv kaggle.json .kaggle/
75 | mkdir data
76 | cd data
77 | kaggle competitions download -c link-prediction-challenge-tm-and-nlp
78 | sudo pip install nltk
79 | sudo pip install tqdm
80 |
81 | """
82 |
83 | import requests
84 |
85 |
86 | # python script to download a file from a google drive public link
87 |
88 |
89 | def download_file_from_google_drive(id, destination):
90 | def get_confirm_token(response):
91 | for key, value in response.cookies.items():
92 | if key.startswith('download_warning'):
93 | return value
94 |
95 | return None
96 |
97 | def save_response_content(response, destination):
98 | CHUNK_SIZE = 32768
99 |
100 | with open(destination, "wb") as f:
101 | for chunk in response.iter_content(CHUNK_SIZE):
102 | if chunk: # filter out keep-alive new chunks
103 | f.write(chunk)
104 |
105 | URL = "https://docs.google.com/uc?export=download"
106 |
107 | session = requests.Session()
108 |
109 | response = session.get(URL, params={'id': id}, stream=True)
110 | token = get_confirm_token(response)
111 |
112 | if token:
113 | params = {'id': id, 'confirm': token}
114 | response = session.get(URL, params=params, stream=True)
115 |
116 | save_response_content(response, destination)
117 |
118 |
119 | if __name__ == "__main__":
120 | import sys
121 |
122 |     if len(sys.argv) != 3:
123 | print("Usage: python google_drive.py drive_file_id destination_file_path")
124 | else:
125 | # TAKE ID FROM SHAREABLE LINK
126 | file_id = sys.argv[1]
127 | # DESTINATION FILE ON YOUR DISK
128 | destination = sys.argv[2]
129 | download_file_from_google_drive(file_id, destination)
130 |
--------------------------------------------------------------------------------
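
The comments in cloud_setup.py above suggest automating the listed shell commands from Python via subprocess. A minimal sketch of that idea follows; the command list is a placeholder, and shell=True is used so that redirections such as "> test.nt" would also work.

# illustrative sketch only -- not part of the repository files
import subprocess

# placeholder commands; in practice these would be the setup commands quoted above
commands = [
    "sudo apt-get install -y unzip",
    "unzip data.zip",
]

for cmd in commands:
    # shell=True lets pipes and redirections inside the command strings work
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print(cmd, "-> exit code", result.returncode)
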
/feature_engineering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/feature_engineering/__init__.py
--------------------------------------------------------------------------------
/feature_engineering/author_graph_features.py:
--------------------------------------------------------------------------------
1 | import time
2 | from itertools import permutations, product
3 |
4 | import igraph
5 | import numpy as np
6 | import pandas as pd
7 | from tqdm import tqdm
8 |
9 | from feature_engineering.tools import lit_eval_nan_proof
10 |
11 | # progress bar for pandas
12 | tqdm.pandas(tqdm())
13 |
14 | # path
15 | path_to_data = "data/"
16 |
17 | # loading data
18 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
19 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
20 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
21 | nodes.set_index("id", inplace=True)
22 | training = pd.read_csv(path_to_data + "training_features.txt")
23 | training.set_index("my_index", inplace=True)
24 | testing = pd.read_csv(path_to_data + "testing_features.txt")
25 | testing.set_index("my_index", inplace=True)
26 |
27 | # create author graph
28 | # vertices are authors
29 | # edge weights: +1 each time an author of one paper cites an author of the other, +2 for each co-authored paper
30 |
31 | # create empty directed graph
32 | g = igraph.Graph(directed=True)
33 |
34 | # add vertices
35 | authors = nodes['authors']
36 | authors_set = list(set(authors.dropna().sum()))
37 | g.add_vertices(authors_set)
38 |
39 | # first, add citation edges
40 | edges = {}
41 | # store edge ids for each edge
42 | ids = {}
43 | # store weights
44 | weights = {}
45 | id1 = training['id1'].values
46 | id2 = training['id2'].values
47 | index_train = training.index
48 | target = training["target"].values
49 | # edge id
50 | id = 0
51 | # store all the edges related to each citation
52 | eid = {}
53 | for i in tqdm(range(len(id1))):
54 |     # if there is a citation (positive example)
55 | if target[i] == 1:
56 | authors1 = nodes.at[id1[i], 'authors']
57 | authors2 = nodes.at[id2[i], 'authors']
58 | # check that author information is not missing
59 | if isinstance(authors1, float) or isinstance(authors2, float):
60 | continue
61 | # if authors available then add edges
62 | pairs = list(product(authors1, authors2))
63 | # for each pair of authors
64 | for pair in pairs:
65 | # if edge already exists
66 | if pair in edges:
67 | # increment weight
68 | weights[pair] += 1
69 | # add id to edges related to this citation
70 | if index_train[i] in eid:
71 | eid[index_train[i]] += [id]
72 | else:
73 | eid[index_train[i]] = [id]
74 | # if doesn't exist
75 | else:
76 | # create edge
77 | edges[pair] = 1
78 | # keep track of edge id
79 | ids[pair] = id
80 | # add id to edges related to this citation
81 | if index_train[i] in eid:
82 | eid[index_train[i]] += [id]
83 | else:
84 | eid[index_train[i]] = [id]
85 | # store weight
86 | weights[pair] = 1
87 | # increment id
88 | id += 1
89 |
90 | # then, add coauthor edges
91 | authors_array = authors.values
92 | index_nodes = nodes.index.values
93 | # for each document
94 | for i in tqdm(range(len(authors_array))):
95 | # if missing author info, skip
96 | if isinstance(authors_array[i], float):
97 | continue
98 |     # otherwise, for each pair of coauthors
99 | coauthors = permutations(authors_array[i], 2)
100 | for pair in coauthors:
101 | # if edge already exists
102 | if pair in edges:
103 | # increment weight
104 | weights[pair] += 2
105 | # if doesn't exist
106 | else:
107 | # create edge
108 | edges[pair] = 1
109 | # store weight
110 | weights[pair] = 2
111 |
112 | # add edges to graph
113 | g.add_edges(list(edges))
114 |
115 | # add weights
116 | weights = list(weights.values())  # use the accumulated weights (not the edge-marker dict, whose values are all 1)
117 | max_weight = max(weights)
118 | weights = max_weight - np.array(weights) + 1
119 | g.es['weight'] = list(weights)
120 |
121 | # compute features such as shortest path
122 |
123 | # features placeholders
124 | min_shortest_path = []
125 | max_shortest_path = []
126 | mean_shortest_path = []
127 | author_in_degree_mean_target = []
128 | author_in_degree_max_target = []
129 | author_out_degree_mean_source = []
130 | author_out_degree_max_source = []
131 | author_common_neigbors_mean = []
132 | author_common_neigbors_max = []
133 | author_jaccard_mean = []
134 | author_jaccard_max = []
135 |
136 | # get training ids
137 | id1 = training['id1'].values
138 | id2 = training['id2'].values
139 | target = training["target"].values
140 | index_train = training.index
141 |
142 | # compute features for all samples
143 | for i in tqdm(range(len(id1))):
144 | authors1 = nodes.at[id1[i], 'authors']
145 | authors2 = nodes.at[id2[i], 'authors']
146 | # if one of the articles has missing author info
147 | if isinstance(authors1, float) or isinstance(authors2, float):
148 | # print("NAN")
149 | # no shortest path can be computed
150 | min_shortest_path.append(np.nan)
151 | max_shortest_path.append(np.nan)
152 | mean_shortest_path.append(np.nan)
153 | # if author info is missing for first doc
154 | if isinstance(authors1, float):
155 | # no degree can be computed
156 | author_out_degree_max_source.append(np.nan)
157 | author_out_degree_mean_source.append(np.nan)
158 | # if not missing
159 | else:
160 | # compute degrees
161 | out = g.strength(authors1, weights="weight")
162 | mean_out = np.mean(out)
163 | max_out = np.max(out)
164 | author_out_degree_max_source.append(max_out)
165 | author_out_degree_mean_source.append(mean_out)
166 | # if it is missing for the second document
167 | if isinstance(authors2, float):
168 | # no degree can be computed
169 | author_in_degree_max_target.append(np.nan)
170 | author_in_degree_mean_target.append(np.nan)
171 | # if not
172 | else:
173 | # compute degrees for other document
174 | in_ = g.strength(authors2, weights="weight")
175 | mean_in = np.mean(in_)
176 | max_in = np.max(in_)
177 | author_in_degree_max_target.append(max_in)
178 | author_in_degree_mean_target.append(mean_in)
179 | continue
180 | # print("NO NAN")
181 | # if there's no missing author information
182 | # set weights of unwanted edges to zero
183 | if target[i] == 1:
184 | # print('target is 1')
185 | t0 = time.time()
186 | # print('fetching edge ids')
187 | eids_to_unweigh = eid[index_train[i]]
188 | t1 = time.time()
189 | for id in eids_to_unweigh:
190 | g.es['weight'][id] += 1
191 | t1_bis = time.time()
192 | print('bottleneck', t1 - t0, t1_bis - t1)
193 | # compute shortest paths
194 | # print("computing shortest path")
195 | t1 = time.time()
196 | # paths = g.shortest_paths_dijkstra(source=authors1, target=authors2,
197 | # mode="OUT", weights="weight")[0][0]
198 | # min_value = np.min(paths)
199 | # max_value = np.max(paths)
200 | # mean_value = np.mean(paths)
201 | t2 = time.time()
202 | print('shortest_path', t2 - t1)
203 | # compute degrees
204 | out = g.strength(authors1, weights="weight")
205 | in_ = g.strength(authors2, weights="weight")
206 | mean_out = np.mean(out)
207 | max_out = np.max(out)
208 | in_ = g.strength(authors2, weights="weight")
209 | mean_in = np.mean(in_)
210 | max_in = np.max(in_)
211 | t3 = time.time()
212 | print('weighted degree', t3 - t2)
213 | # create set of pairs as vertex ids as well as index values
214 | pairs = list(product(authors1, authors2))
215 | pairs_index = list(product(range(len(authors1)), range(len(authors2))))
216 | # compute jaccard similarity
217 | # jaccards = g.similarity_jaccard(pairs=pairs)
218 | # max_jacc = np.max(jaccards)
219 | # mean_jacc = np.mean(jaccards)
220 | t4 = time.time()
221 | # print('jacc', t4 - t3)
222 | # compute common neighbours
223 | hoods1 = g.neighborhood(vertices=authors1)
224 | hoods2 = g.neighborhood(vertices=authors2)
225 | common_hoods = [set(hoods1[i]).intersection(set(hoods2[j])) for (i, j) in pairs_index]
226 | common_hoods_size = list(map(len, common_hoods))
227 | max_hood = np.max(common_hoods_size)
228 | mean_hood = np.mean(common_hoods_size)
229 | t5 = time.time()
230 | print('common hoods', t5 - t4)
231 | # append features to corresponding set
232 | # min_shortest_path.append(min_value)
233 | # max_shortest_path.append(max_value)
234 | # mean_shortest_path.append(mean_value)
235 | author_out_degree_max_source.append(max_out)
236 | author_out_degree_mean_source.append(mean_out)
237 | author_in_degree_max_target.append(max_in)
238 | author_in_degree_mean_target.append(mean_in)
239 | author_common_neigbors_mean.append(mean_hood)
240 | author_common_neigbors_max.append(max_hood)
241 | # author_jaccard_mean.append(max_jacc)
242 | # author_jaccard_max.append(mean_jacc)
243 | if target[i] == 1:
244 | for id in eids_to_unweigh:
245 | g.es['weight'][id] = 0
246 | t6 = time.time()
247 | # print("append features", t6 - t5)
248 |
249 | # add feature to dataframe
250 | # training["author_min_shortest_path"] = min_shortest_path
251 | # training["author_max_shortest_path"] = max_shortest_path
252 | # training["author_sum_shortest_path"] = sum_shortest_path
253 | # training["author_mean_shortest_path"] = mean_shortest_path
254 | training['author_out_degree_max_source'] = author_out_degree_max_source
255 | training['author_out_degree_mean_source'] = author_out_degree_mean_source
256 | training['author_in_degree_max_target'] = author_in_degree_max_target
257 | training['author_in_degree_mean_target'] = author_in_degree_mean_target
258 | training['author_common_neigbors_mean'] = author_common_neigbors_mean
259 | training['author_common_neigbors_max'] = author_common_neigbors_max
260 | # training['author_jaccard_mean'] = author_jaccard_mean
261 | # training['author_jaccard_max'] = author_jaccard_max
262 |
263 | # repeat process for test set
264 | min_shortest_path_test = []
265 | max_shortest_path_test = []
266 | sum_shortest_path_test = []
267 | mean_shortest_path_test = []
268 | author_in_degree_mean_target_test = []
269 | author_in_degree_sum_target_test = []
270 | author_out_degree_mean_source_test = []
271 | author_out_degree_sum_source_test = []
272 | id1 = testing['id1'].values
273 | id2 = testing['id2'].values
274 | for i in tqdm(range(len(id1))):
275 | authors1 = nodes.at[id1[i], 'authors']
276 | authors2 = nodes.at[id2[i], 'authors']
277 | if isinstance(authors1, float) or isinstance(authors2, float):
278 | min_shortest_path_test.append(np.nan)
279 | max_shortest_path_test.append(np.nan)
280 | sum_shortest_path_test.append(np.nan)
281 | mean_shortest_path_test.append(np.nan)
282 | if isinstance(authors1, float):
283 | author_out_degree_sum_source_test.append(np.nan)
284 | author_out_degree_mean_source_test.append(np.nan)
285 | else:
286 | sum_out = 0
287 | n_source = len(authors1)
288 | for author1 in authors1:
289 | sum_out += g.strength(author1, mode='OUT', weights="weight")
290 | mean_out = sum_out / n_source
291 | author_out_degree_sum_source_test.append(sum_out)
292 | author_out_degree_mean_source_test.append(mean_out)
293 | if isinstance(authors2, float):
294 | author_in_degree_sum_target_test.append(np.nan)
295 | author_in_degree_mean_target_test.append(np.nan)
296 | else:
297 | sum_in = 0
298 | n_target = len(authors2)
299 | for author2 in authors2:
300 | sum_in += g.strength(author2, mode='IN', weights="weight")
301 | mean_in = sum_in / n_target
302 | author_in_degree_sum_target_test.append(sum_in)
303 | author_in_degree_mean_target_test.append(mean_in)
304 | continue
305 | min_value = float('inf')
306 | max_value = - float('inf')
307 | sum_value = 0
308 | n = len(authors1) * len(authors2)
309 | for author1 in authors1:
310 | for author2 in authors2:
311 | current = g.shortest_paths_dijkstra(source=author1, target=author2,
312 | mode="OUT", weights=g.es["weight"])[0][0]
313 | min_value = current if current < min_value else min_value
314 | max_value = current if current > max_value else max_value
315 | sum_value += current
316 | mean_value = sum_value / n
317 | sum_out = 0
318 | sum_in = 0
319 | n_source = len(authors1)
320 | n_target = len(authors2)
321 | for author1 in authors1:
322 | sum_out += g.strength(author1, mode='OUT', weights="weight")
323 | for author2 in authors2:
324 | sum_in += g.strength(author2, mode='IN', weights="weight")
325 | mean_out = sum_out / n_source
326 | mean_in = sum_in / n_target
327 | min_shortest_path_test.append(min_value)
328 | max_shortest_path_test.append(max_value)
329 | sum_shortest_path_test.append(sum_value)
330 | mean_shortest_path_test.append(mean_value)
331 | author_out_degree_sum_source_test.append(sum_out)
332 | author_out_degree_mean_source_test.append(mean_out)
333 | author_in_degree_sum_target_test.append(sum_in)
334 | author_in_degree_mean_target_test.append(mean_in)
335 |
336 | # add feature to dataframe
337 | testing["author_min_shortest_path"] = min_shortest_path_test
338 | testing["author_max_shortest_path"] = max_shortest_path_test
339 | testing["author_sum_shortest_path"] = sum_shortest_path_test
340 | testing["author_mean_shortest_path"] = mean_shortest_path_test
341 | testing['author_out_degree_sum_source'] = author_out_degree_sum_source_test
342 | testing['author_out_degree_mean_source'] = author_out_degree_mean_source_test
343 | testing['author_in_degree_sum_target'] = author_in_degree_sum_target_test
344 | testing['author_in_degree_mean_target'] = author_in_degree_mean_target_test
345 |
346 | # save data sets
347 | training.to_csv(path_to_data + "training_features.txt")
348 | testing.to_csv(path_to_data + "testing_features.txt")
349 |
--------------------------------------------------------------------------------
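
The script above inverts the accumulated interaction counts so that strongly connected author pairs end up close under weighted shortest paths. A toy igraph sketch of that weighting idea, with a made-up graph and counts:

# illustrative sketch only -- not part of the repository files
import igraph

g = igraph.Graph(directed=True)
g.add_vertices(["a", "b", "c"])
g.add_edges([("a", "b"), ("b", "c"), ("a", "c")])

interaction = [5, 5, 1]                                  # e.g. citation / co-authorship counts per edge
max_w = max(interaction)
g.es["weight"] = [max_w - w + 1 for w in interaction]    # invert: 5 -> 1, 1 -> 5

# weighted out-degree of "a" and weighted shortest path a -> c (goes through "b": 1 + 1 < 5)
print(g.strength("a", mode="OUT", weights="weight"))
print(g.shortest_paths_dijkstra(source="a", target="c", mode="OUT", weights="weight"))
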
/feature_engineering/authors.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import numpy as np
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 | from feature_engineering.tools import lit_eval_nan_proof
7 |
8 | # this script computes the features authors_citation and coauthor_score by considering the graph of coauthorship and
9 | # the author's graph of citations.
10 | # the script takes approximately 5 minutes to run
11 |
12 | # progress bar for pandas
13 | tqdm.pandas(tqdm())
14 |
15 | # path
16 | path_to_data = "data/"
17 |
18 | # loading data
19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
22 | nodes.set_index("id", inplace=True)
23 | training = pd.read_csv(path_to_data + "training_features.txt")
24 | training.set_index("my_index", inplace=True)
25 | testing = pd.read_csv(path_to_data + "testing_features.txt")
26 | testing.set_index("my_index", inplace=True)
27 |
28 | # loading data
29 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
30 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
31 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv",
32 | converters=converter_dict)
33 | nodes.set_index("id", inplace=True)
34 |
35 | G = nx.DiGraph()
36 | coauthors = nx.Graph()
37 |
38 | print("building coauthor graph")
39 | nodes_id = nodes.index.values
40 | for i in tqdm(range(len(nodes_id))):
41 |
42 | authors = nodes.loc[nodes_id[i]]["authors"]
43 | if authors is np.nan:
44 | authors = []
45 |
46 | authors = np.unique([a for a in authors if a != ""])
47 |
48 | for a in authors:
49 | G.add_node(a)
50 | coauthors.add_node(a)
51 |
52 | for a1 in authors:
53 | for a2 in authors:
54 | if a1 != a2:
55 | if coauthors.has_edge(a1, a2):
56 | coauthors[a1][a2]["weight"] += 1
57 | else:
58 | coauthors.add_edge(a1, a2, weight=1)
59 |
60 | id1 = training["id1"].values
61 | id2 = training["id2"].values
62 |
63 | print("building citation graph")
64 | for i in tqdm(range(len(id1))):
65 | current_authors_1 = nodes.loc[id1[i]]["authors"]
66 | current_authors_2 = nodes.loc[id2[i]]["authors"]
67 |
68 | if current_authors_1 is np.nan:
69 | current_authors_1 = []
70 |
71 | if current_authors_2 is np.nan:
72 | current_authors_2 = []
73 |
74 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
75 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
76 |
77 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
78 | for a1 in current_authors_1:
79 | for a2 in current_authors_2:
80 | if G.has_edge(a1, a2):
81 | G[a1][a2]["weight"] += 1
82 | else:
83 | G.add_edge(a1, a2, weight=1)
84 |
85 | coauthor_score = np.zeros(len(id1))
86 | normalized_coauthor_score = np.zeros(len(id1))
87 | best_coauthor_score = np.zeros(len(id1))
88 | authors_citation = np.zeros(len(id1))
89 | normalized_authors_citation = np.zeros(len(id1))
90 | best_authors_citation = np.zeros(len(id1))
91 |
92 | print("building features for training")
93 | for i in tqdm(range(len(id1))):
94 | current_authors_1 = nodes.loc[id1[i]]["authors"]
95 | current_authors_2 = nodes.loc[id2[i]]["authors"]
96 |
97 | if current_authors_1 is np.nan:
98 | current_authors_1 = []
99 |
100 | if current_authors_2 is np.nan:
101 | current_authors_2 = []
102 |
103 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
104 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
105 |
106 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
107 | for a1 in current_authors_1:
108 | for a2 in current_authors_2:
109 | G[a1][a2]["weight"] -= 1
110 |
111 | best = 0
112 | for a1 in current_authors_1:
113 | for a2 in current_authors_2:
114 | if G.has_edge(a1, a2):
115 | current = G[a1][a2]["weight"]
116 | authors_citation[i] += current
117 | if current > best:
118 | best = current
119 |
120 | best_authors_citation[i] = best
121 |
122 | best = 0
123 | for a1 in current_authors_1:
124 | for a2 in current_authors_2:
125 | if coauthors.has_edge(a1, a2):
126 | current = coauthors[a1][a2]["weight"]
127 | coauthor_score[i] += current
128 | if current > best:
129 | best = current
130 |
131 | best_coauthor_score[i] = best
132 |
133 | # normalize features
134 | denom = len(current_authors_1) * len(current_authors_2)
135 | if denom > 0:
136 | normalized_authors_citation[i] = authors_citation[i] / denom
137 | normalized_coauthor_score[i] = coauthor_score[i] / denom
138 |
139 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
140 | for a1 in current_authors_1:
141 | for a2 in current_authors_2:
142 | G[a1][a2]["weight"] += 1
143 |
144 | training["authors_citation"] = authors_citation
145 | training["normalized_authors_citation"] = normalized_authors_citation
146 | training["coauthor_score"] = coauthor_score
147 | training["normalized_coauthor_score"] = normalized_coauthor_score
148 | training["best_coauthor_score"] = best_coauthor_score
149 | training["best_authors_citation"] = best_authors_citation
150 |
151 | id1 = testing["id1"].values
152 | id2 = testing["id2"].values
153 |
154 | coauthor_score = np.zeros(len(id1))
155 | normalized_coauthor_score = np.zeros(len(id1))
156 | best_coauthor_score = np.zeros(len(id1))
157 | authors_citation = np.zeros(len(id1))
158 | normalized_authors_citation = np.zeros(len(id1))
159 | best_authors_citation = np.zeros(len(id1))
160 |
161 | print("building features for testing")
162 | for i in tqdm(range(len(id1))):
163 | current_authors_1 = nodes.loc[id1[i]]["authors"]
164 | current_authors_2 = nodes.loc[id2[i]]["authors"]
165 |
166 | if current_authors_1 is np.nan:
167 | current_authors_1 = []
168 |
169 | if current_authors_2 is np.nan:
170 | current_authors_2 = []
171 |
172 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
173 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
174 |
175 | best = 0
176 | for a1 in current_authors_1:
177 | for a2 in current_authors_2:
178 | if G.has_edge(a1, a2):
179 | current = G[a1][a2]["weight"]
180 | authors_citation[i] += current
181 | if current > best:
182 | best = current
183 |
184 | best_authors_citation[i] = best
185 |
186 | best = 0
187 | for a1 in current_authors_1:
188 | for a2 in current_authors_2:
189 | if coauthors.has_edge(a1, a2):
190 | current = coauthors[a1][a2]["weight"]
191 | coauthor_score[i] += current
192 | if current > best:
193 | best = current
194 |
195 | best_coauthor_score[i] = best
196 |
197 | # normalize features
198 | denom = len(current_authors_1) * len(current_authors_2)
199 | if denom > 0:
200 | normalized_authors_citation[i] = authors_citation[i] / denom
201 | normalized_coauthor_score[i] = coauthor_score[i] / denom
202 |
203 | testing["authors_citation"] = authors_citation
204 | testing["normalized_authors_citation"] = normalized_authors_citation
205 | testing["coauthor_score"] = coauthor_score
206 | testing["normalized_coauthor_score"] = normalized_coauthor_score
207 | testing["best_coauthor_score"] = best_coauthor_score
208 | testing["best_authors_citation"] = best_authors_citation
209 |
210 | print("done, saving data")
211 | # save data-frame
212 | training.to_csv(path_to_data + "training_features.txt")
213 | testing.to_csv(path_to_data + "testing_features.txt")
214 |
--------------------------------------------------------------------------------
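
For positive training pairs, the loops above temporarily subtract the pair's own contribution from the edge weight before reading the score, so the feature does not leak the label. A toy networkx sketch of that pattern (names and weights are made up):

# illustrative sketch only -- not part of the repository files
import networkx as nx

G = nx.DiGraph()
G.add_edge("alice", "bob", weight=3)   # alice's papers cite bob's papers 3 times in the training graph

# scoring a pair that is itself a positive training example:
G["alice"]["bob"]["weight"] -= 1       # drop this pair's own citation
score = G["alice"]["bob"]["weight"]    # 2 = citations coming from other examples only
G["alice"]["bob"]["weight"] += 1       # restore the graph for the next pair
print(score)
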
/feature_engineering/authors_2.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import networkx as nx
4 | import numpy as np
5 | import pandas as pd
6 | from tqdm import tqdm
7 |
8 | from feature_engineering.tools import lit_eval_nan_proof
9 |
10 | # this script computes the features authors_in_neighbors and authors_common_neighbors by considering
11 | # the author's graph of citations.
12 | # the script takes approximately 5 minutes to run
13 |
14 | # progress bar for pandas
15 | tqdm.pandas(tqdm())
16 |
17 | # path
18 | path_to_data = "data/"
19 |
20 | # loading data
21 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
22 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
23 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
24 | nodes.set_index("id", inplace=True)
25 | training = pd.read_csv(path_to_data + "training_features.txt")
26 | training.set_index("my_index", inplace=True)
27 | testing = pd.read_csv(path_to_data + "testing_features.txt")
28 | testing.set_index("my_index", inplace=True)
29 |
30 | # loading data
31 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
32 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
33 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv",
34 | converters=converter_dict)
35 | nodes.set_index("id", inplace=True)
36 |
37 | G = nx.DiGraph()
38 | coauthors = nx.Graph()
39 |
40 | print("building coauthor graph")
41 | nodes_id = nodes.index.values
42 | for i in tqdm(range(len(nodes_id))):
43 |
44 | authors = nodes.loc[nodes_id[i]]["authors"]
45 | if authors is np.nan:
46 | authors = []
47 |
48 | authors = np.unique([a for a in authors if a != ""])
49 |
50 | for a in authors:
51 | G.add_node(a)
52 | coauthors.add_node(a)
53 |
54 | for a1 in authors:
55 | for a2 in authors:
56 | if a1 != a2:
57 | if coauthors.has_edge(a1, a2):
58 | coauthors[a1][a2]["weight"] += 1
59 | else:
60 | coauthors.add_edge(a1, a2, weight=1)
61 |
62 | id1 = training["id1"].values
63 | id2 = training["id2"].values
64 |
65 | print("building citation graph")
66 | for i in tqdm(range(len(id1))):
67 |
68 | current_authors_1 = nodes.loc[id1[i]]["authors"]
69 | current_authors_2 = nodes.loc[id2[i]]["authors"]
70 |
71 | if current_authors_1 is np.nan:
72 | current_authors_1 = []
73 |
74 | if current_authors_2 is np.nan:
75 | current_authors_2 = []
76 |
77 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
78 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
79 |
80 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
81 | for a1 in current_authors_1:
82 | for a2 in current_authors_2:
83 | if G.has_edge(a1, a2):
84 | G[a1][a2]["weight"] += 1
85 | else:
86 | G.add_edge(a1, a2, weight=1)
87 |
88 | authors_in_neighbors = np.zeros(len(id1))
89 | normalized_authors_in_neighbors = np.zeros(len(id1))
90 | best_authors_in_neighbors = np.zeros(len(id1))
91 | authors_common_neighbors = np.zeros(len(id1))
92 |
93 | print("building features for training")
94 | for i in tqdm(range(len(id1))):
95 | current_authors_1 = nodes.loc[id1[i]]["authors"]
96 | current_authors_2 = nodes.loc[id2[i]]["authors"]
97 |
98 | if current_authors_1 is np.nan:
99 | current_authors_1 = []
100 |
101 | if current_authors_2 is np.nan:
102 | current_authors_2 = []
103 |
104 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
105 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
106 |
107 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
108 | for a1 in current_authors_1:
109 | for a2 in current_authors_2:
110 | G[a1][a2]["weight"] -= 1
111 |
112 |     # this feature (authors_common_neighbors) is commented out because it takes too long to compute
113 | # for a1 in current_authors_1:
114 | # for p in G.successors(a1):
115 | # for a2 in G.successors(p):
116 | # if a2 in current_authors_2:
117 | # authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])
118 |
119 | best = 0
120 | for a1 in current_authors_2:
121 | current = len([g for g in G.predecessors(a1)])
122 | authors_in_neighbors[i] += current
123 | if current > best:
124 | best = current
125 |
126 | best_authors_in_neighbors[i] = best
127 |
128 | # normalize feature
129 | denom = len(current_authors_2)
130 | if denom > 0:
131 | normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom
132 |
133 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
134 | for a1 in current_authors_1:
135 | for a2 in current_authors_2:
136 | G[a1][a2]["weight"] += 1
137 |
138 | training["authors_in_neighbors"] = authors_in_neighbors
139 | training["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
140 | training["best_authors_in_neighbors"] = best_authors_in_neighbors
141 | training["authors_common_neighbors"] = authors_common_neighbors
142 |
143 | id1 = testing["id1"].values
144 | id2 = testing["id2"].values
145 |
146 | authors_in_neighbors = np.zeros(len(id1))
147 | normalized_authors_in_neighbors = np.zeros(len(id1))
148 | best_authors_in_neighbors = np.zeros(len(id1))
149 | authors_common_neighbors = np.zeros(len(id1))
150 |
151 | print("building features for testing")
152 | for i in tqdm(range(len(id1))):
153 | current_authors_1 = nodes.loc[id1[i]]["authors"]
154 | current_authors_2 = nodes.loc[id2[i]]["authors"]
155 |
156 | if current_authors_1 is np.nan:
157 | current_authors_1 = []
158 |
159 | if current_authors_2 is np.nan:
160 | current_authors_2 = []
161 |
162 | current_authors_1 = np.unique([a for a in current_authors_1 if a != ""])
163 | current_authors_2 = np.unique([a for a in current_authors_2 if a != ""])
164 |
165 | # for a1 in current_authors_1:
166 | # for p in G.successors(a1):
167 | # for a2 in G.successors(p):
168 | # if a2 in current_authors_2:
169 | # authors_common_neighbors[i] += min(G[a1][p]["weight"], G[p][a2]["weight"])
170 |
171 | best = 0
172 | for a1 in current_authors_2:
173 | current = len([g for g in G.predecessors(a1)])
174 | authors_in_neighbors[i] += current
175 | if current > best:
176 | best = current
177 |
178 | best_authors_in_neighbors[i] = best
179 |
180 | # normalize feature
181 | denom = len(current_authors_2)
182 | if denom > 0:
183 | normalized_authors_in_neighbors[i] = authors_in_neighbors[i] / denom
184 |
185 | testing["authors_in_neighbors"] = authors_in_neighbors
186 | testing["normalized_authors_in_neighbors"] = normalized_authors_in_neighbors
187 | testing["best_authors_in_neighbors"] = best_authors_in_neighbors
188 | testing["authors_common_neighbors"] = authors_common_neighbors
189 |
190 | print("done, saving data")
191 | # save data-frame
192 | training.to_csv(path_to_data + "training_features.txt")
193 | testing.to_csv(path_to_data + "testing_features.txt")
194 |
--------------------------------------------------------------------------------
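
The authors_in_neighbors feature above counts the predecessors of each target author in the citation graph. A quick networkx sketch on a toy graph, showing that this matches the in-degree in a simple DiGraph:

# illustrative sketch only -- not part of the repository files
import networkx as nx

G = nx.DiGraph()
G.add_edges_from([("a", "c"), ("b", "c"), ("c", "a")])

# number of distinct citing authors of "c" == in-degree of "c" in a simple DiGraph
print(len(list(G.predecessors("c"))), G.in_degree("c"))   # 2 2
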
/feature_engineering/baseline_feature_engineering.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 |
4 | from feature_engineering.tools import lit_eval_nan_proof
5 |
6 | # this script produces the following features: overlap_title, date_diff and common_author
7 | # this is the script that you should run right after the pre-processing
8 |
9 | # progress bar for pandas
10 | tqdm.pandas(tqdm())
11 |
12 | # path
13 | path_to_data = "data/"
14 |
15 | # loading preprocessed data
16 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
17 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
18 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
19 | nodes.set_index("id", inplace=True)
20 | training = pd.read_csv(path_to_data + "training_new_index.txt")
21 | training.set_index("my_index", inplace=True)
22 | testing = pd.read_csv(path_to_data + "testing_new_index.txt")
23 | testing.set_index("my_index", inplace=True)
24 |
25 | # adding baseline features in training dataframe
26 |
27 | # features placeholders
28 | overlap_title = []
29 | date_diff = []
30 | common_author = []
31 |
32 | # IDs for training set
33 | id1 = training['id1'].values
34 | id2 = training['id2'].values
35 |
36 | # computing features for training set
37 | for i in tqdm(range(len(id1))):
38 | title1 = nodes.at[id1[i], 'title']
39 | title2 = nodes.at[id2[i], 'title']
40 | date1 = nodes.at[id1[i], 'year']
41 | date2 = nodes.at[id2[i], 'year']
42 | author1 = nodes.at[id1[i], 'authors']
43 | author2 = nodes.at[id2[i], 'authors']
44 | overlap_title.append(len(set(title1).intersection(set(title2))))
45 | date_diff.append(int(date1) - int(date2))
46 | if isinstance(author1, float) or isinstance(author2, float):
47 | common_author.append(0)
48 | else:
49 | common_author.append(len(set(author1).intersection(set(author2))))
50 |
51 | # adding feature to data-frame
52 | training["overlap_title"] = overlap_title
53 | training["date_diff"] = date_diff
54 | training["common_author"] = common_author
55 |
56 | # repeat process for test set
57 | overlap_title_test = []
58 | date_diff_test = []
59 | common_author_test = []
60 | id1 = testing['id1'].values
61 | id2 = testing['id2'].values
62 | for i in tqdm(range(len(id1))):
63 | title1 = nodes.at[id1[i], 'title']
64 | title2 = nodes.at[id2[i], 'title']
65 | date1 = nodes.at[id1[i], 'year']
66 | date2 = nodes.at[id2[i], 'year']
67 | author1 = nodes.at[id1[i], 'authors']
68 | author2 = nodes.at[id2[i], 'authors']
69 | overlap_title_test.append(len(set(title1).intersection(set(title2))))
70 | date_diff_test.append(int(date1) - int(date2))
71 | if isinstance(author1, float) or isinstance(author2, float):
72 | common_author_test.append(0)
73 | else:
74 | common_author_test.append(len(set(author1).intersection(set(author2))))
75 | testing["overlap_title"] = overlap_title_test
76 | testing["date_diff"] = date_diff_test
77 | testing["common_author"] = common_author_test
78 |
79 | # save data sets
80 | training.to_csv(path_to_data + "training_features.txt")
81 | testing.to_csv(path_to_data + "testing_features.txt")
82 |
--------------------------------------------------------------------------------
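
A toy illustration of the three baseline features computed above; the word lists, authors and years are made up:

# illustrative sketch only -- not part of the repository files
title1, title2 = ["quantum", "gravity", "review"], ["loop", "quantum", "gravity"]
authors1, authors2 = ["a smith", "b jones"], ["b jones", "c lee"]
year1, year2 = 2003, 2001

overlap_title = len(set(title1).intersection(set(title2)))      # 2 shared title words
date_diff = int(year1) - int(year2)                             # 2 years apart
common_author = len(set(authors1).intersection(set(authors2)))  # 1 shared author
print(overlap_title, date_diff, common_author)
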
/feature_engineering/basic_features.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 |
4 | from feature_engineering.tools import compare_journals, lit_eval_nan_proof
5 |
6 | # this script adds the features journal_similarity and overlapping_words_abstract to the csv features files
7 |
8 | # progress bar for pandas
9 | tqdm.pandas(tqdm())
10 |
11 | # path
12 | path_to_data = "data/"
13 |
14 | # loading data
15 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
16 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
17 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv",
18 | converters=converter_dict)
19 | nodes.set_index("id", inplace=True)
20 | training = pd.read_csv(path_to_data + "training_features.txt")
21 | training.set_index("my_index", inplace=True)
22 | testing = pd.read_csv(path_to_data + "testing_features.txt")
23 | testing.set_index("my_index", inplace=True)
24 |
25 | # placeholder for second batch of features
26 | journal_similarity = []
27 | overlapping_words_abstract = []
28 |
29 | # IDs for training set
30 | id1 = training['id1'].values
31 | id2 = training['id2'].values
32 |
33 | # computing features for training set
34 | for i in tqdm(range(len(id1))):
35 | journal1 = nodes.at[id1[i], 'journal']
36 | journal2 = nodes.at[id2[i], 'journal']
37 | abstract1 = nodes.at[id1[i], "abstract"]
38 | abstract2 = nodes.at[id2[i], "abstract"]
39 | if isinstance(journal1, float) or isinstance(journal2, float):
40 | journal_similarity.append(0)
41 | else:
42 | journal_similarity.append(compare_journals(journal1, journal2))
43 | overlapping_words_abstract.append(len(set(abstract1).intersection(set(abstract2))))
44 |
45 | # adding feature to dataframe
46 | training["journal_similarity"] = journal_similarity
47 | training["overlapping_words_abstract"] = overlapping_words_abstract
48 |
49 | # repeat process for test set
50 | journal_similarity_test = []
51 | overlapping_words_abstract_test = []
52 | id1 = testing['id1'].values
53 | id2 = testing['id2'].values
54 | for i in tqdm(range(len(id1))):
55 | journal1 = nodes.at[id1[i], 'journal']
56 | journal2 = nodes.at[id2[i], 'journal']
57 | abstract1 = nodes.at[id1[i], "abstract"]
58 | abstract2 = nodes.at[id2[i], "abstract"]
59 | if isinstance(journal1, float) or isinstance(journal2, float):
60 | journal_similarity_test.append(0)
61 | else:
62 | journal_similarity_test.append(compare_journals(journal1, journal2))
63 | overlapping_words_abstract_test.append(len(set(abstract1).intersection(set(abstract2))))
64 | testing["journal_similarity"] = journal_similarity_test
65 | testing["overlapping_words_abstract"] = overlapping_words_abstract_test
66 |
67 | # save data sets
68 | training.to_csv(path_to_data + "training_features.txt")
69 | testing.to_csv(path_to_data + "testing_features.txt")
70 |
--------------------------------------------------------------------------------
/feature_engineering/citation_graph_features.py:
--------------------------------------------------------------------------------
1 | import igraph
2 | import numpy as np
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 | from feature_engineering.tools import lit_eval_nan_proof
7 |
8 | # this script adds the feature shortest_path to the files training_features and testing_features
9 | # this script takes approximately 1000 minutes to execute
10 |
11 | # progress bar for pandas
12 | tqdm.pandas(tqdm())
13 |
14 | # path
15 | path_to_data = "data/"
16 |
17 | # loading data
18 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
19 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
20 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
21 | nodes.set_index("id", inplace=True)
22 | training = pd.read_csv(path_to_data + "training_features.txt")
23 | training.set_index("my_index", inplace=True)
24 | testing = pd.read_csv(path_to_data + "testing_features.txt")
25 | testing.set_index("my_index", inplace=True)
26 |
27 | # placeholders for graph features
28 | shortest_path = []
29 |
30 | # IDs for training set
31 | id1 = training['id1'].values
32 | id2 = training['id2'].values
33 | target = training["target"].values
34 |
35 | # creating graph of citations
36 |
37 | # create empty directed graph
38 | g = igraph.Graph(directed=True)
39 |
40 | # some nodes may not be connected to any other node
41 | # hence the need to create the nodes of the graph from node_info.csv,
42 | # not just from the edge list
43 | nodes = nodes.index.values
44 | str_vec = np.vectorize(str)
45 | nodes = str_vec(nodes)
46 |
47 | # add vertices
48 | g.add_vertices(nodes)
49 |
50 | # create and add edges
51 | edges = [(str(id1[i]), str(id2[i])) for i in range(len(id1)) if target[i] == 1]
52 | g.add_edges(edges)
53 |
54 | for i in tqdm(range(len(id1))):
55 | if target[i] == 1:
56 | g.delete_edges([(str(id1[i]), str(id2[i]))])
57 | shortest_path.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
58 | if target[i] == 1:
59 | g.add_edge(str(id1[i]), str(id2[i]))
60 | # adding feature to dataframe
61 | training["shortest_path"] = shortest_path
62 |
63 | # repeat process for test set
64 | shortest_path_test = []
65 | id1 = testing['id1'].values
66 | id2 = testing['id2'].values
67 | for i in tqdm(range(len(id1))):
68 | shortest_path_test.append(g.shortest_paths_dijkstra(source=str(id1[i]), target=str(id2[i]), mode="OUT")[0][0])
69 |     # note: unlike the training loop, no edge removal/restoration is needed here,
70 |     # since test pairs were never added to the citation graph as edges
71 | testing["shortest_path"] = shortest_path_test
72 |
73 | # save data sets
74 | training.to_csv(path_to_data + "training_features.txt")
75 | testing.to_csv(path_to_data + "testing_features.txt")
76 |
--------------------------------------------------------------------------------
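
For positive training pairs, the loop above deletes the direct citation edge before measuring the shortest path and restores it afterwards, so the feature does not trivially collapse to 1. A toy igraph sketch of that trick (the graph is made up):

# illustrative sketch only -- not part of the repository files
import igraph

g = igraph.Graph(directed=True)
g.add_vertices(["1", "2", "3"])
g.add_edges([("1", "2"), ("1", "3"), ("3", "2")])

g.delete_edges([("1", "2")])   # temporarily remove the direct edge of the positive pair
print(g.shortest_paths_dijkstra(source="1", target="2", mode="OUT")[0][0])  # 2 (via "3")
g.add_edge("1", "2")           # restore the graph
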
/feature_engineering/cosine_distance.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from gensim import corpora, models
6 | from tqdm import tqdm
7 |
8 | from feature_engineering.tools import lit_eval_nan_proof
9 |
10 | # this script adds the features score_1_2, score_2_1 and cosine_distance to the features csv files.
11 | # this script takes approximately 10 minutes to run
12 |
13 | # progress bar for pandas
14 | tqdm.pandas(tqdm())
15 |
16 | # path
17 | path_to_data = "data/"
18 |
19 | # loading data
20 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
21 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
22 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
23 | nodes.set_index("id", inplace=True)
24 | training = pd.read_csv(path_to_data + "training_features.txt")
25 | training.set_index("my_index", inplace=True)
26 | testing = pd.read_csv(path_to_data + "testing_features.txt")
27 | testing.set_index("my_index", inplace=True)
28 |
29 | # create dictionary for tfidf
30 | abstracts = nodes['abstract'].values
31 | average_len = np.mean(np.array([len(a) for a in abstracts]))
32 | dictionary = corpora.Dictionary(abstracts)
33 |
34 |
35 | def my_tf(p):
36 | return math.log(1.0 + p)
37 |
38 |
39 | # instantiate tf-idf model
40 | tfidf = models.TfidfModel(dictionary=dictionary, wlocal=my_tf)
41 |
42 |
43 | # handy functions to compute cosine distance
44 | def get_tf_idf_encoding(index):
45 | abstract = nodes.at[index, "abstract"]
46 | abstract = dictionary.doc2bow(abstract)
47 | ans = tfidf[[abstract]]
48 | return ans[0]
49 |
50 |
51 | def my_norm(tfidf_abstract):
52 | ans = 0.0
53 | for (k, v) in tfidf_abstract:
54 | ans += v ** 2
55 | return np.sqrt(ans)
56 |
57 |
58 | def cosine_distance(id1, id2):
59 | tfidf_abstract1 = get_tf_idf_encoding(id1)
60 | tfidf_abstract2 = get_tf_idf_encoding(id2)
61 | f1 = dict(tfidf_abstract1)
62 | f2 = dict(tfidf_abstract2)
63 | ans = 0.0
64 | for k, v in f1.items():
65 | if k in f2.keys():
66 | ans += v * f2[k]
67 | return ans
68 |
69 |
70 | def get_score(id1, id2, avglen, k1=1.2, b=0.75):
71 | abstract_1 = nodes.at[id1, "abstract"]
72 | len_1 = len(abstract_1)
73 | abstract_1 = dictionary.doc2bow(abstract_1)
74 | tf_1 = dict([
75 | (termid, tfidf.wlocal(tf))
76 | for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
77 | ])
78 | idf_1 = dict([
79 | (termid, tfidf.idfs.get(termid))
80 | for termid, tf in abstract_1 if tfidf.idfs.get(termid, 0.0) != 0.0
81 | ])
82 |
83 | abstract_2 = nodes.at[id2, "abstract"]
84 | abstract_2 = dictionary.doc2bow(abstract_2)
85 | tf_2 = dict([
86 | (termid, tfidf.wlocal(tf))
87 | for termid, tf in abstract_2 if tfidf.idfs.get(termid, 0.0) != 0.0
88 | ])
89 |
90 | ans = 0.0
91 | for k, v in tf_1.items():
92 | if k in tf_2.keys():
93 | ans += idf_1[k] * (v * (k1 + 1)) / (v + k1 * (1 - b + b * len_1 / avglen))
94 | return ans
95 |
96 |
97 | # placeholder for feature
98 | score_1_2 = []
99 | score_2_1 = []
100 | cosine_dist = []
101 |
102 | # IDs for training set
103 | id1 = training['id1'].values
104 | id2 = training['id2'].values
105 |
106 | # computing features for training set
107 | for i in tqdm(range(len(id1))):
108 | score_1_2.append(get_score(id1[i], id2[i], average_len))
109 | score_2_1.append(get_score(id2[i], id1[i], average_len))
110 | cosine_dist.append(cosine_distance(id1[i], id2[i]))
111 |
112 | # add feature to data-frame
113 | training["score_1_2"] = score_1_2
114 | training["score_2_1"] = score_2_1
115 | training["cosine_distance"] = cosine_dist
116 |
117 | score_1_2 = []
118 | score_2_1 = []
119 | cosine_dist = []
120 |
121 | # IDs for test set
122 | id1 = testing['id1'].values
123 | id2 = testing['id2'].values
124 |
125 | # computing features for test set
126 | for i in tqdm(range(len(id1))):
127 | score_1_2.append(get_score(id1[i], id2[i], average_len))
128 | score_2_1.append(get_score(id2[i], id1[i], average_len))
129 | cosine_dist.append(cosine_distance(id1[i], id2[i]))
130 |
131 | # add feature to data-frame
132 | testing["score_1_2"] = score_1_2
133 | testing["score_2_1"] = score_2_1
134 | testing["cosine_distance"] = cosine_dist
135 |
136 | # save data-frame
137 | training.to_csv(path_to_data + "training_features.txt")
138 | testing.to_csv(path_to_data + "testing_features.txt")
139 |
--------------------------------------------------------------------------------
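
For reference, get_score above evaluates an Okapi BM25-style score between the two abstracts. Written out as a formula (a sketch of the standard form, matching the code with the my_tf weighting and the default parameters k_1 = 1.2, b = 0.75):

\mathrm{score}(d_1, d_2) = \sum_{t \,\in\, d_1 \cap d_2} \mathrm{idf}(t)\, \frac{\mathrm{tf}'(t, d_1)\,(k_1 + 1)}{\mathrm{tf}'(t, d_1) + k_1\left(1 - b + b\,\lvert d_1 \rvert / \mathrm{avgdl}\right)}

where tf'(t, d_1) = log(1 + tf(t, d_1)), |d_1| is the length of the first abstract and avgdl is the average abstract length over the corpus.
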
/feature_engineering/networkx_bigraph.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import numpy as np
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 | from feature_engineering.tools import lit_eval_nan_proof
7 |
8 | # this script computes some features by considering the bidirectional graph of citations: jaccard, adar,
9 | # preferential_attachment, resource_allocation_index and common_neighbors
10 | # approx 10 minutes to run it
11 |
12 | # progress bar for pandas
13 | tqdm.pandas(tqdm())
14 |
15 | # path
16 | path_to_data = "data/"
17 |
18 | # loading data
19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
22 | nodes.set_index("id", inplace=True)
23 | training = pd.read_csv(path_to_data + "training_features.txt")
24 | training.set_index("my_index", inplace=True)
25 | testing = pd.read_csv(path_to_data + "testing_features.txt")
26 | testing.set_index("my_index", inplace=True)
27 |
28 | G = nx.Graph()
29 | G.add_nodes_from(nodes.index.values)
30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))
31 |
32 | # IDs for training set
33 | id1 = training['id1'].values
34 | id2 = training['id2'].values
35 |
36 | # placeholder for feature
37 | n = len(id1)
38 | jaccard = np.zeros(n)
39 | adar = np.zeros(n)
40 | preferential_attachment = np.zeros(n)
41 | resource_allocation_index = np.zeros(n)
42 | common_neighbors = np.zeros(n)
43 |
44 | # computing features for training set
45 | for i in tqdm(range(len(id1))):
46 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
47 | G.remove_edge(id1[i], id2[i])
48 |
49 | pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
50 | pred = [(u, v, p) for (u, v, p) in pred]
51 | jaccard[i] = pred[0][2]
52 |
53 | pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
54 | pred = [(u, v, p) for (u, v, p) in pred]
55 | adar[i] = pred[0][2]
56 |
57 | pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
58 | pred = [(u, v, p) for (u, v, p) in pred]
59 | preferential_attachment[i] = pred[0][2]
60 |
61 | pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
62 | pred = [(u, v, p) for (u, v, p) in pred]
63 | resource_allocation_index[i] = pred[0][2]
64 |
65 | pred = nx.common_neighbors(G, id1[i], id2[i])
66 | pred = len([u for u in pred])
67 | common_neighbors[i] = pred
68 |
69 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
70 | G.add_edge(id1[i], id2[i])
71 |
72 | # add feature to data-frame
73 | training["jaccard"] = jaccard
74 | training["adar"] = adar
75 | training["preferential_attachment"] = preferential_attachment
76 | training["resource_allocation_index"] = resource_allocation_index
77 | training["common_neighbors"] = resource_allocation_index
78 |
79 | # IDs for test set
80 | id1 = testing['id1'].values
81 | id2 = testing['id2'].values
82 |
83 | # placeholder for feature
84 | n = len(id1)
85 | jaccard = np.zeros(n)
86 | adar = np.zeros(n)
87 | preferential_attachment = np.zeros(n)
88 | resource_allocation_index = np.zeros(n)
89 | common_neighbors = np.zeros(n)
90 |
91 | # computing features for testing set
92 | for i in tqdm(range(len(id1))):
93 | pred = nx.jaccard_coefficient(G, [(id1[i], id2[i])])
94 | pred = [(u, v, p) for (u, v, p) in pred]
95 | jaccard[i] = pred[0][2]
96 |
97 | pred = nx.adamic_adar_index(G, [(id1[i], id2[i])])
98 | pred = [(u, v, p) for (u, v, p) in pred]
99 | adar[i] = pred[0][2]
100 |
101 | pred = nx.preferential_attachment(G, [(id1[i], id2[i])])
102 | pred = [(u, v, p) for (u, v, p) in pred]
103 | preferential_attachment[i] = pred[0][2]
104 |
105 | pred = nx.resource_allocation_index(G, [(id1[i], id2[i])])
106 | pred = [(u, v, p) for (u, v, p) in pred]
107 | resource_allocation_index[i] = pred[0][2]
108 |
109 | pred = nx.common_neighbors(G, id1[i], id2[i])
110 | pred = len([u for u in pred])
111 | common_neighbors[i] = pred
112 |
113 | # add feature to data-frame
114 | testing["jaccard"] = jaccard
115 | testing["adar"] = adar
116 | testing["preferential_attachment"] = preferential_attachment
117 | testing["resource_allocation_index"] = resource_allocation_index
118 | testing["common_neighbors"] = common_neighbors
119 |
120 | # save data-frame
121 | training.to_csv(path_to_data + "training_features.txt")
122 | testing.to_csv(path_to_data + "testing_features.txt")
123 |
--------------------------------------------------------------------------------
/feature_engineering/networkx_bigraph_long.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 |
3 | import networkx as nx
4 | import numpy as np
5 | import pandas as pd
6 | from tqdm import tqdm
7 |
8 | from feature_engineering.tools import lit_eval_nan_proof
9 |
10 | # this script computes some features by considering the bidirectional graph of citations: katz
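11 | # note: the exact Katz index sums beta ** length(p) over all paths p between the two nodes; here it is
12 | # roughly approximated by summing beta ** len(p) over at most `breaking_point` of the shortest paths
13 | # given by nx.all_shortest_paths, with -1 used as a sentinel when no path exists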
11 |
12 | # progress bar for pandas
13 | tqdm.pandas(tqdm())
14 |
15 | # path
16 | path_to_data = "data/"
17 |
18 | # loading data
19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
22 | nodes.set_index("id", inplace=True)
23 | training = pd.read_csv(path_to_data + "training_features.txt")
24 | training.set_index("my_index", inplace=True)
25 | testing = pd.read_csv(path_to_data + "testing_features.txt")
26 | testing.set_index("my_index", inplace=True)
27 |
28 | # IDs for training set
29 | id1 = training['id1'].values
30 | id2 = training['id2'].values
31 |
32 | # placeholder for feature
33 | n = len(id1)
34 | print("start computing for training: ")
35 | print("size of data to process: " + str(n))
36 | katz = np.zeros(n)
37 | katz_2 = np.zeros(n)
38 | breaking_point = 10
39 | beta = 0.98
40 | beta_2 = 0.9
41 |
42 |
43 | def work(i0=None, n=None, is_training=True):
44 | print(i0)
45 | G = nx.Graph()
46 | G.add_nodes_from(nodes.index.values)
47 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))
48 |
49 | ans = np.zeros(n)
50 | ans_2 = np.zeros(n)
51 |
52 | for i in range(n):
53 | if is_training:
54 | if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
55 | G.remove_edge(id1[i0 + i], id2[i0 + i])
56 |
57 | katz_acc = 0.0
58 | katz_2_acc = 0.0
59 | counter = 0
60 |         try:
61 |             iterator = nx.all_shortest_paths(G, source=id1[i0 + i], target=id2[i0 + i])
62 |             for p in iterator:
63 |                 len_p = len(p)
64 |                 katz_acc += (beta ** len_p)
65 |                 katz_2_acc += (beta_2 ** len_p)
66 |                 counter += 1
67 |                 if counter >= breaking_point:
68 |                     break
69 |             ans[i] = katz_acc
70 |             ans_2[i] = katz_2_acc
71 |         except nx.NetworkXNoPath:
72 |             # no path between the two nodes: -1 is used as a sentinel value
73 |             ans[i] = -1
74 |             ans_2[i] = -1
75 |
76 |         if is_training:
77 |             if training.at[str(id1[i0 + i]) + "|" + str(id2[i0 + i]), "target"] == 1:
78 |                 G.add_edge(id1[i0 + i], id2[i0 + i])
79 |
80 | print(i0)
81 |
82 | return ans, ans_2, i0
83 |
84 |
85 | def callback(r):
86 | ans, ans_2, i0 = r
87 |
88 |
89 | # computing features for training set
90 |
91 | pool = Pool()
92 | print("starting pool...")
93 | import time
94 |
95 | start = time.time()
96 | n_tasks = 512
97 | tasks = []
98 | step = int(n / n_tasks)
99 | print(step)
100 | for i0 in range(n_tasks):
101 | kwds = {
102 | "i0": i0 * step,
103 | "n": step,
104 | "is_training": True
105 | }
106 | tasks.append(pool.apply_async(work, kwds=kwds, callback=callback))
107 | pool.close()
108 | pool.join()
109 | for i in range(n_tasks):
110 | katz[i * step: (i + 1) * step], \
111 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get()
112 |
113 | end = time.time()
114 | print(end - start)
115 | # add feature to data-frame
116 | training["katz"] = katz
117 | training["katz_2"] = katz_2
118 |
119 | # IDs for testing set
120 | print("start computing for testing: ")
121 | id1 = testing['id1'].values
122 | id2 = testing['id2'].values
123 |
124 | # placeholder for feature
125 | n = len(id1)
126 | print("size of data to process: " + str(n))
127 |
128 | katz = np.zeros(n)
129 | katz_2 = np.zeros(n)
130 |
131 | pool = Pool()
132 | print("starting pool...")
133 | n_tasks = 512
134 | tasks = []
135 | step = int(n / n_tasks)
136 | for i0 in range(n_tasks):
137 | kwds = {
138 | "i0": i0 * step,
139 | "n": step,
140 | "is_training": False
141 | }
142 | tasks.append(pool.apply_async(work, kwds=kwds, callback=callback))
143 | pool.close()
144 | pool.join()
145 | for i in range(n_tasks):
146 | katz[i * step: (i + 1) * step], \
147 | katz_2[i * step: (i + 1) * step], _ = tasks[i].get()
148 |
149 | # add feature to data-frame
150 | testing["katz"] = katz
151 | testing["katz_2"] = katz_2
152 |
153 | print("done, saving data")
154 | # save data-frame
155 | training.to_csv(path_to_data + "training_features.txt")
156 | testing.to_csv(path_to_data + "testing_features.txt")
157 |
--------------------------------------------------------------------------------
/feature_engineering/networkx_bigraph_long2.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import numpy as np
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 | from feature_engineering.tools import lit_eval_nan_proof
7 |
8 | # this script computes the katz and katz_2 features by considering the bidirectional graph of citations
9 | # (a single-process alternative to networkx_bigraph_long.py); each of the first `breaking_point` shortest
10 | # paths p contributes len(p) * beta ** len(p), and -1 flags pairs with no path
11 |
12 | # progress bar for pandas
13 | tqdm.pandas(tqdm())
14 |
15 | # path
16 | path_to_data = "data/"
17 |
18 | # loading data
19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
22 | nodes.set_index("id", inplace=True)
23 | training = pd.read_csv(path_to_data + "training_features.txt")
24 | training.set_index("my_index", inplace=True)
25 | testing = pd.read_csv(path_to_data + "testing_features.txt")
26 | testing.set_index("my_index", inplace=True)
27 |
28 | G = nx.Graph()
29 | G.add_nodes_from(nodes.index.values)
30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))
31 |
32 | # IDs for training set
33 | id1 = training['id1'].values
34 | id2 = training['id2'].values
35 |
36 | # placeholder for feature
37 | n = len(id1)
38 | katz = np.zeros(n)
39 | katz_2 = np.zeros(n)
40 | beta = 0.98
41 | beta_2 = 0.90
42 | breaking_point = 10
43 |
44 | # computing features for training set
45 | for i in tqdm(range(len(id1))):
46 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
47 | G.remove_edge(id1[i], id2[i])
48 |
49 | katz_acc = 0.0
50 | katz_2_acc = 0.0
51 | counter = 0
52 | try:
53 | iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i])
54 | for p in iterator:
55 | len_p = len(p)
56 | katz_acc += len_p * (beta ** len_p)
57 | katz_2_acc += len_p * (beta_2 ** len_p)
58 | counter += 1
59 | if counter >= breaking_point:
60 | break
61 |         katz[i] = katz_acc
62 |         katz_2[i] = katz_2_acc
63 | except:
64 | katz[i] = -1
65 | katz_2[i] = -1
66 |
67 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
68 | G.add_edge(id1[i], id2[i])
69 |
70 | # add feature to data-frame
71 | training["katz"] = katz
72 | training["katz_2"] = katz_2
73 |
74 | # IDs for testing set
75 | id1 = testing['id1'].values
76 | id2 = testing['id2'].values
77 |
78 | # placeholder for feature
79 | n = len(id1)
80 | katz = np.zeros(n)
81 | katz_2 = np.zeros(n)
82 |
83 | # computing features for testing set
84 | for i in tqdm(range(len(id1))):
85 | katz_acc = 0.0
86 | katz_2_acc = 0.0
87 | counter = 0
88 | try:
89 | iterator = nx.all_shortest_paths(G, source=id1[i], target=id2[i])
90 | for p in iterator:
91 | len_p = len(p)
92 | katz_acc += len_p * (beta ** len_p)
93 | katz_2_acc += len_p * (beta_2 ** len_p)
94 | counter += 1
95 | if counter >= breaking_point:
96 | break
97 |         katz[i] = katz_acc
98 |         katz_2[i] = katz_2_acc
99 | except:
100 | katz[i] = -1
101 | katz_2[i] = -1
102 |
103 | # add feature to data-frame
104 | testing["katz"] = katz
105 | testing["katz_2"] = katz_2
106 |
107 | # save data-frame
108 | training.to_csv(path_to_data + "training_features.txt")
109 | testing.to_csv(path_to_data + "testing_features.txt")
110 |
--------------------------------------------------------------------------------
/feature_engineering/networkx_digraph.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import numpy as np
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 | from feature_engineering.tools import lit_eval_nan_proof
7 |
8 | # this script computes the features out_neighbors, in_neighbors, popularity, common_successors,
9 | # common_predecessors and paths_of_length_one by considering the directed graph of citations.
10 | # the script takes approximately 5 minutes to run
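11 | # popularity(id1, id2) is the sum of the in-degrees of the predecessors of id2 (the papers citing id2);
12 | # common_predecessors / common_successors count the shared predecessors / successors of id1 and id2, and
13 | # paths_of_length_one counts intermediate nodes w with edges id1 -> w and w -> id2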
11 |
12 | # progress bar for pandas
13 | tqdm.pandas(tqdm())
14 |
15 | # path
16 | path_to_data = "data/"
17 |
18 | # loading data
19 | converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,
20 | 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}
21 | nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv", converters=converter_dict)
22 | nodes.set_index("id", inplace=True)
23 | training = pd.read_csv(path_to_data + "training_features.txt")
24 | training.set_index("my_index", inplace=True)
25 | testing = pd.read_csv(path_to_data + "testing_features.txt")
26 | testing.set_index("my_index", inplace=True)
27 |
28 | G = nx.DiGraph()
29 | G.add_nodes_from(nodes.index.values)
30 | G.add_edges_from(zip(training[training["target"] == 1]["id1"], training[training["target"] == 1]["id2"]))
31 |
32 | # IDs for training set
33 | id1 = training['id1'].values
34 | id2 = training['id2'].values
35 |
36 | # placeholder for feature
37 | n = len(id1)
38 | out_neighbors = np.zeros(n)
39 | in_neighbors = np.zeros(n)
40 | popularity = np.zeros(n)
41 | common_predecessors = np.zeros(n)
42 | common_successors = np.zeros(n)
43 | paths_of_length_one = np.zeros(n)
44 |
45 | # computing features for training set
46 | for i in tqdm(range(len(id1))):
47 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
48 | G.remove_edge(id1[i], id2[i])
49 |
50 | in_neighbors[i] = G.in_degree(id2[i])
51 | out_neighbors[i] = G.out_degree(id1[i])
52 |
53 | current_common_successors = 0
54 | current_common_predecessors = 0
55 | current_paths_of_length_one = 0
56 |
57 |     predecessors_2 = list(G.predecessors(id2[i]))  # materialise as lists so the membership tests below can be repeated
58 |     predecessors_1 = list(G.predecessors(id1[i]))
59 |
60 | pop = 0
61 | for p in predecessors_2:
62 | pop += G.in_degree(p)
63 | if p in predecessors_1:
64 | current_common_predecessors += 1
65 | popularity[i] = pop
66 |
67 |     successors_2 = list(G.successors(id2[i]))
68 |     successors_1 = list(G.successors(id1[i]))
69 |
70 | for p in successors_1:
71 | if p in successors_2:
72 | current_common_successors += 1
73 |
74 | for p in successors_1:
75 | if p in predecessors_2:
76 | current_paths_of_length_one += 1
77 |
78 | common_successors[i] = current_common_successors
79 | common_predecessors[i] = current_common_predecessors
80 | paths_of_length_one[i] = current_paths_of_length_one
81 |
82 | if training.at[str(id1[i]) + "|" + str(id2[i]), "target"] == 1:
83 | G.add_edge(id1[i], id2[i])
84 |
85 | # add feature to data-frame
86 | training["out_neighbors"] = out_neighbors
87 | training["in_neighbors"] = in_neighbors
88 | training["popularity"] = popularity
89 | training["common_successors"] = common_successors
90 | training["common_predecessors"] = common_predecessors
91 | training["paths_of_length_one"] = paths_of_length_one
92 |
93 | # IDs for testing set
94 | id1 = testing['id1'].values
95 | id2 = testing['id2'].values
96 |
97 | # placeholder for feature
98 | n = len(id1)
99 | out_neighbors, in_neighbors, popularity = np.zeros(n), np.zeros(n), np.zeros(n)
100 | common_successors, common_predecessors = np.zeros(n), np.zeros(n)
101 | paths_of_length_one = np.zeros(n)
102 |
103 | # computing features for testing set
104 | for i in tqdm(range(len(id1))):
105 |
106 | in_neighbors[i] = G.in_degree(id2[i])
107 | out_neighbors[i] = G.out_degree(id1[i])
108 |
109 | current_common_successors = 0
110 | current_common_predecessors = 0
111 | current_paths_of_length_one = 0
112 |
113 |     predecessors_2 = list(G.predecessors(id2[i]))  # materialise as lists so the membership tests below can be repeated
114 |     predecessors_1 = list(G.predecessors(id1[i]))
115 |
116 | pop = 0
117 | for p in predecessors_2:
118 | pop += G.in_degree(p)
119 | if p in predecessors_1:
120 | current_common_predecessors += 1
121 | popularity[i] = pop
122 |
123 |     successors_2 = list(G.successors(id2[i]))
124 |     successors_1 = list(G.successors(id1[i]))
125 |
126 | for p in successors_1:
127 | if p in successors_2:
128 | current_common_successors += 1
129 |
130 | for p in successors_1:
131 | if p in predecessors_2:
132 | current_paths_of_length_one += 1
133 |
134 | common_successors[i] = current_common_successors
135 | common_predecessors[i] = current_common_predecessors
136 | paths_of_length_one[i] = current_paths_of_length_one
137 |
138 |
139 |
140 | # add feature to data-frame
141 | testing["out_neighbors"] = out_neighbors
142 | testing["in_neighbors"] = in_neighbors
143 | testing["popularity"] = popularity
144 | testing["common_successors"] = common_successors
145 | testing["common_predecessors"] = common_predecessors
146 | testing["paths_of_length_one"] = paths_of_length_one
147 |
148 | # save data-frame
149 | training.to_csv(path_to_data + "training_features.txt")
150 | testing.to_csv(path_to_data + "testing_features.txt")
151 |
--------------------------------------------------------------------------------
/feature_engineering/preprocessing.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 | from feature_engineering.tools import \
6 | text_element_wise_preprocess, \
7 | authors_element_wise_preprocess, \
8 | journal_element_wise_preprocess
9 |
10 | # This script reads node_information.csv, training_set.txt and testing_set.txt, and creates the
11 | # files "nodes_preprocessed.csv", "training_new_index.txt" and "testing_new_index.txt".
12 |
13 |
18 | # progress bar for pandas
19 | tqdm.pandas(tqdm())
20 |
21 | # path
22 | path_to_data = "data/"
23 |
24 | # pre-processing tools
25 | nltk.download('punkt') # for tokenization
26 | nltk.download('stopwords')
27 | stpwds = set(nltk.corpus.stopwords.words("english"))
28 | stemmer = nltk.stem.PorterStemmer()
29 |
30 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"]
31 | nodes = pd.read_csv(path_to_data + "node_information.csv", names=nodes_header)
32 | nodes.set_index("id", inplace=True)
33 |
34 | # apply to DF
35 | nodes['title'] = nodes['title'].progress_apply(text_element_wise_preprocess)
36 | nodes['abstract'] = nodes['abstract'].progress_apply(text_element_wise_preprocess)
37 | nodes['authors'] = nodes['authors'].progress_apply(authors_element_wise_preprocess)
38 | nodes['journal'] = nodes['journal'].progress_apply(journal_element_wise_preprocess)
39 |
40 | # loading train
41 | names = ["id1", "id2", "target"]
42 | training = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
43 |
44 | # indexing consistent throughout the project: my_index = "<id1>|<id2>", e.g. "9510123|9502114"
45 | training["my_index"] = training["id1"].astype(str) + "|" + training["id2"].astype(str)
46 | training.set_index("my_index", inplace=True)
47 |
48 | # same process for testing set
49 | names = ["id1", "id2"]
50 | testing = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
51 | testing["my_index"] = testing["id1"].astype(str) + "|" + testing["id2"].astype(str)
52 | testing.set_index("my_index", inplace=True)
53 |
54 | # save preprocessed data sets
55 | nodes.to_csv(path_to_data + "nodes_preprocessed.csv")
56 | training.to_csv(path_to_data + "training_new_index.txt")
57 | testing.to_csv(path_to_data + "testing_new_index.txt")
58 |
--------------------------------------------------------------------------------
/feature_engineering/tools.py:
--------------------------------------------------------------------------------
1 | import ast
2 |
3 | import nltk
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | # journal similarity feature
9 | def compare_journals(journal1, journal2):
10 | if len(journal1) == 0 or len(journal2) == 0:
11 | return 0
12 | if journal1[0] == journal2[0]:
13 | return 1 + compare_journals(journal1[1:], journal2[1:])
14 | else:
15 | return 0
16 |
17 |
18 | # nan-proof string converter wrapper
19 | def lit_eval_nan_proof(string):
20 | if len(string) == 0:
21 | return np.nan
22 | else:
23 | return ast.literal_eval(string)
24 |
25 |
26 | # element-wise stemmed tokenization and stopwords removal for titles and abstracts
27 | def text_element_wise_preprocess(string):
28 | stpwds = set(nltk.corpus.stopwords.words("english"))
29 | stemmer = nltk.stem.PorterStemmer()
30 | tokens = string.lower().split(" ")
31 | tokens_wo_stpwds = [stemmer.stem(token) for token in tokens if token not in stpwds]
32 | return tokens_wo_stpwds
33 |
34 |
35 | # element-wise lower case tokenization for authors
36 | def authors_element_wise_preprocess(string):
37 | if pd.isna(string):
38 | return string
39 | tokens = string.lower().split(", ")
40 | for i in range(len(tokens)):
41 | tokens[i] = tokens[i].split('(', 1)[0].strip(' ')
42 | return tokens
43 |
44 |
45 | # element-wise lower case tokenization for journals
46 | def journal_element_wise_preprocess(string):
47 | if pd.isna(string):
48 | return string
49 | tokens = string.lower().rstrip(".").split(".")
50 | return tokens
51 |
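52 |
53 | # minimal sanity check for the helpers above (purely illustrative inputs, not taken from the dataset);
54 | # text_element_wise_preprocess is left out here because it assumes the nltk stopwords corpus has been downloaded
55 | if __name__ == "__main__":
56 |     # length of the common prefix of the two journal token lists
57 |     print(compare_journals(["phys", "rev", "d"], ["phys", "rev", "lett"]))  # -> 2
58 |     # authors are lower-cased, split on ", " and stripped of affiliations in parentheses
59 |     print(authors_element_wise_preprocess("A. Author (MIT), B. Coauthor (ENS)"))  # -> ['a. author', 'b. coauthor']
60 |     # journals are lower-cased and split on "."
61 |     print(journal_element_wise_preprocess("Phys.Rev.D."))  # -> ['phys', 'rev', 'd']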
--------------------------------------------------------------------------------
/link-prediction-report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/link-prediction-report.pdf
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # feature engineering: each import below runs the corresponding script end to end (the modules execute at import time)
2 |
3 | import time
4 |
5 | start = time.time()
6 | print("preprocessing:")
7 | import feature_engineering.preprocessing
8 |
9 | end = time.time()
10 | print("done in: " + str(end - start))
11 |
12 | start = time.time()
13 | print("baseline_feature_engineering:")
14 | import feature_engineering.baseline_feature_engineering
15 |
16 | end = time.time()
17 | print("done in: " + str(end - start))
18 |
19 | start = time.time()
20 | print("basic_features:")
21 | import feature_engineering.basic_features
22 |
23 | end = time.time()
24 | print("done in: " + str(end - start))
25 |
26 | start = time.time()
27 | print("cosine_distance:")
28 | import feature_engineering.cosine_distance
29 |
30 | end = time.time()
31 | print("done in: " + str(end - start))
32 |
33 | start = time.time()
34 | print("networkx_bigraph:")
35 | import feature_engineering.networkx_bigraph
36 |
37 | end = time.time()
38 | print("done in: " + str(end - start))
39 |
40 | start = time.time()
41 | print("networkx_digraph:")
42 | import feature_engineering.networkx_digraph
43 |
44 | end = time.time()
45 | print("done in: " + str(end - start))
46 |
47 | start = time.time()
48 | print("authors' features:")
49 | import feature_engineering.authors
50 |
51 | end = time.time()
52 | print("done in: " + str(end - start))
53 |
54 | start = time.time()
55 | print("authors' features (part 2):")
56 | import feature_engineering.authors_2
57 |
58 | end = time.time()
59 | print("done in: " + str(end - start))
60 |
61 | # models: train them and store the output probabilities for stacking purposes
62 |
63 | start = time.time()
64 | print("SVM:")
65 | import models.svm
66 |
67 | end = time.time()
68 | print("done in: " + str(end - start))
69 |
70 | start = time.time()
71 | print("Random Forest:")
72 | import models.random_forest
73 |
74 | end = time.time()
75 | print("done in: " + str(end - start))
76 |
77 | start = time.time()
78 | print("LightGBM:")
79 | import models.lgbm
80 |
81 | end = time.time()
82 | print("done in: " + str(end - start))
83 |
84 | start = time.time()
85 | print("shallow NN:")
86 | import models.nn
87 |
88 | end = time.time()
89 | print("done in: " + str(end - start))
90 |
91 | start = time.time()
92 | print("deep NN:")
93 | import models.nn_deep
94 |
95 | end = time.time()
96 | print("done in: " + str(end - start))
97 |
98 | # train the model stack and generate final submission "stack_sub_rf.csv"
99 |
100 | start = time.time()
101 | print("stack :")
102 | import stacking.stacking
103 |
104 | end = time.time()
105 | print("done in: " + str(end - start))
106 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/__init__.py
--------------------------------------------------------------------------------
/models/camboui.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "date: 2018-02-16 16:16:34.322166\n",
13 | "features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance', 'shortest_path', 'jaccard', 'adar', 'preferential_attachment', 'resource_allocation_index', 'out_neighbors', 'in_neighbors', 'popularity', 'common_neighbors']\n",
14 | "model: Random Forest\n",
15 | "parameters:\n",
16 | "{'n_estimators': 10}\n",
17 | "cross validation:\n",
18 | "train: 0.9966042778250185\n",
19 | "test: 0.9720086406139066\n",
20 | "train: 0.9967559756127237\n",
21 | "test: 0.9717386898461955\n",
22 | "train: 0.9965911639381028\n",
23 | "test: 0.9717295528568946\n",
24 | "train: 0.9965775073031881\n",
25 | "test: 0.9722326963394218\n",
26 | "train: 0.9965775816654026\n",
27 | "test: 0.9718838998969885\n",
28 | "kaggle score: \n",
29 | "overlap_title: 0.01665081496613972\n",
30 | "date_diff: 0.02190514883983991\n",
31 | "common_author: 0.005109300600450039\n",
32 | "journal_similarity: 0.002403034365747304\n",
33 | "shortest_path: 0.019781629572646377\n",
34 | "overlapping_words_abstract: 0.01535330054775155\n",
35 | "jaccard: 0.19108201273444772\n",
36 | "adar: 0.006316136251304461\n",
37 | "preferential_attachment: 0.052909861150268744\n",
38 | "resource_allocation_index: 0.43101242342404056\n",
39 | "out_neighbors: 0.015096505980321603\n",
40 | "in_neighbors: 0.015219239486567731\n",
41 | "popularity: 0.018873794971630692\n",
42 | "common_neighbors: 0.18828679710884363\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "import datetime\n",
48 | "from sklearn.model_selection import KFold\n",
49 | "from sklearn.ensemble import RandomForestClassifier\n",
50 | "from sklearn.metrics import accuracy_score\n",
51 | "import pandas as pd\n",
52 | "import numpy as np\n",
53 | "\n",
54 | "from tools import f1_score\n",
55 | "\n",
56 | "# path\n",
57 | "path_to_data = \"../../data/\"\n",
58 | "path_to_submissions = \"../../submissions/\"\n",
59 | "\n",
60 | "parameters = {\n",
61 | " \"n_estimators\": 10\n",
62 | "}\n",
63 | "# parameters\n",
64 | "\n",
65 | "# load data\n",
66 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n",
67 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n",
68 | "del training[\"my_index\"]\n",
69 | "del testing[\"my_index\"]\n",
70 | "\n",
71 | "# replace inf in shortest_path with -1\n",
72 | "training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1])\n",
73 | "testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1])\n",
74 | "\n",
75 | "my_features_string = [\n",
76 | " \"overlap_title\",\n",
77 | " \"date_diff\",\n",
78 | " \"common_author\",\n",
79 | " \"journal_similarity\",\n",
80 | " \"overlapping_words_abstract\",\n",
81 | " \"cosine_distance\",\n",
82 | " \"shortest_path\",\n",
83 | " \"jaccard\",\n",
84 | " \"adar\",\n",
85 | " \"preferential_attachment\",\n",
86 | " \"resource_allocation_index\",\n",
87 | " \"out_neighbors\",\n",
88 | " \"in_neighbors\",\n",
89 | " \"popularity\",\n",
90 | " \"common_neighbors\"\n",
91 | "]\n",
92 | "my_features_index = []\n",
93 | "my_features_dic = {}\n",
94 | "\n",
95 | "target = 0\n",
96 | "for i in range(len(training.columns)):\n",
97 | " if training.columns[i] == \"target\":\n",
98 | " target = i\n",
99 | " elif training.columns[i] in my_features_string:\n",
100 | " my_features_dic.update({len(my_features_index): training.columns[i]})\n",
101 | " my_features_index.append(i)\n",
102 | "\n",
103 | "# separating features and labels\n",
104 | "training_val = training.values\n",
105 | "testing_val = testing.values\n",
106 | "X_train, Y_train = training_val[:, my_features_index].astype(float), training_val[:, target].astype(int)\n",
107 | "X_test = testing_val[:, my_features_index]\n",
108 | "\n",
109 | "now = datetime.datetime.now()\n",
110 | "print(\"date: \"+str(now))\n",
111 | "print(\"features: \"+str(my_features_string))\n",
112 | "print(\"model: Random Forest\")\n",
113 | "print(\"parameters:\")\n",
114 | "print(parameters)\n",
115 | "print(\"cross validation:\")\n",
116 | "\n",
117 | "RF = RandomForestClassifier(n_estimators=parameters[\"n_estimators\"])\n",
118 | "k = 5\n",
119 | "kf = KFold(k)\n",
120 | "predictions = np.zeros((X_test.shape[0], k))\n",
121 | "i = 0\n",
122 | "\n",
123 | "for train_index, test_index in kf.split(X_train, Y_train):\n",
124 | " RF.fit(X_train[train_index], Y_train[train_index])\n",
125 | " Y_pred = RF.predict(X_train[test_index])\n",
126 | " Y_pred_train = RF.predict(X_train[train_index])\n",
127 | " predictions[:, i] = RF.predict(X_test)\n",
128 | " print(\"train: \"+str(f1_score(Y_train[train_index], Y_pred_train)))\n",
129 | " print(\"test: \"+str(f1_score(Y_train[test_index], Y_pred)))\n",
130 | " i += 1\n",
131 | "\n",
132 | "Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)\n",
133 | "\n",
134 | "submission = pd.DataFrame(Y_test)\n",
135 | "submission.to_csv(\n",
136 | " path_or_buf=path_to_submissions+\"-\".join(my_features_string)+\".csv\",\n",
137 | " index=True,\n",
138 | " index_label=\"id\",\n",
139 | " header=[\"category\"]\n",
140 | ")\n",
141 | "print(\"kaggle score: \")\n",
142 | "\n",
143 | "for i in range(len(RF.feature_importances_)):\n",
144 | " print(str(my_features_dic[i]) + \": \" + str(RF.feature_importances_[i]))"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 3,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "data": {
291 | "text/plain": [
292 | " id1 id2 target overlap_title date_diff common_author \\\n",
293 | "0 9510123 9502114 1 2 0 0 \n",
294 | "1 9707075 9604178 1 1 1 0 \n",
295 | "2 9312155 9506142 0 0 -2 0 \n",
296 | "3 9911255 302165 0 0 -4 0 \n",
297 | "4 9701033 209076 0 0 -5 0 \n",
298 | "\n",
299 | " journal_similarity shortest_path overlapping_words_abstract jaccard \\\n",
300 | "0 2 -1.0 4 0.066667 \n",
301 | "1 0 2.0 7 0.098039 \n",
302 | "2 0 -1.0 6 0.000000 \n",
303 | "3 0 -1.0 8 0.000000 \n",
304 | "4 0 -1.0 8 0.000000 \n",
305 | "\n",
306 | " adar preferential_attachment resource_allocation_index \\\n",
307 | "0 0.513898 55.0 0.142857 \n",
308 | "1 4.320366 11388.0 0.226401 \n",
309 | "2 0.000000 5.0 0.000000 \n",
310 | "3 0.000000 280.0 0.000000 \n",
311 | "4 0.000000 168.0 0.000000 \n",
312 | "\n",
313 | " out_neighbors in_neighbors popularity \n",
314 | "0 2.0 7.0 76.0 \n",
315 | "1 67.0 123.0 4019.0 \n",
316 | "2 0.0 2.0 8.0 \n",
317 | "3 16.0 2.0 3.0 \n",
318 | "4 0.0 2.0 1.0 "
319 | ]
320 | },
321 | "execution_count": 3,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "training.head()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": []
336 | }
337 | ],
338 | "metadata": {
339 | "kernelspec": {
340 | "display_name": "Python 3",
341 | "language": "python",
342 | "name": "python3"
343 | },
344 | "language_info": {
345 | "codemirror_mode": {
346 | "name": "ipython",
347 | "version": 3
348 | },
349 | "file_extension": ".py",
350 | "mimetype": "text/x-python",
351 | "name": "python",
352 | "nbconvert_exporter": "python",
353 | "pygments_lexer": "ipython3",
354 | "version": "3.5.2"
355 | }
356 | },
357 | "nbformat": 4,
358 | "nbformat_minor": 2
359 | }
360 |
--------------------------------------------------------------------------------
/models/feature_selection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import datetime\n",
10 | "from sklearn.model_selection import KFold\n",
11 | "from sklearn.ensemble import RandomForestClassifier\n",
12 | "from sklearn.metrics import accuracy_score\n",
13 | "import pandas as pd\n",
14 | "import numpy as np\n",
15 | "\n",
16 | "from tools import f1_score\n",
17 | "\n",
18 | "# path\n",
19 | "path_to_data = \"../../data/\"\n",
20 | "path_to_submissions = \"../../submissions/\"\n",
21 | "\n",
22 | "parameters = {\n",
23 | " \"n_estimators\": 10,\n",
24 | " \"criterion\": \"entropy\", # default = gini\n",
25 | " \"bootstrap\": True\n",
26 | "}\n",
27 | "# parameters\n",
28 | "\n",
29 | "# load data\n",
30 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n",
31 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n",
32 | "del training[\"my_index\"]\n",
33 | "del testing[\"my_index\"]\n",
34 | "\n",
35 | "\n",
36 | "import pandas\n",
37 | "import numpy\n",
38 | "from sklearn.feature_selection import SelectKBest\n",
39 | "from sklearn.feature_selection import chi2\n",
40 | "# load data\n",
41 | "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data\"\n",
42 | "names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
43 | "dataframe = pandas.read_csv(url, names=names)\n",
44 | "array = dataframe.values\n",
45 | "X = array[:,0:8]\n",
46 | "Y = array[:,8]\n",
47 | "# feature extraction\n",
48 | "test = SelectKBest(score_func=chi2, k=4)\n",
49 | "fit = test.fit(X, Y)\n",
50 | "# summarize scores\n",
51 | "numpy.set_printoptions(precision=3)\n",
52 | "print(fit.scores_)\n",
53 | "features = fit.transform(X)\n",
54 | "# summarize selected features\n",
55 | "print(features[0:5,:])"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.5.2"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 0
80 | }
81 |
--------------------------------------------------------------------------------
/models/feature_selection.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.ensemble import RandomForestClassifier
3 | from sklearn.model_selection import KFold
4 |
5 | from models.tools import f1_score
6 |
7 | # path
8 | path_to_data = "data/"
9 | path_to_submissions = "submissions/"
10 | path_to_stacking = "stacking"
11 | path_to_plots = "plots"
12 |
13 | # tuned hyper-parameters
14 |
15 | parameters = {
16 | "n_estimators": 100,
17 | "criterion": "entropy", # default = gini
18 | "max_depth": 20,
19 | "min_samples_leaf": 10,
20 | "bootstrap": True,
21 | "n_jobs": -1
22 | }
23 |
24 | # load data
25 | training = pd.read_csv(path_to_data + "training_features.txt")
26 | del training["my_index"]
27 |
28 | # replace inf in shortest_path with -1
29 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1])
30 |
31 | my_features_string = [
32 | "date_diff",
33 | "overlap_title",
34 | "common_author",
35 | "score_1_2",
36 | "score_2_1",
37 | "cosine_distance",
38 | "journal_similarity",
39 | "overlapping_words_abstract",
40 | "jaccard",
41 | "adar",
42 | "preferential_attachment",
43 | "resource_allocation_index",
44 | "out_neighbors",
45 | "in_neighbors",
46 | "common_neighbors",
47 | "shortest_path",
48 | "popularity",
49 | "common_successors",
50 | "common_predecessors",
51 | "paths_of_length_one",
52 | "authors_citation",
53 | "normalized_authors_citation",
54 | "best_authors_citation",
55 | "coauthor_score",
56 | "normalized_coauthor_score",
57 | "best_coauthor_score",
58 | "authors_in_neighbors",
59 | "normalized_authors_in_neighbors",
60 | "best_authors_in_neighbors"
61 | ]
62 |
63 | my_features_index = []
64 | my_features_dic = {}
65 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
66 |
67 | target = 0
68 | for i in range(len(training.columns)):
69 | if training.columns[i] == "target":
70 | target = i
71 |
72 | Y_train = training.values[:, target].astype(int)
73 |
74 | del training["target"]
75 |
76 | already_computed_names = []
77 | already_computed = []
78 |
79 | for i in range(len(training.columns)):
80 | if training.columns[i] in my_features_string:
81 | my_features_dic.update({i: training.columns[i]})
82 | my_features_index.append(i)
83 | if training.columns[i] in already_computed_names:
84 | already_computed.append(i)
85 |
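86 | # greedy forward selection: at each round, every remaining candidate feature is added in turn to the
87 | # current set and scored with 2-fold cross-validated F1; the best one is kept for the next round.
88 | # the try/except below only re-adds features listed in already_computed (presumably a way to replay a
89 | # previous run) before a fresh selection round is started with the remaining candidates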
86 | features_to_keep = []
87 | for u in range(len(my_features_index)):
88 |
89 | try:
90 | features_to_keep.append(already_computed[u])
91 | print("added already computed feature " + str(my_features_dic[already_computed[u]]))
92 |     except IndexError:
93 |
94 | features_to_keep_names = [my_features_dic[i] for i in features_to_keep]
95 | print("new round !")
96 | print("u = " + str(u) + ", current features are: " + str(features_to_keep_names))
97 | best_test_score = 0.0
98 | best_train_score = 0.0
99 | best_index = 0
100 | for i, f in my_features_dic.items():
101 | if i not in features_to_keep:
102 | # separating features and labels
103 | print("testing additional feature: " + f)
104 | current_features = features_to_keep + [i]
105 |
106 | X_train = training.values[:, current_features]
107 |
108 | RF = RandomForestClassifier(
109 | n_estimators=parameters["n_estimators"],
110 | criterion=parameters["criterion"],
111 | max_depth=parameters["max_depth"],
112 | min_samples_leaf=parameters["min_samples_leaf"],
113 | bootstrap=parameters["bootstrap"],
114 | n_jobs=parameters["n_jobs"]
115 | )
116 | k = 2
117 | kf = KFold(k)
118 | train_score = 0.0
119 | test_score = 0.0
120 |
121 | for train_index, test_index in kf.split(X_train, Y_train):
122 | RF.fit(X_train[train_index], Y_train[train_index])
123 | Y_pred = RF.predict(X_train[test_index])
124 | Y_pred_train = RF.predict(X_train[train_index])
125 | train_score += f1_score(Y_train[train_index], Y_pred_train)
126 | test_score += f1_score(Y_train[test_index], Y_pred)
127 |
128 | train_score /= k
129 | test_score /= k
130 |
131 | if test_score > best_test_score:
132 | best_index = i
133 | best_train_score = train_score
134 | best_test_score = test_score
135 |
136 | print("train score: " + str(train_score))
137 | print("test score: " + str(test_score))
138 | print("")
139 |
140 | print("for this round, the best feature was " + my_features_dic[best_index])
141 | features_to_keep.append(best_index)
142 | print("the scores obtained were: ")
143 | print("train score: " + str(best_train_score))
144 | print("test score: " + str(best_test_score))
145 | print("\n\n\n\n")
146 |
147 | # # print feature importances
148 | # for i in range(len(RF.feature_importances_)):
149 | # print(str(my_features_dic[i]) + ": " + str(RF.feature_importances_[i]))
150 |
--------------------------------------------------------------------------------
/models/lgbm.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import lightgbm as lgb
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.model_selection import KFold
7 |
8 | from models.tools import f1_score, f1_score_lgbm, load_data
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_submissions = "submissions/"
13 | path_to_stacking = "stacking/"
14 | path_to_plots = "models/plots/"
15 |
16 | # tuned hyper-parameters
17 | parameters = {
18 | 'task': 'train',
19 | 'boosting_type': 'gbdt',
20 | 'objective': 'binary',
21 | # 'metric': {},
22 | 'num_leaves': 200,
23 | 'learning_rate': 0.1,
24 | 'feature_fraction': 0.5,
25 | 'bagging_fraction': 0.6,
26 | 'bagging_freq': 5,
27 | 'verbose': 0,
28 | "min_data_in_leaf": 3,
29 | "max_depth": 150
30 | }
31 | # used features
32 |
33 | my_features_string = [
34 | "date_diff",
35 | "overlap_title",
36 | "common_author",
37 | # "score_1_2",
38 | # "score_2_1",
39 | "cosine_distance",
40 | "journal_similarity",
41 | # "overlapping_words_abstract",
42 | # "jaccard",
43 | # "adar",
44 | "preferential_attachment",
45 | # "resource_allocation_index",
46 | "out_neighbors",
47 | "in_neighbors",
48 | "common_neighbors",
49 | "shortest_path",
50 | "popularity",
51 | "common_successors",
52 | "common_predecessors",
53 | "paths_of_length_one",
54 | "authors_citation",
55 | "coauthor_score"
56 | # "katz"
57 | # "katz_2"
58 | ]
59 |
60 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
61 |
62 | # load data
63 |
64 | (X_train,
65 | X_test,
66 | Y_train,
67 | my_features_index,
68 | my_features_dic) = load_data(my_features_string)
69 |
70 | # print user info
71 | now = datetime.datetime.now()
72 | print("date: " + str(now))
73 | print("features: " + str(my_features_string))
74 | print("model: LGBM")
75 | print("parameters:")
76 | print(parameters)
77 | print("cross validation:")
78 |
79 | # instantiate Kfold and predictions placeholder
80 | k = 5
81 | kf = KFold(k)
82 | predictions = np.zeros((X_test.shape[0], k))
83 | predictions_train = np.zeros(X_train.shape[0])
84 | i = 0
85 |
86 | # for each fold store predictions on test set and print validation results
87 | results = []
88 | print('Start training...')
89 | for train_index, test_index in kf.split(X_train):
90 | lgb_train = lgb.Dataset(X_train[train_index], Y_train[train_index])
91 | lgb_eval = lgb.Dataset(X_train[test_index], Y_train[test_index], reference=lgb_train)
92 | gbm = lgb.train(parameters,
93 | train_set=lgb_train,
94 | num_boost_round=100,
95 | valid_sets=lgb_eval,
96 | verbose_eval=40,
97 | feval=f1_score_lgbm
98 | )
99 | res = gbm.predict(X_test)
100 | Y_pred = gbm.predict(X_train[test_index])
101 | Y_pred_train = gbm.predict(X_train[train_index])
102 | predictions[:, i] = res
103 | predictions_train[test_index] = Y_pred
104 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train.round())))
105 | print("test: " + str(f1_score(Y_train[test_index], Y_pred.round())))
106 | i += 1
107 |
108 | # save submission file: label 1 when the 5 fold probabilities sum to more than 2.5 (i.e. mean probability > 0.5)
109 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
110 | submission = pd.DataFrame(Y_test)
111 | submission.to_csv(
112 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "lgbm" + ".csv",
113 | index=True,
114 | index_label="id",
115 | header=["category"]
116 | )
117 |
118 | # save probabilities for stacking
119 | stacking_logits_test = np.sum(predictions, axis=1)
120 | stacking_test = pd.DataFrame(stacking_logits_test)
121 | stacking_test.to_csv(
122 | path_or_buf=path_to_stacking + "lgbm_test" + ".csv",
123 | index=True,
124 | index_label="id",
125 | header=["category"]
126 | )
127 |
128 | stacking_train = pd.DataFrame(predictions_train)
129 | stacking_train.to_csv(
130 | path_or_buf=path_to_stacking + "lgbm_train" + ".csv",
131 | index=True,
132 | index_label="id",
133 | header=["category"]
134 | )
135 |
--------------------------------------------------------------------------------
/models/logistic_regression.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.linear_model import LogisticRegressionCV
6 | from sklearn.model_selection import KFold
7 |
8 | from models.tools import f1_score
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_submissions = "submissions/"
13 |
14 | # parameters
15 | parameters = {
16 | "max_iter": 100,
17 | "tol": 1e-6,
18 | "penalty": "l2"
19 | }
20 |
21 | # load data
22 | training = pd.read_csv(path_to_data + "training_features.txt")
23 | testing = pd.read_csv(path_to_data + "testing_features.txt")
24 | del training["my_index"]
25 | del testing["my_index"]
26 |
27 | # replace inf in shortest_path with -1
28 | training['shortest_path'] = training['shortest_path'].replace([float('inf')], [-1])
29 | testing['shortest_path'] = testing['shortest_path'].replace([float('inf')], [-1])
30 |
31 | my_features_string = [
32 | "date_diff",
33 | "overlap_title",
34 | "common_author",
35 | "score_1_2",
36 | # "score_2_1",
37 | # "cosine_distance",
38 | # "journal_similarity",
39 | # "overlapping_words_abstract",
40 | # "jaccard",
41 | # "adar",
42 | # "preferential_attachment",
43 | # "resource_allocation_index",
44 | # "out_neighbors",
45 | # "in_neighbors",
46 | # "common_neighbors",
47 | "shortest_path",
48 | "popularity"
49 | ]
50 | my_features_index = []
51 | my_features_dic = {}
52 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
53 |
54 | target = 0
55 | for i in range(len(training.columns)):
56 | if training.columns[i] == "target":
57 | target = i
58 |
59 | Y_train = training.values[:, target].astype(int)
60 |
61 | del training["target"]
62 |
63 | for i in range(len(training.columns)):
64 | if training.columns[i] in my_features_string:
65 | my_features_dic.update({i: training.columns[i]})
66 | my_features_index.append(i)
67 |
68 | # separating features and labels
69 | training_val = training.values
70 | testing_val = testing.values
71 | X_train = training_val[:, my_features_index].astype(float)
72 | X_test = testing_val[:, my_features_index]
73 |
74 | now = datetime.datetime.now()
75 | print("date: " + str(now))
76 | print("features: " + str(my_features_string))
77 | print("model: Logistic Regression")
78 | print("parameters:")
79 | print(parameters)
80 | print("cross validation:")
81 |
82 | LogReg = LogisticRegressionCV(max_iter=parameters['max_iter'],
83 | tol=parameters['tol'],
84 | penalty=parameters['penalty'])
85 | k = 5
86 | kf = KFold(k)
87 | predictions = np.zeros((X_test.shape[0], k))
88 | i = 0
89 |
90 | for train_index, test_index in kf.split(X_train, Y_train):
91 | LogReg.fit(X_train[train_index], Y_train[train_index])
92 | Y_pred = LogReg.predict(X_train[test_index])
93 | Y_pred_train = LogReg.predict(X_train[train_index])
94 | predictions[:, i] = LogReg.predict(X_test)
95 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
96 | print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
97 | i += 1
98 |
99 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
100 |
101 | # submission = pd.DataFrame(Y_test)
102 | # submission.to_csv(
103 | # path_or_buf=path_to_submissions+"-".join(my_features_string)+"LogReg.csv",
104 | # index=True,
105 | # index_label="id",
106 | # header=["category"]
107 | # )
108 |
--------------------------------------------------------------------------------
/models/nn.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import numpy
4 | import numpy as np
5 | import pandas as pd
6 | from keras.layers import Dense, Dropout
7 | from keras.models import Sequential
8 | from keras.wrappers.scikit_learn import KerasClassifier
9 | from sklearn.model_selection import StratifiedKFold
10 | from sklearn.preprocessing import StandardScaler
11 |
12 | from models.tools import load_data
13 |
14 | # path
15 | path_to_data = "data/"
16 | path_to_submissions = "submissions/"
17 | path_to_stacking = "stacking/"
18 | path_to_plots = "plots/"
19 |
20 | # tuned hyper-parameters (for the log printed below; the same values are used when building the model)
21 |
22 | parameters = {
23 |     "epochs": 30,
24 |     "batch_size": 128,
25 |     "dropout_rate": 0.2,
26 |     "neurons": 75,
27 |     "activation": "relu"
28 | }
29 |
30 |
31 | # features used
32 |
33 | my_features_string = [
34 | "date_diff",
35 | "overlap_title",
36 | "common_author",
37 | "score_1_2",
38 | "score_2_1",
39 | "cosine_distance",
40 | "journal_similarity",
41 | # "overlapping_words_abstract",
42 | "jaccard",
43 | "adar",
44 | "preferential_attachment",
45 | "resource_allocation_index",
46 | "out_neighbors",
47 | "in_neighbors",
48 | "common_neighbors",
49 | # "shortest_path",
50 | "popularity",
51 | "authors_citation",
52 | "coauthor_score"
53 | # "paths_of_length_one"
54 | # "katz"
55 | # "katz_2"
56 | ]
57 |
58 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
59 |
60 | (X_train,
61 | X_test,
62 | Y_train,
63 | my_features_index,
64 | my_features_dic) = load_data(my_features_string)
65 |
66 | # normalize data
67 | scaler = StandardScaler()
68 | X_train = scaler.fit_transform(X_train)
69 | X_test = scaler.transform(X_test)
70 |
71 | # Function to create model, required for KerasClassifier
72 | nb_input = len(my_features_string)
73 |
74 |
75 | def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
76 | # create model
77 | model = Sequential()
78 | model.add(Dense(neurons, input_dim=nb_input, activation=activation))
79 | model.add(Dropout(dropout_rate))
80 | model.add(Dense(1, input_dim=nb_input, activation='sigmoid'))
81 | # Compile model
82 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
83 | return model
84 |
85 |
86 | # parameters
87 | epochs = 30
88 | batch_size = 128
89 |
90 | # tuned parameters
91 | dropout_rate = 0.2
92 | neurons = 75
93 |
94 | # fix random seed for reproducibility
95 | seed = 7
96 | numpy.random.seed(seed)
97 |
98 | # instantiate classifier
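99 | # (KerasClassifier passes the dropout_rate and neurons keyword arguments given here through to
100 | # create_model, so the network is built with the tuned values above)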
99 | nn = KerasClassifier(build_fn=create_model,
100 | epochs=epochs,
101 | batch_size=batch_size,
102 | dropout_rate=dropout_rate,
103 | neurons=neurons,
104 | verbose=1
105 | )
106 |
107 | # print user info
108 | now = datetime.datetime.now()
109 | print("date: " + str(now))
110 | print("features: " + str(my_features_string))
111 | print("model: Neural Network")
112 | print("parameters:")
113 | print(parameters)
114 | print("cross validation:")
115 |
116 | # instantiate Kfold and predictions placeholder
117 | k = 5
118 | kf = StratifiedKFold(k)
119 | predictions = np.zeros((X_test.shape[0], k))
120 | predictions_test = np.zeros((X_test.shape[0], k))
121 | predictions_train = np.zeros(X_train.shape[0])
122 | i = 0
123 |
124 | # for each fold store predictions on test set and print validation results
125 | test_score = 0.0
126 | for train_index, test_index in kf.split(X_train, Y_train):
127 | nn.fit(X_train[train_index], Y_train[train_index])
128 | Y_pred = nn.predict(X_train[test_index])[:, 0]
129 | Y_pred_train = nn.predict(X_train[train_index])[:, 0]
130 | predictions[:, i] = nn.predict(X_test)[:, 0]
131 | predictions_test[:, i] = nn.predict_proba(X_test)[:, 1]
132 | predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1]
133 | # current_test_score = f1_score(Y_train[test_index], Y_pred)[:, 0]
134 | # test_score += current_test_score
135 | # print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
136 | # print("test: " + str(current_test_score))
137 | i += 1
138 | # print("CV test score: "+str(test_score/k))
139 |
140 | # save submission file
141 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
142 | submission = pd.DataFrame(Y_test)
143 | submission.to_csv(
144 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn.csv",
145 | index=True,
146 | index_label="id",
147 | header=["category"]
148 | )
149 |
150 | # save probabilities for stacking
151 | stacking_logits_test = np.sum(predictions_test, axis=1)
152 | stacking_test = pd.DataFrame(stacking_logits_test)
153 | stacking_test.to_csv(
154 | path_or_buf=path_to_stacking + "nn_test" + ".csv",
155 | index=True,
156 | index_label="id",
157 | header=["category"]
158 | )
159 |
160 | stacking_train = pd.DataFrame(predictions_train)
161 | stacking_train.to_csv(
162 | path_or_buf=path_to_stacking + "nn_train" + ".csv",
163 | index=True,
164 | index_label="id",
165 | header=["category"]
166 | )
167 |
--------------------------------------------------------------------------------
/models/nn_deep.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import numpy
4 | import numpy as np
5 | import pandas as pd
6 | from keras.layers import Dense, Dropout
7 | from keras.models import Sequential
8 | from keras.wrappers.scikit_learn import KerasClassifier
9 | from sklearn.model_selection import StratifiedKFold
10 | from sklearn.preprocessing import StandardScaler
11 |
12 | from models.tools import load_data
13 |
14 | # path
15 | path_to_data = "data/"
16 | path_to_submissions = "submissions/"
17 | path_to_stacking = "stacking/"
18 | path_to_plots = "plots/"
19 |
20 | # tuned hyper-parameters (for the log printed below; the same values are used when building the model)
21 |
22 | parameters = {
23 |     "epochs": 30,
24 |     "batch_size": 128,
25 |     "dropout_rate": 0.2,
26 |     "neurons": 75,
27 |     "activation": "relu"
28 | }
29 |
30 |
31 | # used features
32 |
33 | my_features_string = [
34 | "date_diff",
35 | "overlap_title",
36 | "common_author",
37 | "score_1_2",
38 | "score_2_1",
39 | "cosine_distance",
40 | "journal_similarity",
41 | # "overlapping_words_abstract",
42 | "jaccard",
43 | "adar",
44 | "preferential_attachment",
45 | "resource_allocation_index",
46 | "out_neighbors",
47 | "in_neighbors",
48 | "common_neighbors",
49 | # "shortest_path",
50 | "popularity",
51 | # "paths_of_length_one"
52 | # "katz"
53 | # "katz_2"
54 | ]
55 |
56 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
57 |
58 | (X_train,
59 | X_test,
60 | Y_train,
61 | my_features_index,
62 | my_features_dic) = load_data(my_features_string)
63 |
64 | # normalize data
65 | scaler = StandardScaler()
66 | X_train = scaler.fit_transform(X_train)
67 | X_test = scaler.transform(X_test)
68 |
69 | # Function to create model, required for KerasClassifier
70 | nb_input = len(my_features_string)
71 |
72 |
73 | def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
74 | # create model
75 | model = Sequential()
76 | model.add(Dense(neurons, input_dim=nb_input, activation=activation))
77 | model.add(Dropout(dropout_rate))
78 | model.add(Dense(2 * neurons, activation=activation))
79 | model.add(Dropout(dropout_rate))
80 | model.add(Dense(1, input_dim=nb_input, activation='sigmoid'))
81 | # Compile model
82 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
83 | return model
84 |
85 |
86 | # parameters
87 | epochs = 30
88 | batch_size = 128
89 |
90 | # tuned parameters
91 | dropout_rate = 0.2
92 | neurons = 75
93 |
94 | # fix random seed for reproducibility
95 | seed = 7
96 | numpy.random.seed(seed)
97 |
98 | # instantiate classifier
99 | nn = KerasClassifier(build_fn=create_model,
100 | epochs=epochs,
101 | batch_size=batch_size,
102 | dropout_rate=dropout_rate,
103 | neurons=neurons,
104 | verbose=1
105 | )
106 |
107 | # print user info
108 | now = datetime.datetime.now()
109 | print("date: " + str(now))
110 | print("features: " + str(my_features_string))
111 | print("model: Neural Network")
112 | print("parameters:")
113 | print(parameters)
114 | print("cross validation:")
115 |
116 | # instantiate Kfold and predictions placeholder
117 | k = 5
118 | kf = StratifiedKFold(k)
119 | predictions = np.zeros((X_test.shape[0], k))
120 | predictions_test = np.zeros((X_test.shape[0], k))
121 | predictions_train = np.zeros(X_train.shape[0])
122 | i = 0
123 |
124 | # for each fold store predictions on test set and print validation results
125 | test_score = 0.0
126 | for train_index, test_index in kf.split(X_train, Y_train):
127 | nn.fit(X_train[train_index], Y_train[train_index])
128 | Y_pred = nn.predict(X_train[test_index])[:, 0]
129 | Y_pred_train = nn.predict(X_train[train_index])[:, 0]
130 | predictions[:, i] = nn.predict(X_test)[:, 0]
131 | predictions_test[:, i] = nn.predict_proba(X_test)[:, 1]
132 | predictions_train[test_index] = nn.predict_proba(X_train[test_index])[:, 1]
133 | # current_test_score = f1_score(Y_train[test_index], Y_pred)[:, 0]
134 | # test_score += current_test_score
135 | # print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
136 | # print("test: " + str(current_test_score))
137 | i += 1
138 | # print("CV test score: "+str(test_score/k))
139 |
140 | # save submission file
141 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
142 | submission = pd.DataFrame(Y_test)
143 | submission.to_csv(
144 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "nn_deep.csv",
145 | index=True,
146 | index_label="id",
147 | header=["category"]
148 | )
149 |
150 | # save probabilities for stacking
151 | stacking_logits_test = np.sum(predictions_test, axis=1)
152 | stacking_test = pd.DataFrame(stacking_logits_test)
153 | stacking_test.to_csv(
154 | path_or_buf=path_to_stacking + "nn_deep_test" + ".csv",
155 | index=True,
156 | index_label="id",
157 | header=["category"]
158 | )
159 |
160 | stacking_train = pd.DataFrame(predictions_train)
161 | stacking_train.to_csv(
162 | path_or_buf=path_to_stacking + "nn_deep_train" + ".csv",
163 | index=True,
164 | index_label="id",
165 | header=["category"]
166 | )
167 |
--------------------------------------------------------------------------------
/models/plots/rf_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/plots/rf_importance.png
--------------------------------------------------------------------------------
/models/plots/rf_importance_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/plots/rf_importance_full.png
--------------------------------------------------------------------------------
/models/random_forest.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.model_selection import KFold
7 |
8 | from models.tools import f1_score, plot_importance, load_data
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_submissions = "submissions/"
13 | path_to_stacking = "stacking/"
14 | path_to_plots = "plots/"
15 |
16 | # tuned hyper-parameters
17 |
18 | parameters = {
19 | "n_estimators": 150,
20 | "criterion": "entropy", # default = gini
21 | "max_depth": 20, # 9
22 | "min_samples_leaf": 10, # 10
23 | "bootstrap": True,
24 | "n_jobs": -1
25 | }
26 |
27 | # used features
28 |
29 | my_features_string = [
30 | "date_diff",
31 | # "overlap_title",
32 | "common_author",
33 | # "score_1_2",
34 | # "score_2_1",
35 | "cosine_distance",
36 | # "journal_similarity",
37 | # "overlapping_words_abstract",
38 | # "jaccard",
39 | # "adar",
40 | "preferential_attachment",
41 | # "resource_allocation_index",
42 | # "out_neighbors",
43 | "in_neighbors",
44 | "common_neighbors",
45 | # "shortest_path",
46 | # "popularity",
47 | # "common_successors",
48 | # "common_predecessors",
49 | # "paths_of_length_one",
50 | "authors_citation",
51 | # "coauthor_score",
52 | # "katz",
53 | # "katz_2"
54 | ]
55 |
56 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
57 |
58 | # load data
59 |
60 | (X_train,
61 | X_test,
62 | Y_train,
63 | my_features_index,
64 | my_features_dic) = load_data(my_features_string)
65 |
66 | # print user info
67 | now = datetime.datetime.now()
68 | print("date: " + str(now))
69 | print("features: " + str(my_features_string))
70 | print("model: Random Forest")
71 | print("parameters:")
72 | print(parameters)
73 | print("cross validation:")
74 |
75 | # instantiate classifier
76 | RF = RandomForestClassifier(
77 | n_estimators=parameters["n_estimators"],
78 | criterion=parameters["criterion"],
79 | max_depth=parameters["max_depth"],
80 | min_samples_leaf=parameters["min_samples_leaf"],
81 | bootstrap=parameters["bootstrap"],
82 | n_jobs=parameters["n_jobs"]
83 | )
84 |
85 | # instantiate Kfold and predictions placeholder
86 | k = 2
87 | kf = KFold(k)
88 | predictions = np.zeros((X_test.shape[0], k))
89 | predictions_test = np.zeros((X_test.shape[0], k))
90 | predictions_train = np.zeros(X_train.shape[0])
91 | i = 0
92 |
93 | # for each fold store predictions on test set and print validation results
94 | test_score = 0.0
95 | for train_index, test_index in kf.split(X_train, Y_train):
96 | RF.fit(X_train[train_index], Y_train[train_index])
97 | Y_pred = RF.predict(X_train[test_index])
98 | Y_pred_train = RF.predict(X_train[train_index])
99 | predictions[:, i] = RF.predict(X_test)
100 | predictions_test[:, i] = RF.predict_proba(X_test)[:, 1]
101 | predictions_train[test_index] = RF.predict_proba(X_train[test_index])[:, 1]
102 | current_test_score = f1_score(Y_train[test_index], Y_pred)
103 | test_score += current_test_score
104 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
105 | print("test: " + str(current_test_score))
106 | i += 1
107 |
108 | print("CV test score: " + str(test_score / k))
109 | # save submission file
110 | Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)  # majority vote over the k folds
111 | submission = pd.DataFrame(Y_test)
112 | submission.to_csv(
113 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "RF.csv",
114 | index=True,
115 | index_label="id",
116 | header=["category"]
117 | )
118 |
119 | # save probabilities for stacking
120 | stacking_logits_test = np.sum(predictions_test, axis=1)
121 | stacking_test = pd.DataFrame(stacking_logits_test)
122 | stacking_test.to_csv(
123 | path_or_buf=path_to_stacking + "rf_test_2" + ".csv",
124 | index=True,
125 | index_label="id",
126 | header=["category"]
127 | )
128 |
129 | stacking_train = pd.DataFrame(predictions_train)
130 | stacking_train.to_csv(
131 | path_or_buf=path_to_stacking + "rf_train_2" + ".csv",
132 | index=True,
133 | index_label="id",
134 | header=["category"]
135 | )
136 |
137 | # plot feature importances
138 | plot_importance(RF,
139 | features_dict=my_features_dic,
140 | features_index=my_features_index,
141 | name='rf_importance')
142 |
--------------------------------------------------------------------------------
/models/svm.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn import svm
6 | from sklearn.model_selection import StratifiedKFold
7 | from sklearn.preprocessing import StandardScaler
8 |
9 | from models.tools import f1_score, load_data
10 |
11 | # path
12 | path_to_data = "data/"
13 | path_to_submissions = "submissions/"
14 | path_to_stacking = "stacking/"
15 | path_to_plots = "models/plots"
16 |
17 | # used features
18 |
19 | my_features_string = [
20 | "date_diff",
21 | "overlap_title",
22 | "common_author",
23 | # "score_1_2",
24 | # "score_2_1",
25 | "cosine_distance",
26 | # "journal_similarity",
27 | # # "overlapping_words_abstract",
28 | # "jaccard",
29 | # "adar",
30 | "preferential_attachment",
31 | # "resource_allocation_index",
32 | "out_neighbors",
33 | "in_neighbors",
34 | "common_neighbors",
35 | # # "shortest_path",
36 | # "popularity",
37 | # # "paths_of_length_one"
38 | # "katz"
39 | # "katz_2"
40 | ]
41 |
42 | my_features_acronym = ["_".join(list(map(lambda x: x[0], string.split('_')))) for string in my_features_string]
43 |
44 | # load data
45 |
46 | (X_train,
47 | X_test,
48 | Y_train,
49 | my_features_index,
50 | my_features_dic) = load_data(my_features_string)
51 |
52 | # normalize data
53 | scaler = StandardScaler()
54 | X_train = scaler.fit_transform(X_train)
55 | X_test = scaler.transform(X_test)
56 |
57 | # tuned hyperparameters
58 | parameters = {
59 | 'C': 0.1,
60 | 'gamma': 0.01,
61 | 'kernel': "linear"
62 | }
63 |
64 | # print user info
65 | now = datetime.datetime.now()
66 | print("date: " + str(now))
67 | print("features: " + str(my_features_string))
68 | print("model: SVM")
69 | print("parameters:")
70 | print(parameters)
71 | print("cross validation:")
72 |
73 | # instantiate classifier
74 | svm_classifier = svm.SVC(C=parameters['C'],
75 | gamma=parameters['gamma'],
76 | kernel=parameters['kernel'],
77 | probability=True,
78 | verbose=1)
79 |
80 | # instantiate Kfold and predictions placeholder
81 | k = 2
82 | kf = StratifiedKFold(k)
83 | predictions = np.zeros((X_test.shape[0], k))
84 | predictions_test = np.zeros((X_test.shape[0], k))
85 | predictions_train = np.zeros(X_train.shape[0])
86 | i = 0
87 |
88 | # for each fold store predictions on test set and print validation results
89 | for train_index, test_index in kf.split(X_train, Y_train):
90 | svm_classifier.fit(X_train[train_index], Y_train[train_index])
91 | Y_pred = svm_classifier.predict(X_train[test_index])
92 | Y_pred_train = svm_classifier.predict(X_train[train_index])
93 | predictions[:, i] = svm_classifier.predict(X_test)
94 | predictions_test[:, i] = svm_classifier.predict_proba(X_test)[:, 1]
95 | predictions_train[test_index] = svm_classifier.predict_proba(X_train[test_index])[:, 1]
96 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
97 | print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
98 | i += 1
99 |
100 | # save submission file
101 | Y_test = (np.sum(predictions, axis=1) > k / 2).astype(int)  # majority vote over the k folds
102 | submission = pd.DataFrame(Y_test)
103 | submission.to_csv(
104 | path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "SVM.csv",
105 | index=True,
106 | index_label="id",
107 | header=["category"]
108 | )
109 |
110 | # save probabilities for stacking
111 | stacking_logits_test = np.sum(predictions_test, axis=1)
112 | stacking_test = pd.DataFrame(stacking_logits_test)
113 | stacking_test.to_csv(
114 | path_or_buf=path_to_stacking + "svmlinear_test" + ".csv",
115 | index=True,
116 | index_label="id",
117 | header=["category"]
118 | )
119 |
120 | stacking_train = pd.DataFrame(predictions_train)
121 | stacking_train.to_csv(
122 | path_or_buf=path_to_stacking + "svmlinear_train" + ".csv",
123 | index=True,
124 | index_label="id",
125 | header=["category"]
126 | )
127 |
--------------------------------------------------------------------------------
/models/tools.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | import seaborn as sns
6 |
7 |
8 | def binary_error(preds, train_data):
9 | labels = train_data.get_label()
10 | return 'error', np.mean(labels != (preds > 0.5)), False
11 |
12 |
13 | def f1_score_lgbm(preds, train_data):
14 |     labels = train_data.get_label()
15 |     tp = np.sum((labels == 1) & (preds > 0.5))
16 |     tn = np.sum((labels == 0) & (preds <= 0.5))
17 |     fp = np.sum((labels == 0) & (preds > 0.5))
18 |     fn = np.sum((labels == 1) & (preds <= 0.5))
19 |     p = tp / (tp + fp)
20 |     r = tp / (tp + fn)
21 |
22 |     return 'f1 score', 2 * p * r / (p + r), True
23 |
24 |
25 | def f1_score(labels, preds):
26 |     tp = np.sum((labels == 1) & (preds == 1))
27 |     tn = np.sum((labels == 0) & (preds == 0))
28 |     fp = np.sum((labels == 0) & (preds == 1))
29 |     fn = np.sum((labels == 1) & (preds == 0))
30 |     p = tp / (tp + fp)
31 |     r = tp / (tp + fn)
32 |
33 |     return 2 * p * r / (p + r)
34 |
35 |
36 | def load_data(my_features_string):
37 | # path
38 | path_to_data = "data/"
39 |
40 | # feature tracking utils
41 | my_features_index = []
42 | my_features_dic = {}
43 |
44 | # load raw data
45 | training = pd.read_csv(path_to_data + "training_features.txt")
46 | testing = pd.read_csv(path_to_data + "testing_features.txt")
47 |
48 | del training["my_index"]
49 | del testing["my_index"]
50 |
51 | # track features and target
52 | target = 0
53 | for i in range(len(training.columns)):
54 | if training.columns[i] == "target":
55 | target = i
56 |
57 | Y_train = training.values[:, target].astype(int)
58 |
59 | del training["target"]
60 |
61 | for i in range(len(training.columns)):
62 | if training.columns[i] in my_features_string:
63 | my_features_dic.update({i: training.columns[i]})
64 | my_features_index.append(i)
65 |
66 | # separating features and labels
67 | training_val = training.values
68 | testing_val = testing.values
69 | X_train = training_val[:, my_features_index].astype(float)
70 | X_test = testing_val[:, my_features_index]
71 |
72 | del training_val
73 | del testing_val
74 |
75 | print(training.head())
76 | print(testing.head())
77 |
78 | return X_train, X_test, Y_train, my_features_index, my_features_dic
79 |
80 |
81 | # plotting feature importances
82 | def plot_importance(rf, features_dict, features_index, name):
83 | # plot settings
84 | sns.set_style("darkgrid")
85 | mpl.rcParams['figure.dpi'] = 200
86 | # mpl.rcParams['figure.tight_layout'] = True
87 | path_to_plot = "models/plots/"
88 |
89 | # fetch mean importances
90 | importances = rf.feature_importances_
91 | # compute std using each estimator in the forest
92 | std = np.std([tree.feature_importances_ for tree in rf.estimators_],
93 | axis=0)
94 | # argsort the values
95 | index = list(map(int, np.argsort(importances)[::-1]))
96 | # Plot the feature importances of the rf
97 | plt.figure()
98 | # get axis
99 | fig, ax = plt.subplots(figsize=(6, 3))
100 | # add space for x labels
101 | plt.subplots_adjust(bottom=0.30)
102 | plt.title("Feature importances")
103 | # get number of features
104 | nb_features = len(features_dict)
105 | # plot with error bars
106 | plt.bar(range(nb_features), importances[index],
107 | color="r", yerr=std[index], align="center")
108 | # create x axis tickers
109 | plt.xticks(range(nb_features), index)
110 | # get feature names in right order
111 | index_features_sorted = np.array(features_index)[index]
112 | feature_names = list(map(lambda x: features_dict[x], index_features_sorted))
113 | # font dict to control x tickers labels
114 | ax.set_xticklabels(feature_names, rotation=40, fontsize=9, ha='right')
115 | plt.xlim([-1, nb_features])
116 | plt.ylim([0, 0.8])
117 | plt.savefig(path_to_plot + name)
118 | plt.show()
119 |
--------------------------------------------------------------------------------
/models/tuning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/tuning/__init__.py
--------------------------------------------------------------------------------
/models/tuning/objective_function.py:
--------------------------------------------------------------------------------
1 | class ObjectiveFunction:
2 | """class to analyze objective function optimization : hyperparameter tuning"""
3 |
4 | def __init__(self, func):
5 | self.f = func
6 | self.history_f = []
7 | self.history_fbest = None
8 | self.history_bests = []
9 |
10 | def __call__(self, x):
11 | val = self.f(x)
12 | self.history_f.append(-val)
13 | if self.history_fbest is None:
14 | self.history_fbest = val
15 | self.history_bests.append(-val)
16 | elif self.history_fbest > val:
17 | self.history_fbest = val
18 | self.history_bests.append(-val)
19 | else:
20 | self.history_bests.append(-self.history_fbest)
21 | return val
22 |
--------------------------------------------------------------------------------
/models/tuning/plots/grid_lgbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/models/tuning/plots/grid_lgbm.png
--------------------------------------------------------------------------------
/models/tuning/tools.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import seaborn as sns
5 |
6 |
7 | # Function to help plot the results of our cross-validation for the random forest algorithm.
8 | def triple_ticker_grid(tuned_parameters, parameter_1, parameter_2, parameter_3):
9 | ticker_labels = []
10 | for i in tuned_parameters[parameter_1]:
11 | for j in tuned_parameters[parameter_2]:
12 | for k in tuned_parameters[parameter_3]:
13 | ticker_labels.append(str((i, j, k)))
14 | return ticker_labels
15 |
16 |
17 | def double_ticker_grid(tuned_parameters, parameter_1, parameter_2):
18 | ticker_labels = []
19 | for i in tuned_parameters[parameter_1]:
20 | for j in tuned_parameters[parameter_2]:
21 | ticker_labels.append(str((i, j)))
22 | return ticker_labels
23 |
24 |
25 | # function for plotting the results of the grid search
26 | def plot_grid(metrics, params, param_names, index, name):
27 | # plot settings
28 | sns.reset_orig()
29 | mpl.rcParams['figure.dpi'] = 200
30 | path_to_plot = "models/tuning/plots/"
31 |
32 | # For the test set
33 | plt.figure(figsize=(30, 30))
34 | f, (ax1, ax2) = plt.subplots(2, 1, sharey=True)
35 | x = range(len(metrics['params']))
36 | ax1.plot(x, list(metrics['mean_test_precision']), '--',
37 | label='mean test precision', color='g')
38 | ax1.plot(x, list(metrics['mean_test_recall']), '-.',
39 | label='mean test recall', color='r')
40 | ax1.plot(x, list(metrics['mean_test_roc_auc']), '-o',
41 | label='mean test roc auc', color='b')
42 | ax1.plot(x, list(metrics['mean_test_accuracy']), '-*',
43 | label='mean test accuracy', color='purple')
44 | ax1.plot(x, list(metrics['mean_test_f1']), '-',
45 | label='mean test f1', color='orange')
46 | y_13 = np.arange(0.9, 1.05, 0.05)
47 | x_13 = np.repeat(index, len(y_13))
48 |
49 | ax1.plot(x_13, y_13, '-.', color='pink', lw=2.0)
50 | plt.ylim([0.95, 1.0])
51 | ax1.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=7)
52 | ax1.set_ylabel('Test metrics')
53 | plt.subplots_adjust(left=0.40, bottom=0.15)
54 | plt.title(" ".join(param_names) + " Grid Search")
55 |
56 | # Setting the labels for the x-axis (gridsearch combination)
57 | # x_ticks_labels = double_parameter_cross_validation(params,
58 | # 'max_depth',
59 | # 'min_samples_leaf',
60 | # 'n_estimators')
61 | # Set number of ticks for x-axis
62 | ax1.set_xticks([])
63 |
64 | # Set ticks labels for x-axis
65 | # ax1.set_xticklabels(x_ticks_labels, rotation=70, fontsize=6);
66 |
67 | # For the train set
68 | ax2.plot(x, list(metrics['mean_train_precision']), '--',
69 | label='mean train precision', color='c')
70 | ax2.plot(x, list(metrics['mean_train_recall']), '-.',
71 | label='mean train recall', color='m')
72 | ax2.plot(x, list(metrics['mean_train_roc_auc']), '-o',
73 | label='mean train roc auc', color='y')
74 | ax2.plot(x, list(metrics['mean_train_accuracy']), '-*',
75 | label='mean train accuracy', color='k')
76 | ax2.plot(x, list(metrics['mean_train_f1']), '-',
77 | label='mean train f1', color='orange')
78 | ax2.plot(x_13, y_13, '-.', color='pink', lw=2.0)
79 | ax2.legend(bbox_to_anchor=(-0.2, 0.2), loc=4, borderaxespad=0., fontsize=6)
80 | plt.ylim([0.95, 1.0])
81 | if len(param_names) == 2:
82 | x_ticks_labels = double_ticker_grid(params,
83 | param_names[0],
84 | param_names[1])
85 | if len(param_names) == 3:
86 | x_ticks_labels = triple_ticker_grid(params,
87 | param_names[0],
88 | param_names[1],
89 | param_names[2])
90 | # Set number of ticks for x-axis
91 | ax2.set_xticks(x)
92 | # Set ticks labels for x-axis
93 | ax2.set_xticklabels(x_ticks_labels, rotation=70, fontsize=7, ha='right')
94 | ax2.set_ylabel('Train metrics')
95 | plt.savefig(path_to_plot + name)
96 | plt.show()
97 |
--------------------------------------------------------------------------------
/models/tuning/tuning_lgbm.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from lightgbm import LGBMClassifier
4 | from sklearn.model_selection import GridSearchCV
5 |
6 | from models.tools import load_data
7 | from models.tuning.tools import plot_grid
8 |
9 | # deactivate deprecation warnings
10 | warnings.simplefilter("ignore", DeprecationWarning)
11 |
12 | n_jobs = 2
13 |
14 | # path
15 | path_to_data = "data/"
16 | path_to_submissions = "submissions/"
17 | path_to_stacking = "stacking/"
18 | path_to_plots = "plots/"
19 |
20 | # used features
21 |
22 | my_features_string = [
23 | "date_diff",
24 | "overlap_title",
25 | "common_author",
26 | "score_1_2",
27 | "score_2_1",
28 | "cosine_distance",
29 | "journal_similarity",
30 | "overlapping_words_abstract",
31 | "jaccard",
32 | "adar",
33 | "preferential_attachment",
34 | "resource_allocation_index",
35 | "out_neighbors",
36 | "in_neighbors",
37 | "common_neighbors",
38 | # "shortest_path",
39 | "popularity",
40 | "common_successors",
41 | "common_predecessors",
42 | # "paths_of_length_one",
43 | "authors_citation"
44 | "coauthor_score"
45 | # # "katz"
46 | # # "katz_2"
47 | ]
48 |
49 | # load data
50 |
51 | (X_train,
52 | X_test,
53 | Y_train,
54 | my_features_index,
55 | my_features_dic) = load_data(my_features_string)
56 |
57 | # GridSearchCV
58 |
59 | # param grid
60 |
61 | tuned_parameters = {
62 | # 'metric': {},
63 | 'num_leaves': [150, 200, 250],
64 | "min_data_in_leaf": [2, 4, 6],
65 | "max_depth": [150, 200, 250]
66 | }
67 |
68 | # tuning
69 | gbm = LGBMClassifier(
70 | boosting_type='gbdt',
71 | objective='binary',
72 | # 'metric': {},
73 | learning_rate=0.1,
74 | feature_fraction=0.4,
75 | bagging_fraction=0.6,
76 | bagging_freq=5,
77 | silent=True)
78 | metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
79 | grid_lgbm = GridSearchCV(gbm,
80 | param_grid=tuned_parameters,
81 | scoring=metrics,
82 | refit='f1',
83 | cv=5,
84 | n_jobs=n_jobs
85 | )
86 | grid_lgbm.fit(X_train, Y_train, verbose=-1)
87 | print("GridSearch best parameters", grid_lgbm.best_params_)
88 |
89 | # plot grid search results
90 | best_params = grid_lgbm.best_params_
91 | results = grid_lgbm.cv_results_
92 | index = grid_lgbm.best_index_
93 | plot_grid(metrics=results,
94 | params=tuned_parameters,
95 | index=index,
96 | param_names=list(tuned_parameters),
97 | name="grid_lgbm")
98 |
--------------------------------------------------------------------------------
/models/tuning/tuning_nn.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from keras.layers import Dense, Dropout
3 | from keras.models import Sequential
4 | from keras.wrappers.scikit_learn import KerasClassifier
5 | from sklearn.model_selection import GridSearchCV
6 | from sklearn.preprocessing import StandardScaler
7 |
8 | from models.tools import load_data
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_submissions = "submissions/"
13 | path_to_stacking = "stacking/"
14 | path_to_plots = "models/plots/"
15 |
16 | # load data
17 | my_features_string = [
18 | "date_diff",
19 | "overlap_title",
20 | "common_author",
21 | "score_1_2",
22 | "score_2_1",
23 | "cosine_distance",
24 | "journal_similarity",
25 | # "overlapping_words_abstract",
26 | "jaccard",
27 | "adar",
28 | "preferential_attachment",
29 | "resource_allocation_index",
30 | "out_neighbors",
31 | "in_neighbors",
32 | "common_neighbors",
33 | # "shortest_path",
34 | "popularity",
35 | # "paths_of_length_one"
36 | # "katz"
37 | # "katz_2"
38 | ]
39 |
40 | (X_train,
41 | X_test,
42 | Y_train,
43 | my_features_index,
44 | my_features_dic) = load_data(my_features_string)
45 |
46 | # check for nans
47 | data = pd.DataFrame(X_test)
48 | print(data.info())
49 | print(data.isna().sum(axis=0))
50 | print(data.min(axis=0))
51 | print(data.max(axis=0))
52 | print(my_features_index)
53 | print(my_features_dic)
54 |
55 | # normalize data
56 | scaler = StandardScaler()
57 | X_train = scaler.fit_transform(X_train)
58 | X_test = scaler.transform(X_test)
59 |
60 | # Function to create model, required for KerasClassifier
61 | nb_input = len(my_features_string)
62 |
63 |
64 | def create_model(neurons=1, dropout_rate=0.1, activation='relu'):
65 | # create model
66 | model = Sequential()
67 | model.add(Dense(neurons, input_dim=nb_input, activation=activation))
68 | model.add(Dropout(dropout_rate))
69 | model.add(Dense(1, input_dim=nb_input, activation='sigmoid'))
70 | # Compile model
71 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
72 | return model
73 |
74 | # fixed parameters
75 | epochs = 20
76 | batch_size = 128
77 |
78 | # create model
79 | model = KerasClassifier(build_fn=create_model, epochs=epochs, batch_size=batch_size, verbose=1)
80 |
81 | # define the grid search parameters
82 | neurons = [15, 30, 45, 60, 75]
83 | dropout_rate = [0.0, 0.1, 0.2, 0.3]
84 | activation = ['relu', 'tanh', 'sigmoid']
85 | param_grid = dict(neurons=neurons, dropout_rate=dropout_rate, activation=activation)
86 | grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
87 | grid_result = grid.fit(X_train, Y_train)
88 |
89 | # summarize results
90 | print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
91 | means = grid_result.cv_results_['mean_test_score']
92 | stds = grid_result.cv_results_['std_test_score']
93 | params = grid_result.cv_results_['params']
94 | for mean, stdev, param in zip(means, stds, params):
95 | print("%f (%f) with: %r" % (mean, stdev, param))
96 |
--------------------------------------------------------------------------------
/models/tuning/tuning_random_forest.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier
2 | from sklearn.model_selection import GridSearchCV
3 |
4 | from models.tools import load_data
5 |
6 | # path
7 | path_to_data = "data/"
8 | path_to_submissions = "submissions/"
9 | path_to_stacking = "stacking/"
10 | path_to_plots = "plots/"
11 |
12 | # tuned hyper-parameters
13 |
14 | parameters = {
15 | "criterion": "entropy", # default = gini
16 | "bootstrap": True,
17 | "n_jobs": -1
18 | }
19 |
20 | # used features
21 |
22 | my_features_string = [
23 | "date_diff",
24 | "overlap_title",
25 | "common_author",
26 | # "score_1_2",
27 | # "score_2_1",
28 | "cosine_distance",
29 | # "journal_similarity",
30 | # "overlapping_words_abstract",
31 | # "jaccard",
32 | # "adar",
33 | "preferential_attachment",
34 | # "resource_allocation_index",
35 | "out_neighbors",
36 | "in_neighbors",
37 | "common_neighbors",
38 | "shortest_path",
39 | "popularity",
40 | "common_successors",
41 | "common_predecessors",
42 | "paths_of_length_one"
43 | # "katz"
44 | # "katz_2"
45 | ]
46 |
47 | # load data
48 |
49 | (X_train,
50 | X_test,
51 | Y_train,
52 | my_features_index,
53 | my_features_dic) = load_data(my_features_string)
54 |
55 |
56 | # GridSearchCV
57 |
58 | # param grid
59 |
60 | tuned_parameters = {
61 | "n_estimators": [150],
62 | "max_depth": [3, 6, 9, 12, 15, 20],
63 | "min_samples_leaf": [3, 5, 10, 20]
64 | }
65 |
66 | # tuning
67 | rf = RandomForestClassifier(
68 | criterion=parameters["criterion"],
69 | bootstrap=parameters["bootstrap"],
70 | n_jobs=parameters["n_jobs"]
71 | )
72 |
73 | metrics = ["f1", "precision", "recall", "accuracy", "roc_auc"]
74 | grid_RF = GridSearchCV(rf,
75 | param_grid=tuned_parameters,
76 | scoring=metrics,
77 | refit='f1',
78 | cv=5,
79 | n_jobs=-1,
80 | verbose=10
81 | )
82 | grid_RF.fit(X_train, Y_train)
83 | print("GridSearch best parameters", grid_RF.best_params_)
84 |
--------------------------------------------------------------------------------
/models/tuning/tuning_svm.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from sklearn.model_selection import StratifiedKFold
4 | from sklearn.svm import SVC
5 | from skopt import gp_minimize
6 |
7 | from models.tools import f1_score, load_data
8 | from models.tuning.objective_function import ObjectiveFunction
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_plots = "models/tuning/plots/"
13 |
14 | # used features
15 |
16 | my_features_string = [
17 | "overlap_title",
18 | "date_diff",
19 | "common_author",
20 | "journal_similarity",
21 | "overlapping_words_abstract",
22 | "cosine_distance",
23 | "shortest_path",
24 | "jaccard",
25 | "adar",
26 | "preferential_attachment",
27 | "resource_allocation_index",
28 | "out_neighbors",
29 | "in_neighbors",
30 | "common_neighbors"
31 | ]
32 |
33 | # load data
34 |
35 | (X_train,
36 | X_test,
37 | Y_train,
38 | my_features_index,
39 | my_features_dic) = load_data(my_features_string)
40 |
41 |
42 | # function to optimize (too costly --> feature selection, subsampling)
43 | def objective_svm(x):
44 | C_in, gamma_in = x[0] ** 2, x[1] ** 2
45 | svm_classifier = SVC(C=C_in, cache_size=200,
46 | class_weight=None,
47 | coef0=0.0,
48 | decision_function_shape='ovr',
49 | degree=3, gamma=gamma_in,
50 | kernel='rbf',
51 | max_iter=-1,
52 | probability=False,
53 | random_state=None,
54 | shrinking=True,
55 | tol=0.001,
56 | verbose=False)
57 | k = 5
58 | kf = StratifiedKFold(k)
59 | i = 0
60 | score = 0
61 | for train_index, test_index in kf.split(X_train, Y_train):
62 | svm_classifier.fit(X_train[train_index], Y_train[train_index])
63 | Y_pred = svm_classifier.predict(X_train[test_index])
64 | score += f1_score(Y_train[test_index], Y_pred)
65 | i += 1
66 |     return -score / k  # gp_minimize minimizes, so return the negative mean F1 over the folds
67 |
68 |
69 | # Bayesian Optimization (too costly --> feature selection, subsampling)
70 | f_bo = ObjectiveFunction(objective_svm)
71 | t0 = time.time()
72 | res = gp_minimize(f_bo, [(10 ** (-9), 10), (10 ** (-9), 0.1)], n_jobs=4)
73 | t1 = time.time()
74 | print("The total time with BO is : " + str(t1 - t0) + " seconds")
75 | print('best score BO :', -res.fun)
76 | print('best parameters BO:', res.x)
77 |
--------------------------------------------------------------------------------
/models/tuning/tuning_svm_feat_selec.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import PCA
2 | from sklearn.feature_selection import SelectKBest, mutual_info_classif
3 | from sklearn.model_selection import GridSearchCV
4 | from sklearn.pipeline import Pipeline
5 | from sklearn.svm import SVC
6 | from sklearn.utils import resample
7 |
8 | from models.tools import load_data
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_plots = "models/tuning/plots/"
13 |
14 | # used features
15 |
16 | my_features_string = [
17 | "date_diff",
18 | # "overlap_title",
19 | "common_author",
20 | # # "score_1_2",
21 | # # "score_2_1",
22 | "cosine_distance",
23 | # "journal_similarity",
24 | # # "overlapping_words_abstract",
25 | # # "jaccard",
26 | # # "adar",
27 | "preferential_attachment",
28 | # # "resource_allocation_index",
29 | # "out_neighbors",
30 | "in_neighbors",
31 | "common_neighbors",
32 | # "shortest_path",
33 | # "popularity",
34 | # "common_successors",
35 | # "common_predecessors",
36 | # "paths_of_length_one",
37 | "authors_citation"
38 | # "coauthor_score"
39 | # # "katz"
40 | # # "katz_2"
41 | ]
42 |
43 | # load data
44 |
45 | (X_train,
46 | X_test,
47 | Y_train,
48 | my_features_index,
49 | my_features_dic) = load_data(my_features_string)
50 |
51 | # subsampling
52 | X_train_sub, Y_train_sub = resample(X_train, Y_train, n_samples=500, random_state=42)
53 | print(X_train_sub.shape, Y_train_sub.shape)
54 | # pipeline architecture
55 | pipe = Pipeline([
56 | ('reduce_dim', PCA()),
57 | ('classif', SVC(gamma=0.01))
58 | ])
59 | # parameter values
60 | nb_features = [2, 4]
61 | Cs = [0.001, 0.01, 0.1]
62 | kernels = ['linear', 'rbf']
63 |
64 | # parameter grid
65 | param_grid = [
66 | {
67 | 'reduce_dim': [PCA()],
68 | 'reduce_dim__n_components': nb_features,
69 | 'classif__C': Cs,
70 | 'classif__kernel': kernels
71 | },
72 | # {
73 | # 'reduce_dim': [SelectKBest(chi2)],
74 | # 'reduce_dim__k': nb_features,
75 | # 'classif__C': Cs,
76 | # 'classif__kernel':kernels
77 | # }
78 | # ,
79 | {
80 | 'reduce_dim': [SelectKBest(mutual_info_classif)],
81 | 'reduce_dim__k': nb_features,
82 | 'classif__C': Cs,
83 | 'classif__kernel': kernels
84 | }
85 | ]
86 |
87 | # cross validation grid search instance
88 | grid = GridSearchCV(pipe, cv=4, n_jobs=2, param_grid=param_grid, verbose=10)
89 |
90 | # fit grid
91 | grid.fit(X_train_sub, Y_train_sub)
92 |
93 | # print best params
94 | print(grid.best_params_)
95 |
--------------------------------------------------------------------------------
/notes:
--------------------------------------------------------------------------------
1 | Link to kaggle: https://www.kaggle.com/t/012200c0318541f6806bfe757092b4f0
2 | We should cite papers where possible.
3 |
4 | We should put together a plan of attack.
5 | First test very simple things to get initial results and an idea of the computing power required.
6 |
7 | The "baseline" model takes only a few features into account and trains them in an SVM:
8 | - number of overlapping words in paper titles
9 | - number of common authors
10 | - difference in publication years
11 |
12 | We should build these features and then add a few more features as we go (see the sketch just below).
13 |
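A minimal sketch of what this baseline could look like (the DataFrame layout and column names here are assumptions for illustration, not the project's actual pipeline):

import pandas as pd
from sklearn.svm import SVC

def baseline_features(pairs, nodes):
    # pairs: DataFrame with columns id1, id2; nodes: DataFrame indexed by paper id
    # with columns year, title, authors (authors as a comma-separated string)
    rows = []
    for _, p in pairs.iterrows():
        n1, n2 = nodes.loc[p["id1"]], nodes.loc[p["id2"]]
        title_overlap = len(set(str(n1["title"]).lower().split())
                            & set(str(n2["title"]).lower().split()))
        common_authors = len(set(str(n1["authors"]).split(","))
                             & set(str(n2["authors"]).split(",")))
        rows.append({"overlap_title": title_overlap,
                     "common_author": common_authors,
                     "date_diff": int(n1["year"]) - int(n2["year"])})
    return pd.DataFrame(rows)

# usage idea: X = baseline_features(training, nodes).values
#             clf = SVC(kernel="linear").fit(X, training["target"].values)
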
14 | feature brainstorming:
15 | - most important words according to TF-IDF in the title and in the abstract: check. see exactly how we do it. cosine distance? (see the TF-IDF sketch after this list)
16 | - the dates, of course: to do
17 | - some kind of citation history for the authors. For example the 10 authors most cited by the authors of the text: to do
18 | - word2vec representation of the abstracts?
19 | - graph representations of the texts, and try to extract features from them
20 |
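Rough sketch of the TF-IDF + cosine-distance idea mentioned in the list above (purely illustrative; the two example abstracts are made up):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

abstracts = ["string theory in two dimensions",
             "two dimensional string models and their symmetries"]
tfidf = TfidfVectorizer(stop_words="english").fit_transform(abstracts)
# cosine similarity between the TF-IDF vectors of the two abstracts
print(cosine_similarity(tfidf[0], tfidf[1])[0, 0])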
21 |
22 | to do:
23 | re-read the course material to find anything that could be useful to us.
24 |
25 | data exploration brainstorming:
26 | - number of distinct authors: check (see the pandas sketch below)
27 | - number of appearances of an author in the database: check
28 | - the most frequent words in the abstracts: not yet
29 | - distribution of the number of overlapping words for texts that do not cite each other and for texts that do: check
30 | - run a tf-idf on the whole database and look at the results of that: to do as well
31 | - how many distinct journals?
32 | - how many missing authors?
33 |
34 | subsample for the exploration and the first tests?
35 |
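Quick pandas sketch for the author-related "check" items above (assuming the raw node_information.csv layout used elsewhere in the repo, with a comma-separated authors column):

import pandas as pd

nodes_header = ["id", "year", "title", "authors", "journal", "abstract"]
nodes = pd.read_csv("data/node_information.csv", names=nodes_header)
authors = nodes["authors"].dropna().str.split(",").explode().str.strip()
print("distinct authors:", authors.nunique())
print(authors.value_counts().head(10))  # authors appearing most often in the database
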
36 | brainstorming on finding articles:
37 | - check which articles are cited in the course material
38 | - ask people?
39 | - search online
40 |
41 |
42 |
43 | References to go check:
44 | • Christopher D. Manning, Prabhakar Raghavan and Hinrich
45 | Schütze, Introduction to Information Retrieval, Cambridge
46 | University Press. 2008. http://www-nlp.stanford.edu/IR-book/
47 | • “Indexing by Latent Semantic Analysis”, S.Deerwester,
48 | S.Dumais, T.Landauer, G.Fumas, R.Harshman, Journal of the
49 | Society for Information Science, 1990
50 | • “Mining the Web: Discovering Knowledge from Hypertext
51 | Data”, Soumen Chakrabarti
52 |
53 |
54 |
55 | Results:
56 | Random Forest with default parameters:
57 | With 10 estimators I get little overfitting and a score of 0.85. With 30 estimators there is not really more overfitting, but the results do not improve.
58 | If I add cosine distance with 30 estimators I get massive overfitting and the results go down. With 10 estimators it still overfits quite a bit, so we would need to find a way to reduce that overfitting.
59 | If I also add the two scores, we are still looking at heavy overfitting.
60 |
61 | Light GBM:
62 | no overfitting on the basic features. Slight overfitting with cosine distance, but the results are better, around 0.87. Adding the two scores after that does not really improve the results.
63 |
64 | With shortest path we reach 94.5/93.8 (train/test).
65 | With LGBM the results are slightly better and with less overfitting.
66 |
67 | We go to 94.1/94.1 if we add cosine distance, so adding cosine distance is of very limited interest...
68 |
69 | LGBM with the basics and shortest path: 92.9/92.8
70 | LGBM with shortest path and overlapping: 94.2/93.9
71 | RF with shortest path and overlapping: 94.5/93.7
72 | LGBM with shortest path and cosine distance: 94.2/93.9
73 |
74 |
75 | A paper that does link prediction (coauthorship):
76 | http://www.cs.rpi.edu/~zaki/PaperDir/LINK06.pdf
77 | An article with good ideas about graph theory:
78 | http://be.amazd.com/link-prediction/
79 |
80 | A PhD thesis on directed graphs:
81 | https://www.cs.upc.edu/~dariog/PhD-Thesis-Link-Prediction-DGG.pdf
82 |
83 |
84 | code review on March 1st:
85 | - I am changing the code layout a bit. It is now organized as a project, so you have to open the whole thing in PyCharm and change the settings so that the working directory is always the project root.
86 | - preprocessing: done.
87 | - in my view there is an issue in author_graph_features. The same one we already ran into before: we should remove the edges that exist when target == 1. To fix or to drop... If you fix it I can run it on Compute Engine.
88 | - baseline: ok
89 | - citation_graph_features: ok
90 | - network_x bigraph: ok
91 | - network_x digraph: ok
92 | - working on network_x bigraph_long to compute katz
93 |
94 | nohup python3 -u task_manager.py > log.txt 2>&1 &
95 |
96 | https://drive.google.com/file/d/1RetpAekytXLNwQLUfJhHxamGHcOd_7j8/view?usp=sharing
97 |
98 | 17663 on the cloud
99 |
100 | 371 460 in the latest tail log.txt at 22:35.
101 |
102 |
103 |
--------------------------------------------------------------------------------
/ressources/data_challenge_description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/ressources/data_challenge_description.pdf
--------------------------------------------------------------------------------
/results/results:
--------------------------------------------------------------------------------
1 | date: 2018-02-15 00:26:26.153189
2 | features: ['overlap_title', 'date_diff', 'common_author']
3 | model: Random Forest
4 | parameters: default
5 | cross validation:
6 | 0.7780963908271935
7 | 0.7784294452612852
8 | 0.776469919253952
9 | 0.7771685269126416
10 | 0.7766730028756641
11 | kaggle score: 0.77710
12 |
13 |
14 | date: 2018-02-15 00:26:26.153189
15 | features: ['overlap_title', 'date_diff', 'common_author', "journal_similarity"]
16 | model: Random Forest
17 | parameters: default
18 | cross validation:
19 | 0.7797616629976524
20 | 0.7810288945029772
21 | 0.778419522022388
22 | 0.7788338126106805
23 | 0.779093759646472
24 | kaggle score: 0.77904
25 |
26 |
27 | date: 2018-02-15 00:30:45.924858
28 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract']
29 | model: Random Forest
30 | parameters: default
31 | cross validation:
32 | 0.8342363711688586
33 | 0.8365596289286208
34 | 0.8348442754788712
35 | 0.8360140371399327
36 | 0.8359328036912479
37 | kaggle score: 0.83755
38 |
39 |
40 | date: 2018-02-15 15:09:52.437552
41 | features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance']
42 | model: Random Forest
43 | parameters:
44 | {'min_data_in_leaf': 2, 'max_depth': 200, 'boosting_type': 'gbdt', 'objective': 'binary', 'task': 'train', 'verbose': 0, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'learning_rate': 0.1, 'bagging_freq': 5, 'num_leaves': 200}
45 | cross validation:
46 | Start training...
47 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
48 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
49 | [40] valid_0's f1 score: 0.870444
50 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
51 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
52 | [80] valid_0's f1 score: 0.871552
53 | train: 0.874026271431
54 | test: 0.871902069297
55 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
56 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
57 | [40] valid_0's f1 score: 0.872282
58 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
59 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
60 | [80] valid_0's f1 score: 0.872716
61 | train: 0.873581441765
62 | test: 0.872562595906
63 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
64 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
65 | [40] valid_0's f1 score: 0.87105
66 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
67 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
68 | [80] valid_0's f1 score: 0.872016
69 | train: 0.873792318185
70 | test: 0.872204161004
71 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
72 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
73 | [40] valid_0's f1 score: 0.871357
74 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
75 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
76 | [80] valid_0's f1 score: 0.872234
77 | train: 0.873583781681
78 | test: 0.872538922426
79 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
80 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
81 | [40] valid_0's f1 score: 0.869521
82 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
83 | [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
84 | [80] valid_0's f1 score: 0.870304
85 | train: 0.874169648139
86 | test: 0.870220013444
87 | kaggle score:
88 |
89 |
--------------------------------------------------------------------------------
/sampling/sampling.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | path_to_data = "~/Documents/polytechnique/3A/nlp/link-prediction/data/"
4 |
5 | divide_by = 100
6 | sample_size_string = str(divide_by)
7 |
8 | nodes_header = ["id", "year", "title", "authors", "journal", "abstract"]
9 | nodes = pd.read_csv(path_to_data+"node_information.csv", names=nodes_header)
10 |
11 | names = ["id1", "id2", "target"]
12 | training = pd.read_csv(path_to_data+"training_set.txt", names=names, delimiter=" ")
13 |
14 | sample_1 = training.sample(frac=1.0/divide_by, replace=False)
15 | sample_2 = sample_1.copy()
16 | sample_2.columns = ["id2", "id1", "target"]
17 |
18 | names = ["id1", "id2"]
19 | testing = pd.read_csv(path_to_data+"testing_set.txt", names=names, delimiter=" ")
20 |
21 | sample_1_testing = testing.sample(frac=1.0/divide_by, replace=False)
22 |
23 | sample_2_testing = sample_1_testing.copy()
24 | sample_2_testing.columns = ["id2", "id1"]
25 |
26 | all_ids = pd.concat([sample_1, sample_2, sample_1_testing, sample_2_testing])
27 | del all_ids["target"]
28 | del all_ids["id2"]
29 | all_ids.columns = ["id"]
30 |
31 | all_ids_2 = all_ids.groupby(by="id").first().reset_index()
32 |
33 | merged = all_ids_2.merge(right=nodes, how="inner")
34 |
35 |
36 | merged.to_csv(path_to_data+"node_information"+sample_size_string+".csv", header=False)
37 | sample_1.to_csv(path_to_data+"training_set"+sample_size_string+".txt", header=False, sep=" ")
38 | sample_1_testing.to_csv(path_to_data+"testing_set"+sample_size_string+".txt", header=False, sep=" ")
39 |
--------------------------------------------------------------------------------
/stacking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/stacking/__init__.py
--------------------------------------------------------------------------------
/stacking/stacking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.metrics import f1_score
5 | from sklearn.model_selection import StratifiedKFold
6 |
7 | # path
8 | path_to_data = "data/"
9 | path_to_submissions = "submissions/"
10 | path_to_stacking = "stacking/"
11 |
12 | # get labels
13 | names = ["id1", "id2", "target"]
14 | Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
15 | Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
16 | Y_train = Y_train['target'].values
17 | Y_test = Y_test['target'].values
18 |
19 | # group model predictions as features
20 | model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
21 | X_train = pd.DataFrame(columns=model_strings)
22 | X_test = pd.DataFrame(columns=model_strings)
23 | for model in model_strings:
24 | X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
25 | # take the mean of the test set probs of each cv fold
26 |     if model == 'svmlinear':
27 | X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
28 | else:
29 | X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
30 | print(X_train.head(), X_test.head())
31 | X_train = X_train.values
32 | X_test = X_test.values
33 |
34 | # model
35 | model = RandomForestClassifier(
36 | criterion='entropy',
37 | n_estimators=100,
38 | min_samples_leaf=6,
39 | max_depth=7,
40 | bootstrap=True,
41 | n_jobs=-1
42 | )
43 |
44 | # cross validated predictions
45 | k = 5
46 | kf = StratifiedKFold(k)
47 | predictions = np.zeros((X_test.shape[0], k))
48 | i = 0
49 |
50 | for train_index, test_index in kf.split(X_train, Y_train):
51 | model.fit(X_train[train_index], Y_train[train_index])
52 | Y_pred = model.predict(X_train[test_index])
53 | Y_pred_train = model.predict(X_train[train_index])
54 | predictions[:, i] = model.predict(X_test)
55 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
56 | print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
57 | i += 1
58 |
59 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
60 | submission = pd.DataFrame(Y_test)
61 | submission.to_csv(
62 | path_or_buf=path_to_submissions + "stack_sub_rf.csv",
63 | index=True,
64 | index_label="id",
65 | header=["category"]
66 | )
67 |
--------------------------------------------------------------------------------
/stacking/stacking_tuning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.feature_selection import SelectKBest, chi2
5 | from sklearn.linear_model import LogisticRegression
6 | from sklearn.metrics import f1_score
7 | from sklearn.model_selection import GridSearchCV, StratifiedKFold
8 | from sklearn.pipeline import Pipeline
9 |
10 | # path
11 | path_to_data = "data/"
12 | path_to_submissions = "submissions/"
13 | path_to_stacking = "stacking/"
14 |
15 | # get labels
16 | names = ["id1", "id2", "target"]
17 | Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
18 | Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
19 | Y_train = Y_train['target'].values
20 | Y_test = Y_test['target'].values
21 |
22 | # group model predictions as features
23 | model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
24 | X_train = pd.DataFrame(columns=model_strings)
25 | X_test = pd.DataFrame(columns=model_strings)
26 | for model in model_strings:
27 | X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
28 | # take the mean of the test set probs of each cv fold
29 |     if model == 'svmlinear':
30 | X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
31 | else:
32 | X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
33 | print(X_train.head(), X_test.head())
34 | X_train = X_train.values
35 | X_test = X_test.values
36 |
37 | # fit a grid searched logistic regression on top of the base models
38 | # parameters grid
39 | param_grid = {
40 | "C": [1, 0.1, 0.01, 0.001],
41 | "penalty": ["l2", "l1"]
42 | }
43 |
44 | # pipeline architecture
45 | pipe = Pipeline([
46 | ('reduce_dim', SelectKBest(chi2)),
47 | ('classif', LogisticRegression())
48 | ])
49 | # parameter values
50 | nb_features = [2, 3, 4, 5]
51 | C = [0.001, 0.01, 0.1]
52 | kernels = ['linear', 'rbf']
53 | n_estimators = [100, 200]
54 | max_depth = [10, 20]
55 | min_samples_leaf = [20]
56 | penalty = ["l2", "l1"]
57 |
58 | # parameter grid
59 | param_grid = [
60 | {
61 | 'reduce_dim__k': [5],
62 | 'classif': [RandomForestClassifier(bootstrap=True, n_jobs=-1)],
63 | 'classif__n_estimators': n_estimators,
64 | 'classif__max_depth': max_depth,
65 | 'classif__min_samples_leaf': min_samples_leaf,
66 |
67 | },
68 | {
69 | 'reduce_dim__k': nb_features,
70 | 'classif': [LogisticRegression(n_jobs=-1)],
71 | 'classif__C': C,
72 | 'classif__penalty': penalty
73 | }
74 | ]
75 |
76 | # cross validation grid search instance
77 | grid = GridSearchCV(pipe, cv=4, n_jobs=-1, param_grid=param_grid, verbose=10)
78 |
79 | # fit grid
80 | grid.fit(X_train, Y_train)
81 |
82 | # print best params
83 | print(grid.best_params_)
84 |
85 | # get params
86 | print(grid.best_params_)
87 | parameters = grid.best_params_
88 |
89 | # model instance for prediction
90 | model = grid.best_estimator_
91 |
92 | # cross validated predictions
93 | k = 5
94 | kf = StratifiedKFold(k)
95 | predictions = np.zeros((X_test.shape[0], k))
96 | i = 0
97 |
98 | for train_index, test_index in kf.split(X_train, Y_train):
99 | model.fit(X_train[train_index], Y_train[train_index])
100 | Y_pred = model.predict(X_train[test_index])
101 | Y_pred_train = model.predict(X_train[train_index])
102 | predictions[:, i] = model.predict(X_test)
103 | print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
104 | print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
105 | i += 1
106 |
107 | Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
108 | submission = pd.DataFrame(Y_test)
109 | submission.to_csv(
110 | path_or_buf=path_to_submissions + "stack_sub2.csv",
111 | index=True,
112 | index_label="id",
113 | header=["category"]
114 | )
115 |
--------------------------------------------------------------------------------
/stacking/stacking_tuning_micro.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.ensemble import RandomForestClassifier
3 | from sklearn.model_selection import GridSearchCV
4 |
5 | # path
6 | path_to_data = "data/"
7 | path_to_submissions = "submissions/"
8 | path_to_stacking = "stacking/"
9 |
10 | # get labels
11 | names = ["id1", "id2", "target"]
12 | Y_train = pd.read_csv(path_to_data + "training_set.txt", names=names, delimiter=" ")
13 | Y_test = pd.read_csv(path_to_data + "testing_set.txt", names=names, delimiter=" ")
14 | Y_train = Y_train['target'].values
15 | Y_test = Y_test['target'].values
16 |
17 | # group model predictions as features
18 | model_strings = ['lgbm', 'rf', 'svmlinear', 'nn', 'nn_deep']
19 | X_train = pd.DataFrame(columns=model_strings)
20 | X_test = pd.DataFrame(columns=model_strings)
21 | for model in model_strings:
22 | X_train[model] = pd.read_csv(path_to_stacking + model + "_train.csv")['category']
23 | # take the mean of the test set probs of each cv fold
24 |     if model == 'svmlinear':
25 | X_test[model] = 0.5 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
26 | else:
27 | X_test[model] = 0.2 * pd.read_csv(path_to_stacking + model + "_test.csv")['category'].values
28 | print(X_train.head(), X_test.head())
29 | X_train = X_train.values
30 | X_test = X_test.values
31 |
32 | # GridSearchCV to fine tune the stacking parameters
33 |
34 | # instantiate param grid
35 |
36 | tuned_parameters = {
37 | "n_estimators": [100],
38 | "max_depth": [3, 7, 10],
39 | "min_samples_leaf": [6],
40 | "criterion": ["entropy"]
41 | }
42 |
43 | # fit a GridSearchCV instance and print optimal parameters
44 | rf = RandomForestClassifier(
45 | bootstrap=True,
46 | n_jobs=-1
47 | )
48 |
49 | metrics = ["f1"]
50 | grid_RF = GridSearchCV(rf,
51 | param_grid=tuned_parameters,
52 | scoring=metrics,
53 | refit='f1',
54 | cv=4,
55 | n_jobs=-1,
56 | verbose=10
57 | )
58 | grid_RF.fit(X_train, Y_train)
59 | print("GridSearch best parameters", grid_RF.best_params_)
60 |
--------------------------------------------------------------------------------
/task_manager.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raph-m/link-prediction/ddfa1bf50040ec411cc1bdafd0513b757e4c1378/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_author_graph_features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "ename": "ImportError",
10 | "evalue": "No module named 'code.feature_engineering'; 'code' is not a package",
11 | "traceback": [
12 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
13 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
14 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0migraph\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_engineering\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtools\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mlit_eval_nan_proof\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# progress bar for pandas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
15 | "\u001b[0;31mImportError\u001b[0m: No module named 'code.feature_engineering'; 'code' is not a package"
16 | ],
17 | "output_type": "error"
18 | }
19 | ],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import numpy as np\n",
23 | "from tqdm import tqdm\n",
24 | "from itertools import permutations\n",
25 | "import igraph\n",
26 | "\n",
27 | "from code.feature_engineering.tools import lit_eval_nan_proof\n",
28 | "\n",
29 | "# progress bar for pandas\n",
30 | "tqdm.pandas(tqdm())\n",
31 | "\n",
32 | "# path\n",
33 | "path_to_data = \"../../data/\"\n",
34 | "\n",
35 | "# loading data\n",
36 | "converter_dict = {'authors': lit_eval_nan_proof, 'journal': lit_eval_nan_proof,\n",
37 | " 'title': lit_eval_nan_proof, 'abstract': lit_eval_nan_proof}\n",
38 | "nodes = pd.read_csv(path_to_data + \"nodes_preprocessed.csv\", converters=converter_dict)\n",
39 | "nodes.set_index(\"id\", inplace=True)\n",
40 | "training = pd.read_csv(path_to_data + \"training_features.txt\")\n",
41 | "training.set_index(\"my_index\", inplace=True)\n",
42 | "testing = pd.read_csv(path_to_data + \"testing_features.txt\")\n",
43 | "testing.set_index(\"my_index\", inplace=True)\n",
44 | "\n",
45 | "# create author graph\n",
46 | "# vertices are authors\n",
47 | "# edge of weight 1 if they co-wrote a paper, 2 if they only cite each other\n",
48 | "\n",
49 | "# create empty directed graph\n",
50 | "g = igraph.Graph(directed=True)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# add vertices\n",
60 | "authors = nodes['authors']\n",
61 | "authors_set = list(set(authors.dropna().sum()))\n",
62 | "g.add_vertices(authors_set)"
63 | ]
64 | }
65 | ],
66 | "metadata": {
67 | "kernelspec": {
68 | "display_name": "Python 3",
69 | "language": "python",
70 | "name": "python3"
71 | },
72 | "language_info": {
73 | "codemirror_mode": {
74 | "name": "ipython",
75 | "version": 3
76 | },
77 | "file_extension": ".py",
78 | "mimetype": "text/x-python",
79 | "name": "python",
80 | "nbconvert_exporter": "python",
81 | "pygments_lexer": "ipython3",
82 | "version": "3.5.2"
83 | }
84 | },
85 | "nbformat": 4,
86 | "nbformat_minor": 2
87 | }
88 |
--------------------------------------------------------------------------------
/tests/test_baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 9,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "\n",
12 | "\n",
25 | "
\n",
26 | " \n",
27 | " \n",
28 | " | \n",
29 | " my_index | \n",
30 | " id1 | \n",
31 | " id2 | \n",
32 | " target | \n",
33 | " overlap_title | \n",
34 | " date_diff | \n",
35 | " common_author | \n",
36 | " score_1_2 | \n",
37 | " score_2_1 | \n",
38 | " cosine_distance | \n",
39 | " jaccard | \n",
40 | " adar | \n",
41 | " preferential_attachment | \n",
42 | " resource_allocation_index | \n",
43 | " common_neighbors | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " 9510123|9502114 | \n",
50 | " 9510123 | \n",
51 | " 9502114 | \n",
52 | " 1 | \n",
53 | " 2 | \n",
54 | " 0 | \n",
55 | " 0 | \n",
56 | " 17.844392 | \n",
57 | " 14.535935 | \n",
58 | " 0.075791 | \n",
59 | " 0.066667 | \n",
60 | " 0.513898 | \n",
61 | " 55.0 | \n",
62 | " 0.142857 | \n",
63 | " 0.142857 | \n",
64 | "
\n",
65 | " \n",
66 | " 1 | \n",
67 | " 9707075|9604178 | \n",
68 | " 9707075 | \n",
69 | " 9604178 | \n",
70 | " 1 | \n",
71 | " 1 | \n",
72 | " 1 | \n",
73 | " 0 | \n",
74 | " 19.415184 | \n",
75 | " 24.296850 | \n",
76 | " 0.082450 | \n",
77 | " 0.098039 | \n",
78 | " 4.320366 | \n",
79 | " 11388.0 | \n",
80 | " 0.226401 | \n",
81 | " 0.226401 | \n",
82 | "
\n",
83 | " \n",
84 | " 2 | \n",
85 | " 9312155|9506142 | \n",
86 | " 9312155 | \n",
87 | " 9506142 | \n",
88 | " 0 | \n",
89 | " 0 | \n",
90 | " -2 | \n",
91 | " 0 | \n",
92 | " 15.116037 | \n",
93 | " 10.080194 | \n",
94 | " 0.018402 | \n",
95 | " 0.000000 | \n",
96 | " 0.000000 | \n",
97 | " 5.0 | \n",
98 | " 0.000000 | \n",
99 | " 0.000000 | \n",
100 | "
\n",
101 | " \n",
102 | " 3 | \n",
103 | " 9911255|302165 | \n",
104 | " 9911255 | \n",
105 | " 302165 | \n",
106 | " 0 | \n",
107 | " 0 | \n",
108 | " -4 | \n",
109 | " 0 | \n",
110 | " 16.765770 | \n",
111 | " 20.295904 | \n",
112 | " 0.058245 | \n",
113 | " 0.000000 | \n",
114 | " 0.000000 | \n",
115 | " 280.0 | \n",
116 | " 0.000000 | \n",
117 | " 0.000000 | \n",
118 | "
\n",
119 | " \n",
120 | " 4 | \n",
121 | " 9701033|209076 | \n",
122 | " 9701033 | \n",
123 | " 209076 | \n",
124 | " 0 | \n",
125 | " 0 | \n",
126 | " -5 | \n",
127 | " 0 | \n",
128 | " 21.457809 | \n",
129 | " 25.240819 | \n",
130 | " 0.069025 | \n",
131 | " 0.000000 | \n",
132 | " 0.000000 | \n",
133 | " 168.0 | \n",
134 | " 0.000000 | \n",
135 | " 0.000000 | \n",
136 | "
\n",
137 | " \n",
138 | "
\n",
139 | "
"
140 | ],
141 | "text/plain": [
142 | " my_index id1 id2 target overlap_title date_diff \\\n",
143 | "0 9510123|9502114 9510123 9502114 1 2 0 \n",
144 | "1 9707075|9604178 9707075 9604178 1 1 1 \n",
145 | "2 9312155|9506142 9312155 9506142 0 0 -2 \n",
146 | "3 9911255|302165 9911255 302165 0 0 -4 \n",
147 | "4 9701033|209076 9701033 209076 0 0 -5 \n",
148 | "\n",
149 | " common_author score_1_2 score_2_1 cosine_distance jaccard adar \\\n",
150 | "0 0 17.844392 14.535935 0.075791 0.066667 0.513898 \n",
151 | "1 0 19.415184 24.296850 0.082450 0.098039 4.320366 \n",
152 | "2 0 15.116037 10.080194 0.018402 0.000000 0.000000 \n",
153 | "3 0 16.765770 20.295904 0.058245 0.000000 0.000000 \n",
154 | "4 0 21.457809 25.240819 0.069025 0.000000 0.000000 \n",
155 | "\n",
156 | " preferential_attachment resource_allocation_index common_neighbors \n",
157 | "0 55.0 0.142857 0.142857 \n",
158 | "1 11388.0 0.226401 0.226401 \n",
159 | "2 5.0 0.000000 0.000000 \n",
160 | "3 280.0 0.000000 0.000000 \n",
161 | "4 168.0 0.000000 0.000000 "
162 | ]
163 | },
164 | "execution_count": 9,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "import numpy as np\n",
171 | "import pandas as pd\n",
172 | "from tqdm import tqdm\n",
173 | "\n",
174 | "path_to_data = \"../data/\"\n",
175 | "training = pd.read_csv(path_to_data+\"training_features.txt\")\n",
176 | "training.head()"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 7,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/html": [
187 | "\n",
188 | "\n",
201 | "
\n",
202 | " \n",
203 | " \n",
204 | " | \n",
205 | " id1 | \n",
206 | " id2 | \n",
207 | " target | \n",
208 | " overlap_title | \n",
209 | " date_diff | \n",
210 | " common_author | \n",
211 | "
\n",
212 | " \n",
213 | " \n",
214 | " \n",
215 | " count | \n",
216 | " 6.155120e+05 | \n",
217 | " 6.155120e+05 | \n",
218 | " 615512.000000 | \n",
219 | " 615512.000000 | \n",
220 | " 615512.000000 | \n",
221 | " 615512.000000 | \n",
222 | "
\n",
223 | " \n",
224 | " mean | \n",
225 | " 5.317422e+06 | \n",
226 | " 6.798460e+06 | \n",
227 | " 0.544474 | \n",
228 | " 0.518416 | \n",
229 | " 1.156681 | \n",
230 | " 0.079396 | \n",
231 | "
\n",
232 | " \n",
233 | " std | \n",
234 | " 4.749198e+06 | \n",
235 | " 4.343138e+06 | \n",
236 | " 0.498019 | \n",
237 | " 0.907113 | \n",
238 | " 3.521691 | \n",
239 | " 0.372206 | \n",
240 | "
\n",
241 | " \n",
242 | " min | \n",
243 | " 1.001000e+03 | \n",
244 | " 1.001000e+03 | \n",
245 | " 0.000000 | \n",
246 | " 0.000000 | \n",
247 | " -11.000000 | \n",
248 | " 0.000000 | \n",
249 | "
\n",
250 | " \n",
251 | " 25% | \n",
252 | " 1.112660e+05 | \n",
253 | " 2.080790e+05 | \n",
254 | " 0.000000 | \n",
255 | " 0.000000 | \n",
256 | " 0.000000 | \n",
257 | " 0.000000 | \n",
258 | "
\n",
259 | " \n",
260 | " 50% | \n",
261 | " 9.310036e+06 | \n",
262 | " 9.505058e+06 | \n",
263 | " 1.000000 | \n",
264 | " 0.000000 | \n",
265 | " 1.000000 | \n",
266 | " 0.000000 | \n",
267 | "
\n",
268 | " \n",
269 | " 75% | \n",
270 | " 9.708050e+06 | \n",
271 | " 9.709097e+06 | \n",
272 | " 1.000000 | \n",
273 | " 1.000000 | \n",
274 | " 3.000000 | \n",
275 | " 0.000000 | \n",
276 | "
\n",
277 | " \n",
278 | " max | \n",
279 | " 9.912293e+06 | \n",
280 | " 9.912293e+06 | \n",
281 | " 1.000000 | \n",
282 | " 10.000000 | \n",
283 | " 11.000000 | \n",
284 | " 8.000000 | \n",
285 | "
\n",
286 | " \n",
287 | "
\n",
288 | "
"
289 | ],
290 | "text/plain": [
291 | " id1 id2 target overlap_title \\\n",
292 | "count 6.155120e+05 6.155120e+05 615512.000000 615512.000000 \n",
293 | "mean 5.317422e+06 6.798460e+06 0.544474 0.518416 \n",
294 | "std 4.749198e+06 4.343138e+06 0.498019 0.907113 \n",
295 | "min 1.001000e+03 1.001000e+03 0.000000 0.000000 \n",
296 | "25% 1.112660e+05 2.080790e+05 0.000000 0.000000 \n",
297 | "50% 9.310036e+06 9.505058e+06 1.000000 0.000000 \n",
298 | "75% 9.708050e+06 9.709097e+06 1.000000 1.000000 \n",
299 | "max 9.912293e+06 9.912293e+06 1.000000 10.000000 \n",
300 | "\n",
301 | " date_diff common_author \n",
302 | "count 615512.000000 615512.000000 \n",
303 | "mean 1.156681 0.079396 \n",
304 | "std 3.521691 0.372206 \n",
305 | "min -11.000000 0.000000 \n",
306 | "25% 0.000000 0.000000 \n",
307 | "50% 1.000000 0.000000 \n",
308 | "75% 3.000000 0.000000 \n",
309 | "max 11.000000 8.000000 "
310 | ]
311 | },
312 | "execution_count": 7,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "training.describe()"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": []
327 | }
328 | ],
329 | "metadata": {
330 | "kernelspec": {
331 | "display_name": "Python 3",
332 | "language": "python",
333 | "name": "python3"
334 | },
335 | "language_info": {
336 | "codemirror_mode": {
337 | "name": "ipython",
338 | "version": 3
339 | },
340 | "file_extension": ".py",
341 | "mimetype": "text/x-python",
342 | "name": "python",
343 | "nbconvert_exporter": "python",
344 | "pygments_lexer": "ipython3",
345 | "version": "3.5.2"
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 2
350 | }
351 |
--------------------------------------------------------------------------------
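The training_features.txt preview above mixes text-derived columns (overlap_title, cosine_distance, score_1_2, score_2_1) with neighbourhood features of the citation graph (jaccard, adar, preferential_attachment, resource_allocation_index, common_neighbors). Purely as a reference for what those graph columns measure — a hypothetical networkx sketch, not necessarily the feature-engineering code that produced the file:

import networkx as nx

# Toy undirected citation graph over a few paper ids taken from the preview above.
G = nx.Graph()
G.add_edges_from([
    (9510123, 9502114),
    (9510123, 9604178),
    (9502114, 9604178),
    (9604178, 9707075),
])

pair = [(9510123, 9502114)]
jaccard = next(nx.jaccard_coefficient(G, pair))[2]          # |common neighbours| / |union of neighbours|
adar = next(nx.adamic_adar_index(G, pair))[2]               # sum of 1 / log(degree) over common neighbours
pref_attach = next(nx.preferential_attachment(G, pair))[2]  # degree(u) * degree(v)
res_alloc = next(nx.resource_allocation_index(G, pair))[2]  # sum of 1 / degree over common neighbours
n_common = len(list(nx.common_neighbors(G, 9510123, 9502114)))

print(jaccard, adar, pref_attach, res_alloc, n_common)
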
/tests/test_preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "\n",
12 | "\n",
25 | "
\n",
26 | " \n",
27 | " \n",
28 | " | \n",
29 | " id | \n",
30 | " year | \n",
31 | " title | \n",
32 | " authors | \n",
33 | " journal | \n",
34 | " abstract | \n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | " 0 | \n",
40 | " 1001 | \n",
41 | " 2000 | \n",
42 | " compactification geometry and duality | \n",
43 | " Paul S. Aspinwall | \n",
44 | " NaN | \n",
45 | " these are notes based on lectures given at tas... | \n",
46 | "
\n",
47 | " \n",
48 | " 1 | \n",
49 | " 1002 | \n",
50 | " 2000 | \n",
51 | " domain walls and massive gauged supergravity p... | \n",
52 | " M. Cvetic, H. Lu, C.N. Pope | \n",
53 | " Class.Quant.Grav. | \n",
54 | " we point out that massive gauged supergravity ... | \n",
55 | "
\n",
56 | " \n",
57 | " 2 | \n",
58 | " 1003 | \n",
59 | " 2000 | \n",
60 | " comment on metric fluctuations in brane worlds | \n",
61 | " Y.S. Myung, Gungwon Kang | \n",
62 | " NaN | \n",
63 | " recently ivanov and volovich hep-th 9912242 cl... | \n",
64 | "
\n",
65 | " \n",
66 | " 3 | \n",
67 | " 1004 | \n",
68 | " 2000 | \n",
69 | " moving mirrors and thermodynamic paradoxes | \n",
70 | " Adam D. Helfer | \n",
71 | " Phys.Rev. | \n",
72 | " quantum fields responding to moving mirrors ha... | \n",
73 | "
\n",
74 | " \n",
75 | " 4 | \n",
76 | " 1005 | \n",
77 | " 2000 | \n",
78 | " bundles of chiral blocks and boundary conditio... | \n",
79 | " J. Fuchs, C. Schweigert | \n",
80 | " NaN | \n",
81 | " proceedings of lie iii clausthal july 1999 var... | \n",
82 | "
\n",
83 | " \n",
84 | "
\n",
85 | "
"
86 | ],
87 | "text/plain": [
88 | " id year title \\\n",
89 | "0 1001 2000 compactification geometry and duality \n",
90 | "1 1002 2000 domain walls and massive gauged supergravity p... \n",
91 | "2 1003 2000 comment on metric fluctuations in brane worlds \n",
92 | "3 1004 2000 moving mirrors and thermodynamic paradoxes \n",
93 | "4 1005 2000 bundles of chiral blocks and boundary conditio... \n",
94 | "\n",
95 | " authors journal \\\n",
96 | "0 Paul S. Aspinwall NaN \n",
97 | "1 M. Cvetic, H. Lu, C.N. Pope Class.Quant.Grav. \n",
98 | "2 Y.S. Myung, Gungwon Kang NaN \n",
99 | "3 Adam D. Helfer Phys.Rev. \n",
100 | "4 J. Fuchs, C. Schweigert NaN \n",
101 | "\n",
102 | " abstract \n",
103 | "0 these are notes based on lectures given at tas... \n",
104 | "1 we point out that massive gauged supergravity ... \n",
105 | "2 recently ivanov and volovich hep-th 9912242 cl... \n",
106 | "3 quantum fields responding to moving mirrors ha... \n",
107 | "4 proceedings of lie iii clausthal july 1999 var... "
108 | ]
109 | },
110 | "execution_count": 1,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "import numpy as np\n",
117 | "import pandas as pd\n",
118 | "from tqdm import tqdm\n",
119 | "\n",
120 | "path_to_data = \"../../data/\"\n",
121 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n",
122 | "nodes = pd.read_csv(path_to_data+\"node_information.csv\", names=nodes_header)\n",
123 | "nodes.head()"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 2,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " id | \n",
153 | " year | \n",
154 | "
\n",
155 | " \n",
156 | " \n",
157 | " \n",
158 | " count | \n",
159 | " 2.777000e+04 | \n",
160 | " 27770.000000 | \n",
161 | "
\n",
162 | " \n",
163 | " mean | \n",
164 | " 6.096134e+06 | \n",
165 | " 1998.009039 | \n",
166 | "
\n",
167 | " \n",
168 | " std | \n",
169 | " 4.581677e+06 | \n",
170 | " 3.124684 | \n",
171 | "
\n",
172 | " \n",
173 | " min | \n",
174 | " 1.001000e+03 | \n",
175 | " 1992.000000 | \n",
176 | "
\n",
177 | " \n",
178 | " 25% | \n",
179 | " 2.041122e+05 | \n",
180 | " 1995.000000 | \n",
181 | "
\n",
182 | " \n",
183 | " 50% | \n",
184 | " 9.405182e+06 | \n",
185 | " 1998.000000 | \n",
186 | "
\n",
187 | " \n",
188 | " 75% | \n",
189 | " 9.705204e+06 | \n",
190 | " 2001.000000 | \n",
191 | "
\n",
192 | " \n",
193 | " max | \n",
194 | " 9.912293e+06 | \n",
195 | " 2003.000000 | \n",
196 | "
\n",
197 | " \n",
198 | "
\n",
199 | "
"
200 | ],
201 | "text/plain": [
202 | " id year\n",
203 | "count 2.777000e+04 27770.000000\n",
204 | "mean 6.096134e+06 1998.009039\n",
205 | "std 4.581677e+06 3.124684\n",
206 | "min 1.001000e+03 1992.000000\n",
207 | "25% 2.041122e+05 1995.000000\n",
208 | "50% 9.405182e+06 1998.000000\n",
209 | "75% 9.705204e+06 2001.000000\n",
210 | "max 9.912293e+06 2003.000000"
211 | ]
212 | },
213 | "execution_count": 2,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "nodes.describe()"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 5,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/html": [
230 | "\n",
231 | "\n",
244 | "
\n",
245 | " \n",
246 | " \n",
247 | " | \n",
248 | " id | \n",
249 | " year | \n",
250 | " title | \n",
251 | " authors | \n",
252 | " journal | \n",
253 | " abstract | \n",
254 | "
\n",
255 | " \n",
256 | " \n",
257 | " \n",
258 | " 0 | \n",
259 | " id | \n",
260 | " year | \n",
261 | " title | \n",
262 | " authors | \n",
263 | " journal | \n",
264 | " abstract | \n",
265 | "
\n",
266 | " \n",
267 | " 1 | \n",
268 | " 1001 | \n",
269 | " 2000 | \n",
270 | " ['compactif', 'geometri', 'dualiti'] | \n",
271 | " ['paul s. aspinwall'] | \n",
272 | " NaN | \n",
273 | " ['note', 'base', 'lectur', 'given', 'tasi99', ... | \n",
274 | "
\n",
275 | " \n",
276 | " 2 | \n",
277 | " 1002 | \n",
278 | " 2000 | \n",
279 | " ['domain', 'wall', 'massiv', 'gaug', 'supergra... | \n",
280 | " ['m. cvetic', 'h. lu', 'c.n. pope'] | \n",
281 | " ['class', 'quant', 'grav'] | \n",
282 | " ['point', 'massiv', 'gaug', 'supergrav', 'pote... | \n",
283 | "
\n",
284 | " \n",
285 | " 3 | \n",
286 | " 1003 | \n",
287 | " 2000 | \n",
288 | " ['comment', 'metric', 'fluctuat', 'brane', 'wo... | \n",
289 | " ['y.s. myung', 'gungwon kang'] | \n",
290 | " NaN | \n",
291 | " ['recent', 'ivanov', 'volovich', 'hep-th', '99... | \n",
292 | "
\n",
293 | " \n",
294 | " 4 | \n",
295 | " 1004 | \n",
296 | " 2000 | \n",
297 | " ['move', 'mirror', 'thermodynam', 'paradox'] | \n",
298 | " ['adam d. helfer'] | \n",
299 | " ['phys', 'rev'] | \n",
300 | " ['quantum', 'field', 'respond', 'move', 'mirro... | \n",
301 | "
\n",
302 | " \n",
303 | "
\n",
304 | "
"
305 | ],
306 | "text/plain": [
307 | " id year title \\\n",
308 | "0 id year title \n",
309 | "1 1001 2000 ['compactif', 'geometri', 'dualiti'] \n",
310 | "2 1002 2000 ['domain', 'wall', 'massiv', 'gaug', 'supergra... \n",
311 | "3 1003 2000 ['comment', 'metric', 'fluctuat', 'brane', 'wo... \n",
312 | "4 1004 2000 ['move', 'mirror', 'thermodynam', 'paradox'] \n",
313 | "\n",
314 | " authors journal \\\n",
315 | "0 authors journal \n",
316 | "1 ['paul s. aspinwall'] NaN \n",
317 | "2 ['m. cvetic', 'h. lu', 'c.n. pope'] ['class', 'quant', 'grav'] \n",
318 | "3 ['y.s. myung', 'gungwon kang'] NaN \n",
319 | "4 ['adam d. helfer'] ['phys', 'rev'] \n",
320 | "\n",
321 | " abstract \n",
322 | "0 abstract \n",
323 | "1 ['note', 'base', 'lectur', 'given', 'tasi99', ... \n",
324 | "2 ['point', 'massiv', 'gaug', 'supergrav', 'pote... \n",
325 | "3 ['recent', 'ivanov', 'volovich', 'hep-th', '99... \n",
326 | "4 ['quantum', 'field', 'respond', 'move', 'mirro... "
327 | ]
328 | },
329 | "execution_count": 5,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "nodes_header = [\"id\", \"year\", \"title\", \"authors\", \"journal\", \"abstract\"]\n",
336 | "nodes_preprocessed = pd.read_csv(path_to_data+\"nodes_preprocessed.csv\", names=nodes_header)\n",
337 | "nodes_preprocessed.head()"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 6,
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "data": {
347 | "text/html": [
348 | "\n",
349 | "\n",
362 | "
\n",
363 | " \n",
364 | " \n",
365 | " | \n",
366 | " id | \n",
367 | " year | \n",
368 | " title | \n",
369 | " authors | \n",
370 | " journal | \n",
371 | " abstract | \n",
372 | "
\n",
373 | " \n",
374 | " \n",
375 | " \n",
376 | " count | \n",
377 | " 27771 | \n",
378 | " 27771 | \n",
379 | " 27771 | \n",
380 | " 23738 | \n",
381 | " 20299 | \n",
382 | " 27771 | \n",
383 | "
\n",
384 | " \n",
385 | " unique | \n",
386 | " 27771 | \n",
387 | " 13 | \n",
388 | " 27496 | \n",
389 | " 15834 | \n",
390 | " 280 | \n",
391 | " 27765 | \n",
392 | "
\n",
393 | " \n",
394 | " top | \n",
395 | " 9702094 | \n",
396 | " 2002 | \n",
397 | " ['black', 'hole', 'entropi'] | \n",
398 | " [\"shin'ichi nojiri\", 'sergei d. odintsov'] | \n",
399 | " ['phys', 'lett'] | \n",
400 | " ['comment', 'start', 'paper', 'hep-th', '01060... | \n",
401 | "
\n",
402 | " \n",
403 | " freq | \n",
404 | " 1 | \n",
405 | " 3335 | \n",
406 | " 7 | \n",
407 | " 38 | \n",
408 | " 3575 | \n",
409 | " 3 | \n",
410 | "
\n",
411 | " \n",
412 | "
\n",
413 | "
"
414 | ],
415 | "text/plain": [
416 | " id year title \\\n",
417 | "count 27771 27771 27771 \n",
418 | "unique 27771 13 27496 \n",
419 | "top 9702094 2002 ['black', 'hole', 'entropi'] \n",
420 | "freq 1 3335 7 \n",
421 | "\n",
422 | " authors journal \\\n",
423 | "count 23738 20299 \n",
424 | "unique 15834 280 \n",
425 | "top [\"shin'ichi nojiri\", 'sergei d. odintsov'] ['phys', 'lett'] \n",
426 | "freq 38 3575 \n",
427 | "\n",
428 | " abstract \n",
429 | "count 27771 \n",
430 | "unique 27765 \n",
431 | "top ['comment', 'start', 'paper', 'hep-th', '01060... \n",
432 | "freq 3 "
433 | ]
434 | },
435 | "execution_count": 6,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "nodes_preprocessed.describe()"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": []
450 | }
451 | ],
452 | "metadata": {
453 | "kernelspec": {
454 | "display_name": "Python 3",
455 | "language": "python",
456 | "name": "python3"
457 | },
458 | "language_info": {
459 | "codemirror_mode": {
460 | "name": "ipython",
461 | "version": 3
462 | },
463 | "file_extension": ".py",
464 | "mimetype": "text/x-python",
465 | "name": "python",
466 | "nbconvert_exporter": "python",
467 | "pygments_lexer": "ipython3",
468 | "version": "3.5.2"
469 | }
470 | },
471 | "nbformat": 4,
472 | "nbformat_minor": 2
473 | }
474 |
--------------------------------------------------------------------------------
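One detail worth noting in the test_preprocessing.ipynb outputs above: nodes_preprocessed.csv appears to carry its own header row, so reading it with names=nodes_header turns that header into a data row — row 0 of the third cell's preview contains the literal strings 'id', 'year', ..., and the describe() count is 27771 versus the 27770 papers in node_information.csv. A hypothetical corrected read, keeping the notebook's path convention:

import pandas as pd

path_to_data = "../../data/"
# Let pandas use the file's own header line instead of supplying column names.
nodes_preprocessed = pd.read_csv(path_to_data + "nodes_preprocessed.csv", header=0)
print(len(nodes_preprocessed))  # expected 27770: one row per paper, no header row read as data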