├── mm_feedback_site
│   ├── requirements.txt
│   ├── flask_templates
│   │   ├── footer.html
│   │   ├── index.html
│   │   ├── head.html
│   │   └── feedback.html
│   ├── data
│   │   ├── README.md
│   │   └── ccn_feedback.json
│   ├── README.md
│   ├── process_feedback_data.py
│   ├── create_mm_form_data.py
│   ├── main.py
│   └── static
│       └── css
│           ├── style.css
│           └── normalize.css
├── figures
│   ├── problem_setup.png
│   └── paper_reviewer_matching.png
├── requirements.txt
├── data
│   ├── people.csv
│   ├── reviewer.csv
│   ├── article.csv
│   └── output_match.csv
├── paper_reviewer_matcher
│   ├── __init__.py
│   ├── preprocess.py
│   ├── lp.py
│   ├── mindmatch.py
│   ├── affinity.py
│   └── vectorizer.py
├── group_matching.py
├── .gitignore
├── mindmatch.py
├── README.md
├── mindmatch_cluster.py
├── ccn
│   ├── ccn_paper_reviewer_matching_2019.py
│   ├── ccn_mind_matching_2018.py
│   └── ccn_mind_matching_2019.py
├── cosyne
│   └── cosyne_paper_reviewer_matching_2020.py
├── nma
│   └── pod_grouping_2020.py
└── LICENSE

--------------------------------------------------------------------------------
/mm_feedback_site/requirements.txt:
--------------------------------------------------------------------------------
flask

--------------------------------------------------------------------------------
/figures/problem_setup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/titipata/paper-reviewer-matcher/HEAD/figures/problem_setup.png

--------------------------------------------------------------------------------
/figures/paper_reviewer_matching.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/titipata/paper-reviewer-matcher/HEAD/figures/paper_reviewer_matching.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
docopt
numpy
scipy
nltk
scikit-learn
protobuf==3.18.3
ortools
fuzzywuzzy
dedupe-hcluster

--------------------------------------------------------------------------------
/data/people.csv:
--------------------------------------------------------------------------------
PersonID,FullName
1,Konrad Kording
2,Daniel Acuna
3,Hugo Fernades
4,Pavan Ramkumar
5,Joshua Glaser
6,Patrick Lawlor
7,Steve Antos

--------------------------------------------------------------------------------
/mm_feedback_site/flask_templates/footer.html:
--------------------------------------------------------------------------------
<!-- footer markup was lost in extraction; nothing else is recoverable -->

--------------------------------------------------------------------------------
/mm_feedback_site/data/README.md:
--------------------------------------------------------------------------------
# Placeholder for data and feedback data

- `ccn_mind_match_feedback_form.json` - JSON file of mind-match information from the minimized CSV file sent to CCN. This file is generated by `create_mm_form_data.py`
- `ccn_feedback.json` - line-delimited JSON; each line contains a `registrant_id` and lists of `relevances`, `satisfactory`, and `coi`
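Since the feedback file is line-delimited, it can be loaded directly with pandas, as `process_feedback_data.py` does. A minimal sketch (illustrative; assumes you run it from this folder):

```python
import pandas as pd

# one JSON object per line, hence lines=True
feedback_df = pd.read_json('ccn_feedback.json', orient='records', lines=True)
print(feedback_df[['registrant_id', 'useful', 'enjoyable']].head())
```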
--------------------------------------------------------------------------------
/mm_feedback_site/README.md:
--------------------------------------------------------------------------------
# Feedback Form site for CCN 2019

Create a feedback form for the Mind-Matching event at CCN 2019.

First, put the data in the `data` folder, named e.g. `ccn_mind_match_feedback_form.json`.
The JSON is produced by `create_mm_form_data.py` from the minimized mind-matching CSV file and the full CSV given by the conference.
Then, edit `main.py` to set `MINDMATCH_DATA_PATH` (path to the data, e.g. `ccn_mind_match_feedback_form.json`) and `FEEDBACK_DATA_PATH` (path where responses are saved).

```bash
export FLASK_APP=main.py
flask run --host=0.0.0.0 --port=5555
```
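To smoke-test a running instance, you can post a minimal response to the submit endpoint. A sketch using only the standard library (assumes the app is running locally on port 5555; the field names follow `main.py`, and the values here are made up):

```python
import urllib.parse
import urllib.request

# minimal fake response; unspecified relevance/satisfactory/coi fields default to '0'
fields = {'registrant_id': '0', 'text_input': 'test feedback',
          'relevance_0': '4', 'satisfactory_0': '4', 'coi_0': '0',
          'useful': '8', 'enjoyable': '8'}
data = urllib.parse.urlencode(fields).encode()
urllib.request.urlopen('http://localhost:5555/handle_submit/', data=data)
```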
--------------------------------------------------------------------------------
/paper_reviewer_matcher/__init__.py:
--------------------------------------------------------------------------------
from .preprocess import preprocess
from .affinity import (
    compute_topics, compute_affinity,
    calculate_affinity_distance,
    create_lp_matrix, create_assignment
)
from .vectorizer import LogEntropyVectorizer, BM25Vectorizer
try:
    from .lp import linprog
    print("Using Google ortools library for ILP solver.")
except ImportError:
    from scipy.optimize import linprog
    print("Using scipy for ILP solver. It may take very long to solve. Please consider installing ortools (see README).")
from .mindmatch import perform_mindmatch, compute_conflicts

--------------------------------------------------------------------------------
/mm_feedback_site/data/ccn_feedback.json:
--------------------------------------------------------------------------------
{"registrant_id": "1013", "relevances": ["4", "3", "3", "0", "0", "0"], "satisfactory": ["4", "3", "4", "0", "0", "0"], "coi": ["0", "1", "0", "0", "0", "0"], "feedback_text": "Great one!", "arrange_before": "0", "useful": "9", "enjoyable": "8", "timestamp": "2019-09-17 17:51:10.572578"}
{"registrant_id": "1013", "relevances": ["4", "3", "3", "0", "0", "0"], "satisfactory": ["4", "3", "4", "0", "0", "0"], "coi": ["0", "1", "0", "0", "0", "0"], "feedback_text": "Great one!", "arrange_before": "0", "useful": "10", "enjoyable": "8", "timestamp": "2019-09-17 17:51:20.803968"}
{"registrant_id": "1013", "relevances": ["4", "3", "3", "0", "0", "0"], "satisfactory": ["4", "3", "4", "0", "0", "0"], "coi": ["0", "1", "0", "0", "0", "0"], "feedback_text": "Great one!", "arrange_before": "0", "useful": "7", "enjoyable": "7", "timestamp": "2019-09-17 17:52:10.838286"}

--------------------------------------------------------------------------------
/mm_feedback_site/flask_templates/index.html:
--------------------------------------------------------------------------------
<!-- HTML markup was lost in extraction; the recoverable template text is kept below -->
Mind Matching Feedback Form
{% include 'head.html' %}

Thank you!
You've completed the CCN Mind-Matching 2019 feedback form!

The individual data will not be shared elsewhere.
We only collect the data to improve our algorithm in the coming year and to share the analysis of the data with you and the organizers.

We really appreciate your participation. This will improve the conference and the neuroscience community in the future!

- Konrad Kording, Titipat Achakulvisut, and CCN organizers

{% include 'footer.html' %}

--------------------------------------------------------------------------------
/paper_reviewer_matcher/preprocess.py:
--------------------------------------------------------------------------------
import re
import string
from unidecode import unidecode
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer

__all__ = ["preprocess"]

stemmer = PorterStemmer()
w_tokenizer = WhitespaceTokenizer()
punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))

def preprocess(text, stemming=True):
    """
    Clean a given string and optionally apply the Porter stemmer

    Parameters
    ----------
    text : str, input abstract of papers/posters string
    stemming : boolean, apply Porter stemmer if True,
        default True
    """
    if isinstance(text, (type(None), float)):
        text_preprocess = ''
    else:
        text = unidecode(text).lower()
        text = punct_re.sub(' ', text)  # remove punctuation
        if stemming:
            text_preprocess = [stemmer.stem(token) for token in w_tokenizer.tokenize(text)]
        else:
            text_preprocess = w_tokenizer.tokenize(text)
        text_preprocess = ' '.join(text_preprocess)
    return text_preprocess
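# Illustrative example (not part of the original module): with stemming on,
# preprocess("Decoding neural signals!") yields something like
# 'decod neural signal' -- unidecode, lowercasing, and punctuation removal,
# followed by Porter stemming of each whitespace token.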
--------------------------------------------------------------------------------
/mm_feedback_site/process_feedback_data.py:
--------------------------------------------------------------------------------
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

if __name__ == '__main__':
    feedback_df = pd.read_json('data/ccn_feedback.json', orient='records', lines=True)
    feedback_df['timestamp'] = pd.to_datetime(feedback_df.timestamp, infer_datetime_format=True)
    # keep only the latest response per registrant
    feedback_df = feedback_df.sort_values('timestamp').groupby('registrant_id').last().reset_index().sort_values('timestamp')

    # text feedback from all responses
    n_text_feedback, n_responses = feedback_df.feedback_text.map(lambda x: x.strip() != '').sum(), len(feedback_df)
    print('number of responses = {}, number of text feedback entries = {}, percentage = {} %'.format(n_responses, n_text_feedback, 100 * n_text_feedback / n_responses))

    feedback_df['coi'] = feedback_df.coi.map(lambda x: ','.join(['1' if int(e) > 0 else '0' for e in x]))
    feedback_df['relevances'] = feedback_df.relevances.map(lambda x: ','.join(x))
    feedback_df['satisfactory'] = feedback_df.satisfactory.map(lambda x: ','.join(x))
    feedback_df.to_csv('data/ccn_2019_feedback.csv', index=False)  # to send to the organizers

    enjoyable = feedback_df.enjoyable.astype(int).values
    enjoyable = enjoyable[enjoyable > 0]
    print('average enjoyable score = {}'.format(enjoyable.mean()))

    usefulness = feedback_df.useful.astype(int).values
    usefulness = usefulness[usefulness > 0]
    print('average usefulness score = {}'.format(usefulness.mean()))

--------------------------------------------------------------------------------
/mm_feedback_site/create_mm_form_data.py:
--------------------------------------------------------------------------------
"""
Create the mind-matching data for the site from the matched mind-matching CSV file
and the CSV data given by the conference
"""
import pandas as pd
import json

if __name__ == '__main__':
    # read matched tables and the full dataset
    match_df = pd.read_csv('../CCN_2019/ccn_mindmatch_2019.csv')
    df = pd.read_csv('../CCN_2019/CN19_MindMatchData_20190903-A.csv')
    df = df.merge(match_df, how='left', on='RegistrantID')
    for i in range(0, 6):
        df['table_%s' % str(i)] = df.ScheduleTables.map(lambda x: x.split('|')[i])

    information = {}
    for _, r in df.iterrows():
        information[r['RegistrantID']] = {
            'registrant_id': r['RegistrantID'],
            'full_name': r['full_name'],
            'email': r['Email'],
            'affiliation': r['Affiliation']
        }

    mind_match_forms = []
    for _, r in df.iterrows():
        tables = r['ScheduleTables'].split('|')
        matches = []
        for i, table in enumerate(tables):
            mind_match_id = list(set(df[df['table_%s' % i] == table].RegistrantID.values) - {r['RegistrantID']})[0]
            matches.append(information[mind_match_id])
        mind_match_forms.append({
            'registrant_id': r['RegistrantID'],
            'full_name': r['full_name'],
            'email': r['Email'],
            'affiliation': r['Affiliation'],
            'matches_info': matches
        })
    json.dump(mind_match_forms, open('../CCN_2019/ccn_mind_match_feedback_form.json', 'w'), indent=4)
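# Each record written to ccn_mind_match_feedback_form.json has this shape
# (field values illustrative):
# {"registrant_id": 1013, "full_name": "...", "email": "...", "affiliation": "...",
#  "matches_info": [one entry per scheduled table, each with the same four
#  identity fields for the matched partner]}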
--------------------------------------------------------------------------------
/mm_feedback_site/flask_templates/head.html:
--------------------------------------------------------------------------------
<!-- head markup (meta/link/script tags) was lost in extraction; nothing else is recoverable -->

--------------------------------------------------------------------------------
/paper_reviewer_matcher/lp.py:
--------------------------------------------------------------------------------
import numpy as np
from tqdm.auto import tqdm
from scipy.sparse import coo_matrix
from ortools.linear_solver import pywraplp

__all__ = ["linprog"]

def linprog(f, A, b):
    """
    Solve the following linear programming problem
        maximize_x (f.T).dot(x)
        subject to A.dot(x) <= b
    where A is a sparse matrix (coo_matrix),
    f is a column vector of costs associated with the variables, and
    b is a column vector
    """

    # flatten the variables
    f = f.ravel()
    b = b.ravel()

    solver = pywraplp.Solver('SolveReviewerAssignment',
                             pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)

    infinity = solver.Infinity()
    n, m = A.shape
    x = [[]] * m
    c = [0] * n

    print("Setting up variables...")
    for j in tqdm(range(m)):
        x[j] = solver.NumVar(-infinity, infinity, 'x_%u' % j)

    # state the objective function
    print("Setting up objective function...")
    objective = solver.Objective()
    for j in tqdm(range(m)):
        objective.SetCoefficient(x[j], f[j])
    objective.SetMaximization()

    # state the constraints
    print("Setting up constraints...")
    for i in tqdm(range(n)):
        c[i] = solver.Constraint(-infinity, int(b[i]))
        for j in A.col[A.row == i]:
            c[i].SetCoefficient(x[j], A.data[np.logical_and(A.row == i, A.col == j)][0])

    result_status = solver.Solve()
    if result_status != 0:
        print("The final solution might not have converged")

    x_sol = np.array([x_tmp.SolutionValue() for x_tmp in x])

    return {'x': x_sol, 'status': result_status}


def test_example():
    """
    Solves the example problem from http://www.vitutor.com/alg/linear_programming/example_programming.html
        f(x,y) = 50x + 40y
        subject to:
        2x + 3y <= 1500
        2x + y <= 1000
        x >= 0  ->  -x <= 0
        y >= 0  ->  -y <= 0
    """
    f = np.array([50, 40], dtype=float)
    A = np.array([[ 2,  3],
                  [ 2,  1],
                  [-1,  0],
                  [ 0, -1]], dtype=float)
    C = np.array([1500, 1000, 0, 0])
    x_sol = linprog(f, coo_matrix(A), C)['x']
    print('Example Problem:')
    print('maximize_x\t 50x + 40y')
    print('s.t.\t\t 2x + 3y <= 1500, 2x + y <= 1000, x >= 0, y >= 0')
    print('Solution: (x, y) = ', x_sol)


if __name__ == '__main__':
    test_example()

--------------------------------------------------------------------------------
/paper_reviewer_matcher/mindmatch.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm.auto import tqdm
from .lp import linprog
from .affinity import create_lp_matrix, create_assignment

__all__ = ["perform_mindmatch"]


def compute_conflicts(df: pd.DataFrame, ratio: int = 85, sep: str = ";"):
    """
    Compute conflicts of interest for a given dataframe

    Parameters
    ==========
    df: pd.DataFrame, a dataframe with a column "conflicts"
        where each row has scientist names with a separator (default semicolon ;)
    ratio: int, fuzzy matching ratio; 100 means exact match, 85 allows some errors
    sep: str, the separator
    """
    cois = []
    for i, r in tqdm(df.iterrows()):
        exclude_list = r['conflicts'].split(sep)
        for j, r_ in df.iterrows():
            if max([fuzz.ratio(r_['fullname'], n) for n in exclude_list]) >= ratio:
                cois.append([i, j])
                cois.append([j, i])
    return cois


def perform_mindmatch(
    A: np.ndarray, n_trim: int = None,
    n_match: int = 6, cois: list = None
):
    """
    Perform mind-matching with a given affinity matrix A,
    trimming with n_trim (to reduce the problem size),
    and n_match matches per person
    """
    # set a large negative distance on the diagonal so people do not match themselves
    A[np.arange(len(A)), np.arange(len(A))] = -1000

    # if conflicts of interest (COIs) are available, add them to the matrix
    if cois:
        cois = [(c1, c2) for (c1, c2) in cois
                if c1 < len(A) and c2 < len(A)]  # make sure the given COIs are in range
        if len(cois) > 0:
            coi_idx = np.array(cois)
            A[coi_idx[:, 0], coi_idx[:, 1]] = -1000

    # trim the affinity matrix to reduce the problem size
    if n_trim:
        A_trim = []
        for r in range(len(A)):
            a = A[r, :]
            a[np.argsort(a)[0:n_trim]] = 0
            A_trim.append(a)
        A_trim = np.vstack(A_trim)
    else:
        A_trim = A

    # solve the matching problem
    print('Solving a matching problem...')
    v, K, d = create_lp_matrix(A_trim,
        min_reviewers_per_paper=n_match, max_reviewers_per_paper=n_match,
        min_papers_per_reviewer=n_match, max_papers_per_reviewer=n_match)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)

    if (b.sum() == 0):
        print('Seems like the problem does not converge, try reducing n_trim, but not too low!')
    else:
        print('Successfully assigned all the matches!')
    return b
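# Illustrative usage (see mindmatch.py at the repository root for the full
# pipeline; names and shapes assumed): given a square affinity matrix A over
# people and a list of conflict index pairs,
#     b = perform_mindmatch(A, n_trim=50, n_match=6, cois=cois)
# returns a binary matrix where b[i, j] = 1 means person i meets person j.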
--------------------------------------------------------------------------------
/group_matching.py:
--------------------------------------------------------------------------------
"""
=========================
neuromatch Group Matching
=========================

Group matching script and its output. We read the dataset from Cloud Firestore
and export it as CSV and JSON files.

TODO: make it a script, add documentation on how it works
"""

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm

from scipy.cluster.hierarchy import linkage
import hcluster  # requires dedupe-hcluster
from paper_reviewer_matcher import (
    preprocess, compute_affinity
)


def compute_conflicts(df):
    """
    Compute conflicts of interest for a given dataframe
    """
    cois = []
    for i, r in tqdm(df.iterrows()):
        exclude_list = r['conflicts'].split(';')
        for j, r_ in df.iterrows():
            if max([fuzz.ratio(r_['fullname'], n) for n in exclude_list]) >= 85:
                cois.append([i, j])
                cois.append([j, i])
    return cois

def generate_pod_numbers(n_users, n_per_group):
    """
    Generate pod numbers in sequence
    """
    groups = []
    for i in range(1, int(n_users / n_per_group) + 2):
        groups.extend([i] * n_per_group)
    groups = groups[:n_users]
    return groups


if __name__ == '__main__':
    users = pd.read_csv('data/mindmatch_example.csv').to_dict(orient='records')
    n_users = len(users)
    print('Number of registered users: {}'.format(n_users))

    users_df = pd.DataFrame(users).fillna('')
    users_dict = {r['user_id']: dict(r) for _, r in users_df.iterrows()}  # map of user id to details
    persons_1 = list(map(preprocess, list(users_df['abstracts'])))
    persons_2 = list(map(preprocess, list(users_df['abstracts'])))
    A = compute_affinity(
        persons_1, persons_2,
        n_components=30, min_df=2, max_df=0.8,
        weighting='tfidf', projection='svd'
    )
    cois_list = compute_conflicts(users_df)
    for i, j in cois_list:
        A[i, j] = -1

    A_cluster = - A
    A_cluster[A_cluster == 1000] = 1
    A_rand = np.random.randn(n_users, n_users) * 0.01 * A_cluster.var()  # add randomness

    z = linkage(A_cluster + A_rand,
                method='average',
                metric='euclidean',
                optimal_ordering=True)
    cluster = hcluster.fcluster(z, t=0.01,
                                criterion='distance')  # cluster by distance threshold
    users_df['cluster'] = cluster
    users_sorted_df = users_df.sort_values('cluster')
    cluster_numbers = generate_pod_numbers(n_users=len(users_sorted_df), n_per_group=5)
    users_sorted_df['cluster'] = cluster_numbers
    users_sorted_df.to_csv('group_matching_users.csv', index=False)
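# Illustrative example: generate_pod_numbers(n_users=7, n_per_group=3)
# returns [1, 1, 1, 2, 2, 2, 3] -- sequential pod ids, truncated to n_users.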
--------------------------------------------------------------------------------
/mm_feedback_site/main.py:
--------------------------------------------------------------------------------
import os
import json
import flask
from flask import Flask, request
import datetime


def read_json(file_path):
    """
    Read collected responses from path
    """
    if not os.path.exists(file_path):
        return []
    else:
        with open(file_path, 'r') as fp:
            ls = [json.loads(line) for line in fp]
        return ls


def save_json(ls, file_path):
    """
    Save a list of dictionaries as line-delimited JSON
    """
    with open(file_path, 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in ls))


app = Flask(__name__,
            template_folder='flask_templates')
app.secret_key = 'made at Kording Lab.'
app.config['TEMPLATES_AUTO_RELOAD'] = True

MINDMATCH_DATA_PATH = 'data/ccn_mind_match_feedback_form.json'
FEEDBACK_DATA_PATH = 'data/ccn_feedback.json'
mind_match_data = json.load(open(MINDMATCH_DATA_PATH, 'r'))


@app.route("/", methods=['GET', 'POST'])
def index():
    return flask.render_template('index.html')


@app.route('/regid/<reg_id>')
def feedback_form(reg_id):
    try:
        data = [d for d in mind_match_data
                if d['registrant_id'] == int(reg_id)][0]
        for match in data['matches_info']:
            match.pop('registrant_id', None)
        data.update({
            'enumerate': enumerate,
        })
    except (IndexError, ValueError):
        data = {
            "registrant_id": 0,
            "full_name": "John Doe",
            "email": "john_doe@gmail.com",
            "affiliation": "Random University",
            "matches_info": [],
            "enumerate": enumerate
        }
    return flask.render_template('feedback.html', **data)


@app.route('/handle_submit/', methods=['GET', 'POST'])
def handle_submit():
    # save data here
    if request.method == 'POST':
        print(request.form)
        feedback_data = read_json(FEEDBACK_DATA_PATH)
        registrant_id = request.form['registrant_id']
        feedback_text = request.form.get('text_input', '')
        relevances = [request.form.get('relevance_%s' % i, '0')
                      for i in range(0, 6)]
        satisfactory = [request.form.get('satisfactory_%s' % i, '0')
                        for i in range(0, 6)]
        coi = [request.form.get('coi_%s' % i, '0') for i in range(0, 6)]
        arrange_before = request.form.get('before_checkbox', '0')

        useful = request.form.get('useful', '0')
        enjoyable = request.form.get('enjoyable', '0')

        feedback_data.append({
            'registrant_id': registrant_id,
            'relevances': relevances,
            'satisfactory': satisfactory,
            'coi': coi,
            'feedback_text': feedback_text,
            'arrange_before': arrange_before,
            'useful': useful,
            'enjoyable': enjoyable,
            'timestamp': str(datetime.datetime.now())
        })
        save_json(feedback_data, FEEDBACK_DATA_PATH)
    # return to the default page
    return flask.redirect('/')


if __name__ == "__main__":
    app.run(debug=True, host='0.0.0.0', threaded=True)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.gitignore.io

mind_match_feedback_form/data/*.json

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*

# Org-mode
.org-id-locations
*_archive

# flymake-mode
*_flymake.*

# eshell files
/eshell/history
/eshell/lastdir

# elpa packages
/elpa/

# reftex files
*.rel

# AUCTeX auto folder
/auto/

# cask packages
.cask/


### OSX ###
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

# R things
.Rhistory
.RData

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/dictionaries
.idea/vcs.xml
.idea/jsLibraryMappings.xml

# Sensitive or high-churn files:
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml

# Gradle:
.idea/gradle.xml
.idea/libraries

# Mongo Explorer plugin:
.idea/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
--------------------------------------------------------------------------------
/mindmatch.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""MindMatch: a script for matching people to people at a conference

Usage:
  mindmatch.py PATH [--n_match=<n_match>] [--n_trim=<n_trim>] [--output=<output>]
  mindmatch.py [-h | --help]
  mindmatch.py [-v | --version]

Arguments:
  PATH                  Path to a CSV file;
                        the file needs ('user_id', 'fullname', 'abstracts', 'conflicts') in the header

Options:
  -h --help             Show this documentation
  --version             Show version
  --n_match=<n_match>   Number of matches per user
  --n_trim=<n_trim>     Trimming parameter for the distance matrix; increase to reduce the problem size
  --output=<output>     Output CSV file with 'user_id' and 'match_ids', the match ids separated by ;
"""
import numpy as np
import pandas as pd
from docopt import docopt
from paper_reviewer_matcher import (
    preprocess,
    compute_affinity,
    perform_mindmatch,
    compute_conflicts
)


if __name__ == "__main__":
    arguments = docopt(__doc__, version='MindMatch 0.1.dev')

    file_name = arguments['PATH']
    df = pd.read_csv(file_name).fillna('').sample(n=500).reset_index(drop=True)  # example run on a sample of 500 people
    assert 'user_id' in df.columns, "CSV file must have ``user_id`` in the columns"
    assert 'fullname' in df.columns, "CSV file must have ``fullname`` in the columns"
    assert 'abstracts' in df.columns, "CSV file must have ``abstracts`` in the columns"
    assert 'conflicts' in df.columns, "CSV file must have ``conflicts`` in the columns"
    print("Number of people in the file = {}".format(len(df)))

    n_match = arguments.get('--n_match')
    if n_match is None:
        n_match = 6
        print('<n_match> is set to the default of 6 matches per user')
    else:
        n_match = int(n_match)
        print('Number of matches is set to {}'.format(n_match))
        assert n_match >= 2, "You should set <n_match> to more than 2"

    n_trim = arguments.get('--n_trim')
    if n_trim is None:
        n_trim = 0
        print('<n_trim> is set to the default of 0; this will take very long to converge for a large problem')
    else:
        n_trim = int(n_trim)
        print('Trimming parameter is set to {}'.format(n_trim))

    output_filename = arguments.get('--output')
    if output_filename is None:
        output_filename = 'output_match.csv'

    # create the affinity matrix and compute conflicts
    persons_1 = list(map(preprocess, list(df['abstracts'])))
    persons_2 = list(map(preprocess, list(df['abstracts'])))
    A = compute_affinity(
        persons_1, persons_2,
        n_components=30, min_df=3, max_df=0.85,
        weighting='tfidf', projection='pca'
    )
    print('Compute conflicts... (this may take a bit)')
    cois = compute_conflicts(df, ratio=85)
    print('Done computing conflicts!')

    # perform mind-matching
    b = perform_mindmatch(A, n_trim=n_trim, n_match=n_match, cois=cois)

    if (b.sum() != 0):
        output = []
        user_ids_map = {ri: r['user_id'] for ri, r in df.iterrows()}
        for i in range(len(b)):
            match_ids = [str(user_ids_map[b_]) for b_ in np.nonzero(b[i])[0]]
            output.append({
                'user_id': user_ids_map[i],
                'match_ids': ';'.join(match_ids)
            })
        output_df = pd.DataFrame(output)
        output_df.to_csv(output_filename, index=False)
        print("Successfully saved the output match to {}".format(output_filename))
    else:
        print("Cannot solve the problem")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Paper-Reviewer Matcher

A Python package for a paper-reviewer matching algorithm based on topic modeling and linear programming. The algorithm is implemented based on this [article](http://www.cis.upenn.edu/~cjtaylor/PUBLICATIONS/pdfs/TaylorTR08.pdf). This package solves the problem of assigning papers to reviewers under constraints by solving a linear programming problem.
We minimize the global distance between papers and reviewers in topic space (where the topic model can be principal component analysis, Latent Semantic Analysis (LSA), etc.).

Here is a diagram of the problem setup and how we solve it.

<img src="figures/problem_setup.png">

## Mind-Match Command Line

Mind-Match is a session we run at the Cognitive Computational Neuroscience (CCN) conference.
We use a combination of topic modeling and linear programming to solve the optimal matching problem.
To run the example Mind-Match algorithm on a sample of 500 people, you can clone the repository and run the following

```sh
python mindmatch.py data/mindmatch_example.csv --n_match=6 --n_trim=50
```

in the root of this repo. This should produce a matching output `output_match.csv` in this relative location.
However, as the number of people grows much larger, this script takes quite a long time to run.
We pre-cluster people into groups before running the mind-matching to make the script run faster.
Below is an example script for pre-clustering and mind-matching on all data:

```sh
python mindmatch_cluster.py data/mindmatch_example.csv --n_match=6 --n_trim=50 --n_clusters=4
```

## Example scripts for the conferences

Here, I include recent scripts for our Mind Matching session at the CCN conference.

- `ccn_mind_matching_2019.py` contains the script for the Mind Matching session (matching scientists to scientists) for the [CCN conference](https://ccneuro.org/2018/)
- `ccn_paper_reviewer_matching_2019.py` contains the script for matching publications to reviewers for the [CCN conference](https://ccneuro.org/2019/); see example CSV files in the `data` folder

The code builds a topic-distance metric between incoming papers and reviewers (for `ccn_paper_reviewer_matching_2019.py`) and
between people and people (for `ccn_mind_matching_2019`). We trim the metric so that the problem is not too big to solve using `or-tools`.
It then solves a linear programming problem to assign the best matches, minimizing the global distance between papers and reviewers.
After that, we produce output that can be used by the organizers of the CCN conference -- pairs of papers and reviewers, or a mind-matching
schedule between people at the conference.
You can see how it works below.

<img src="figures/paper_reviewer_matching.png">
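For reference, a minimal end-to-end sketch of the pipeline using this package's API (illustrative only -- the file paths, column names, and parameter values are assumptions; see `mindmatch.py` and the `ccn` scripts for the real entry points):

```python
import numpy as np
import pandas as pd
from paper_reviewer_matcher import (
    preprocess, compute_affinity, create_lp_matrix, linprog, create_assignment
)

# abstracts for papers and reviewers (columns assumed for illustration)
papers = list(pd.read_csv('data/article.csv')['Abstract'].map(preprocess))
reviewers = list(pd.read_csv('data/reviewer.csv')['Abstract'].map(preprocess))

# affinity between every paper and reviewer in a shared topic space
A = compute_affinity(papers, reviewers,
                     n_components=10, min_df=2, max_df=0.8,
                     weighting='tfidf', projection='pca')

# solve the assignment as a linear program and read off the matches
v, K, d = create_lp_matrix(A,
                           min_reviewers_per_paper=2, max_reviewers_per_paper=2,
                           min_papers_per_reviewer=2, max_papers_per_reviewer=2)
b = create_assignment(linprog(v, K, d)['x'], A)
print(np.nonzero(b[0])[0])  # indices of reviewers assigned to the first paper
```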
## Dependencies

Use `pip` to install the dependencies

```sh
pip install -r requirements.txt
```

Please see [Stackoverflow](http://stackoverflow.com/questions/26593497/cant-install-or-tools-on-mac-10-10) if you have a problem installing `or-tools` on MacOS. You can use `pip` to install `protobuf` before installing `or-tools`

```sh
pip install protobuf==3.0.0b4
pip install ortools
```

for Python 3.6,

```sh
pip install --user --upgrade ortools
```

## Citations

If you use Paper-Reviewer Matcher in your work or conference, please cite us as follows

```
@misc{achakulvisut2018,
    author = {Achakulvisut, Titipat and Acuna, Daniel E. and Kording, Konrad},
    title = {Paper-Reviewer Matcher},
    year = {2018},
    publisher = {GitHub},
    journal = {GitHub repository},
    howpublished = {\url{https://github.com/titipata/paper-reviewer-matcher}},
    commit = {9d346ee008e2789d34034c2b330b6ba483537674}
}
```

## Members

- [Daniel Acuna](https://scienceofscience.org/) (original author)
- [Titipat Achakulvisut](https://github.com/titipata) (refactor)
- [Konrad Kording](http://kordinglab.com/)
--------------------------------------------------------------------------------
/mindmatch_cluster.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""Mind match with pre-clustering: a script for matching people to people at a conference.
Here, we add a trick to make the problem smaller by applying spectral clustering and then
applying mind-matching within each cluster.

Usage:
  mindmatch_cluster.py PATH [--n_match=<n_match>] [--n_trim=<n_trim>] [--output=<output>] [--n_clusters=<n_clusters>]
  mindmatch_cluster.py [-h | --help]
  mindmatch_cluster.py [-v | --version]

Arguments:
  PATH                        Path to a CSV file;
                              the file needs ('user_id', 'fullname', 'abstracts', 'conflicts') in the header

Options:
  -h --help                   Show this documentation
  --version                   Show version
  --n_match=<n_match>         Number of matches per user
  --n_clusters=<n_clusters>   Number of clusters before performing mind-matching
  --n_trim=<n_trim>           Trimming parameter for the distance matrix; increase to reduce the problem size
  --output=<output>           Output CSV file with 'user_id' and 'match_ids', the match ids separated by ;
"""
import numpy as np
import pandas as pd
from docopt import docopt
from paper_reviewer_matcher import (
    preprocess,
    compute_topics,
    perform_mindmatch,
    compute_conflicts,
    calculate_affinity_distance
)
from sklearn.cluster import SpectralClustering

if __name__ == "__main__":
    arguments = docopt(__doc__, version='MindMatch 0.1.dev')

    file_name = arguments['PATH']
    df = pd.read_csv(file_name).fillna('')
    assert 'user_id' in df.columns, "CSV file must have ``user_id`` in the columns"
    assert 'fullname' in df.columns, "CSV file must have ``fullname`` in the columns"
    assert 'abstracts' in df.columns, "CSV file must have ``abstracts`` in the columns"
    assert 'conflicts' in df.columns, "CSV file must have ``conflicts`` in the columns"
    print("Number of people in the file = {}".format(len(df)))

    n_match = arguments.get('--n_match')
    if n_match is None:
        n_match = 6
        print('<n_match> is set to the default of 6 matches per user')
    else:
        n_match = int(n_match)
        print('Number of matches is set to {}'.format(n_match))
        assert n_match >= 2, "You should set <n_match> to more than 2"

    n_trim = arguments.get('--n_trim')
    if n_trim is None:
        n_trim = 0
        print('<n_trim> is set to the default of 0; this will take very long to converge for a large problem')
    else:
        n_trim = int(n_trim)
        print('Trimming parameter is set to {}'.format(n_trim))

    n_clusters = arguments.get('--n_clusters')
    if n_clusters is None:
        n_clusters = 4
        print('Setting number of clusters to the default of 4')
    else:
        n_clusters = int(n_clusters)
        print('Setting number of clusters to {}'.format(n_clusters))

    output_filename = arguments.get('--output')
    if output_filename is None:
        output_filename = 'output_match.csv'

    # compute topics
    X_topic = compute_topics(list(map(preprocess, list(df['abstracts']))))
    spectral_clustering = SpectralClustering(n_clusters=n_clusters, random_state=42)
    labels = spectral_clustering.fit_predict(X_topic)
    labels[0] = 3  # a trick so that each group has even numbers; this is specific to this example

    df["group"] = labels
    df["topics"] = [x for x in X_topic]

    output = []
    for _, df_group in df.groupby("group"):
        X = np.vstack(df_group.topics.values)  # topics
        A = calculate_affinity_distance(X, X)  # calculate the affinity matrix
        cois = compute_conflicts(df_group.reset_index(drop=True))  # COIs from names
        b = perform_mindmatch(A, n_trim=n_trim, n_match=n_match, cois=cois)  # perform the matching

        user_ids_map = {ri: r['user_id'] for ri, r in df_group.reset_index(drop=True).iterrows()}
        for i in range(len(b)):
            match_ids = [str(user_ids_map[b_]) for b_ in np.nonzero(b[i])[0]]
            output.append({
                'user_id': user_ids_map[i],
                'match_ids': ';'.join(match_ids)
            })
    output_df = pd.DataFrame(output)
    output_df.to_csv(output_filename, index=False)
--------------------------------------------------------------------------------
/ccn/ccn_paper_reviewer_matching_2019.py:
--------------------------------------------------------------------------------
from glob import glob
import numpy as np
import pandas as pd
import scipy.sparse as sp
from paper_reviewer_matcher import (
    preprocess, compute_affinity,
    create_lp_matrix, linprog,
    create_assignment
)


def assign_articles_to_reviewers(article_df, reviewer_df, people_df):
    """
    Perform reviewer assignment from dataframes of articles, reviewers, and people

    Parameters
    ==========
    article_df: a dataframe that has columns `PaperID`, `Title`, `Abstract`, and `PersonIDList`
        where PersonIDList contains a semicolon-separated list of PersonIDs
    reviewer_df: a dataframe that has columns `PersonID` and `Abstract`
    people_df: a dataframe that has columns `PersonID` and `FullName`

    We assume `PersonID` is an integer

    Output
    ======
    article_assignment_df: an assigned-reviewers dataframe; each article row has a
        list of reviewers in the `ReviewerIDList` column and their names in `reviewer_names`
    """
    papers = list((article_df['Title'] + ' ' + article_df['Abstract']).map(preprocess))
    reviewers = list(reviewer_df['Abstract'].map(preprocess))

    # calculate conflicts of interest based on co-authors
    coauthors_df = pd.DataFrame([[int(r.PaperID), int(co_author)]
                                 for _, r in article_df.iterrows()
                                 for co_author in r.PersonIDList.split(';')],
                                columns=['PaperID', 'PersonID'])
    article_df['paper_id'] = list(range(len(article_df)))
    reviewer_df['person_id'] = list(range(len(reviewer_df)))
    coi_df = coauthors_df.merge(article_df[['PaperID', 'paper_id']],
                                on='PaperID').merge(reviewer_df[['PersonID', 'person_id']],
                                                    on='PersonID')[['paper_id', 'person_id']]

    # calculate the affinity matrix
    A = compute_affinity(
        papers, reviewers,
        n_components=10, min_df=2, max_df=0.8,
        weighting='tfidf', projection='pca'
    )

    # trim distances that are too high
    A_trim = []
    for r in range(len(A)):
        a = A[r, :]
        a[np.argsort(a)[0:200]] = 0
        A_trim.append(a)
    A_trim = np.vstack(A_trim)

    # assign conflicts of interest a high negative cost
    for i, j in zip(coi_df.paper_id.tolist(), coi_df.person_id.tolist()):
        A_trim[i, j] = -1000

    # for the CCN case
    v, K, d = create_lp_matrix(A_trim,
        min_reviewers_per_paper=6, max_reviewers_per_paper=6,
        min_papers_per_reviewer=4, max_papers_per_reviewer=6)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)
    reviewer_ids = list(reviewer_df.PersonID)
    reviewer_name_dict = {r['PersonID']: r['FullName'] for _, r in people_df.iterrows()}  # map reviewer id to reviewer name
    assignments = []
    for i in range(len(b)):
        assignments.append([i,
                            [reviewer_ids[b_] for b_ in np.nonzero(b[i])[0]],
                            [reviewer_name_dict[reviewer_ids[b_]] for b_ in np.nonzero(b[i])[0]]])
    assignments_df = pd.DataFrame(assignments, columns=['paper_id', 'ReviewerIDList', 'reviewer_names'])
    assignments_df['ReviewerIDList'] = assignments_df.ReviewerIDList.map(lambda e: ';'.join(str(e_) for e_ in e))
    assignments_df['reviewer_names'] = assignments_df.reviewer_names.map(lambda x: ';'.join(x))
    article_assignment_df = article_df.merge(assignments_df, on='paper_id').drop('paper_id', axis=1)
    return article_assignment_df


if __name__ == '__main__':
    CCN_PATH = '/path/to/*.csv'
    article_path, reviewer_path, people_path = [path for path in glob(CCN_PATH)
                                                if 'CCN' in path and 'fixed' not in path]
    # there is a problem with the encoding of some lines in the given CSVs, so we use ISO-8859-1 instead
    article_df = pd.read_csv(article_path)  # has columns `PaperID`, `Title`, `Abstract`, `PersonIDList`
    reviewer_df = pd.read_csv(reviewer_path, encoding="ISO-8859-1")  # has columns `PersonID`, `Abstract`
    people_df = pd.read_csv(people_path, encoding="ISO-8859-1")  # has columns `PersonID`, `FullName`
    article_assignment_df = assign_articles_to_reviewers(article_df, reviewer_df, people_df)
    article_assignment_df.to_csv('article_assignment.csv', index=False)

--------------------------------------------------------------------------------
/mm_feedback_site/static/css/style.css:
--------------------------------------------------------------------------------
/* -------------- */
/* custom styling */
/* -------------- */

html {
    /* use default font from bootstrap 4 */
    /* font-family: 'helvetica neue', helvetica, arial, sans-serif; */
    position: relative;
    min-height: 100%;
}

body {
    /* Margin bottom by footer height */
    margin-bottom: 60px;
}

body > .container {
    padding: 35px 0 15px 0;
}

.navbar .container {
    padding: 0;
}

.footer {
    text-align: center;
    position: absolute;
    bottom: 0;
    width: 100%;
    /* Set the fixed height of the footer here */
    height: 70px;
    line-height: 20px; /* Vertically center the text there */
    background-color: #1E303C;
    color: #fff;
    font-size: 12px;
}

.footer > .container {
    padding-top: 15px;
}

/* remove outer glow of textarea input */
.form-control:focus {
    outline-color: transparent;
    outline-style: none;
    box-shadow: none;
}

/* -------------- */
/* result styling */
/* -------------- */

.centerValue {
    width: auto;
    /*padding-right: 1%;*/
    display: block;
    /*vertical-align: top;*/
    padding-top: 0.25em;
}

.leftBox {
    text-align: center;
    width: 6%;
    display: inline-block;
}

.rightTxt {
    margin-bottom: 20px;
    width: 92%;
    padding-left: 2%;
    display: inline-block;
    vertical-align: middle;
}

.navbar-default .navbar-brand {
    font-weight: 600;
}

/* mobile first */
@media only screen and (min-width: 768px) {
    .navbar-default .navbar-brand {
        font-size: 1.5rem;
    }
}

/* mobile */
@media only screen and (max-width: 768px) {
    .leftBox {
        width: 15%;
    }

    .rightTxt {
        width: 83%;
    }
}

@media only screen and (max-width: 575px) {
    body > .container {
        width: 92%;
    }

    .leftBox {
        display: none;
    }

    .rightTxt {
        width: 100%;
        /* padding: 0; */
    }

    #data-entity-results-dim {
        border-left: 4px solid #81ff76;
    }
    #data-entity-conclusions-dim {
        border-left: 4px solid #ffa425;
    }
    #data-entity-methods-dim {
        border-left: 4px solid #7cb9e8;
    }
    #data-entity-objective-dim {
        border-left: 4px solid #fb607f;
    }
    #data-entity-background-dim {
        border-left: 4px solid #f6f3b4;
    }
}

@media only screen and (max-width: 450px) {
    body {
        margin-bottom: 90px;
    }

    .footer {
        height: 90px;
    }
}

/* ----------------------- */
/* progress bar overriding */
/* ----------------------- */

.progress {
    vertical-align: top;
    height: 6px;
    margin-top: 0.5em;
    margin-bottom: auto;
    border-radius: 2px;
    /*background-color: #DDD;
    box-shadow: none;
    border: solid #ddd 0.25px;
    border-left: solid #000 1px;*/
}

.progress-bar {
    background-color: #F8B902;
}

/* ----------- */
/* submit form */
/*------------ */

.submit-button-wrapper {
    display: flex;
    flex: 1;
    padding: 15px 0 30px 0;
    align-items: center;
    justify-content: center;
}

/* ----------------- */
/* navbar overriding */
/* ----------------- */

.p-in-nav {
    padding: 15px 15px 0px 15px;
    color: #ff3848;
}

.override-nav-color, .panel-default>.panel-heading {
    background-color: #1E303C;
}

.override-nav-color, .navbar-default .navbar-nav>li>a, .navbar-default .navbar-brand, .panel-default>.panel-heading {
    color: #fff;
}

.navbar-default .navbar-nav>li>a:hover, .navbar-default .navbar-brand:hover {
    color: #ddd;
}

.navbar-default .navbar-nav>li>a:focus, .navbar-default .navbar-brand:focus {
    color: #bbb;
}

.login-panel {
    margin-top: 10%;
}

.override-button-font {
    font-size: 1.25em;
    font-weight: bold;
    color: #333;
}

.override-button-size {
    padding: .6rem 1rem;
}

.label-entity {
    padding: .2em .3em;
    font-size: 0.8rem;
    margin: 0 .25em;
    line-height: 1;
    display: inline-block;
    border-radius: .25em;
    box-sizing: border-box;
}

#data-entity-background {
    background-color: #f6f3b4;
}

#data-entity-objective {
    background-color: #fb607f;
}

#data-entity-methods {
    background-color: #7cb9e8;
}

#data-entity-conclusions {
    background-color: #ffa525;
}

#data-entity-results {
    background-color: #81ff76;
}

#data-entity-claim {
    background-color: #ff7f7f;
}

#data-entity-not-claim {
    background-color: #dddddd;
}
--------------------------------------------------------------------------------
/cosyne/cosyne_paper_reviewer_matching_2020.py:
--------------------------------------------------------------------------------
import re
from glob import glob
import numpy as np
import pandas as pd
import paper_reviewer_matcher as pm
import scipy.sparse as sp
from paper_reviewer_matcher import (
    preprocess, compute_affinity,
    create_lp_matrix, create_assignment,
    linprog
)
from fuzzywuzzy import fuzz


def find_user_ids(authors):
    user_ids = re.findall(r'#(\w+)', authors)
    return [int(idx) for idx in user_ids]


def clean_keywords(keywords):
    keywords = keywords.replace('[', '')
    keywords = keywords.replace(']', '')
    keywords = keywords.replace(',', '')
    keywords = keywords.replace('/', '')
    return keywords


def clean_authors(authors):
    return re.sub(r'#(\w+)', '', authors).replace('()', '')


def create_coi_list(authors, df):
    cois = []
    for i, r in df.iterrows():
        for cl in r['CollaboratorsList']:
            if max([fuzz.ratio(a, cl) for a in authors]) >= 80:
                cois.append(i)
    return cois


def create_coi_author_ids(user_ids, df):
    cois = []
    for i, r in df.iterrows():
        if r['UserID'] in user_ids:
            cois.append(i)
    return cois


def create_assignment_dataframe(b, reviewer_map, paper_id_map, pool_group='a'):
    """
    Take the assignment array and generate the assignment dataframe
    """
    assignments = []
    for i in range(len(b)):
        assignments.append([
            paper_id_map[i], [reviewer_map[b_] for b_ in np.nonzero(b[i])[0]]
        ])
    assignments_df = pd.DataFrame(assignments, columns=['PaperID', 'UserIDs'])
    n_reviewers = len(assignments_df.UserIDs.iloc[0])
    for c in range(n_reviewers):
        assignments_df['UserID_{}_{}'.format(
            pool_group, c + 1)] = assignments_df.UserIDs.map(lambda x: x[c])
    return assignments_df.drop('UserIDs', axis=1)


if __name__ == '__main__':
    submission_path, reviewer_a_path, reviewer_b_path = glob('PATH_TO/cosyne-2020/*.csv')
    submission_df = pd.read_csv(submission_path)
    reviewer_a_df = pd.read_csv(reviewer_a_path)
    reviewer_b_df = pd.read_csv(reviewer_b_path)
    submission_df.loc[:, 'keywords'] = submission_df.Keywords.map(lambda x: clean_keywords(x))
    reviewer_a_df.loc[:, 'keywords'] = reviewer_a_df.Keywords.fillna('').map(lambda x: clean_keywords(x))
    reviewer_b_df.loc[:, 'keywords'] = reviewer_b_df.Keywords.fillna('').map(lambda x: clean_keywords(x))
    reviewer_a_df['UserID'] = reviewer_a_df.UserID.astype(int)
    reviewer_b_df['UserID'] = reviewer_b_df.UserID.astype(int)
    reviewer_a_df['FullName'] = reviewer_a_df['FirstName'] + \
        ' ' + reviewer_a_df['LastName']
    reviewer_b_df['FullName'] = reviewer_b_df['FirstName'] + \
        ' ' + reviewer_b_df['LastName']
    submission_df['AuthorIds'] = submission_df.Authors.map(find_user_ids)
    submission_df['AuthorsList'] = submission_df.Authors.map(
        lambda x: [n.strip() for n in clean_authors(x).split(',')])

    reviewer_a_df['CollaboratorsList'] = reviewer_a_df['Collaborators'].map(
        lambda x: [n.strip() for n in x.replace(',', ';').split(';') if n is not None])
    reviewer_b_df['CollaboratorsList'] = reviewer_b_df['Collaborators'].map(
        lambda x: [n.strip() for n in x.replace(',', ';').split(';') if n is not None])
    reviewer_a_df['CollaboratorsList'] = reviewer_a_df['FullName'].map(
        lambda x: [x]) + reviewer_a_df['CollaboratorsList']
    reviewer_b_df['CollaboratorsList'] = reviewer_b_df['FullName'].map(
        lambda x: [x]) + reviewer_b_df['CollaboratorsList']
    reviewer_df = pd.concat(
        (reviewer_a_df, reviewer_b_df)).reset_index(drop=True)

    # affinity matrix
    papers = list((submission_df['keywords'] +
                   ' ' + submission_df['Title'] +
                   ' ' + submission_df['Abstract']).map(preprocess))
    reviewers_a = list((reviewer_a_df['keywords'] +
                        ' ' + reviewer_a_df['SampleAbstract1'].fillna('') +
                        ' ' + reviewer_a_df['SampleAbstract2'].fillna('')).map(preprocess))
    reviewers_b = list((reviewer_b_df['keywords'] +
                        ' ' + reviewer_b_df['SampleAbstract1'].fillna('') +
                        ' ' + reviewer_b_df['SampleAbstract2'].fillna('')).map(preprocess))
    A = compute_affinity(papers, reviewers_a + reviewers_b,
                         n_components=15, min_df=2, max_df=0.85,
                         weighting='tfidf', projection='pca')

    # COIs
    cois_ids = submission_df.AuthorIds.map(
        lambda x: create_coi_author_ids(x, reviewer_df))
    cois = submission_df.AuthorsList.map(
        lambda x: create_coi_list(x, reviewer_df))
    cois_df = pd.DataFrame(cois + cois_ids, columns=['AuthorsList'])
    for i, r in cois_df.iterrows():
        if len(r['AuthorsList']) > 0:
            for idx in r['AuthorsList']:
                A[i, idx] = -1000

    # assignment
    A_a, A_b = A[:, :len(reviewer_a_df)], A[:, len(reviewer_a_df):]
    v, K, d = create_lp_matrix(A_a,
        min_reviewers_per_paper=2, max_reviewers_per_paper=2,
        min_papers_per_reviewer=10, max_papers_per_reviewer=12)
    x_sol = linprog(v, K, d)['x']
    b_a = create_assignment(x_sol, A_a)

    v, K, d = create_lp_matrix(A_b,
        min_reviewers_per_paper=2, max_reviewers_per_paper=2,
        min_papers_per_reviewer=10, max_papers_per_reviewer=12)
    x_sol = linprog(v, K, d)['x']
    b_b = create_assignment(x_sol, A_b)

    reviewer_a_map = {i: r['UserID'] for i, r in reviewer_a_df.iterrows()}
    reviewer_b_map = {i: r['UserID'] for i, r in reviewer_b_df.iterrows()}
    paper_id_map = {i: r['PaperID'] for i, r in submission_df.iterrows()}

    assignments_a_df = create_assignment_dataframe(b_a, reviewer_a_map,
                                                   paper_id_map,
                                                   pool_group='a')
    assignments_b_df = create_assignment_dataframe(b_b, reviewer_b_map,
                                                   paper_id_map,
                                                   pool_group='b')

    # write to excel sheets
    writer = pd.ExcelWriter('cosyne-2020-match.xlsx',
                            engine='xlsxwriter')
    assignments_a_df.to_excel(writer, sheet_name='reviewer_pool_a')
    assignments_b_df.to_excel(writer, sheet_name='reviewer_pool_b')
    writer.save()
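# Illustrative example: find_user_ids("Jane Doe (#123), John Roe (#45)")
# returns [123, 45], i.e. the #-prefixed registrant ids embedded in the
# author string; clean_authors() strips those markers back out of the names.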
"calculate_affinity_distance", 12 | "compute_affinity", 13 | "create_lp_matrix", 14 | "create_assignment"] 15 | 16 | 17 | def compute_topics( 18 | papers: list, 19 | weighting='tfidf', 20 | projection='svd', 21 | min_df=3, max_df=0.8, 22 | lowercase=True, norm='l2', 23 | analyzer='word', token_pattern=r'\w{1,}', 24 | ngram_range=(1, 1), 25 | n_components=30, 26 | stop_words='english' 27 | ): 28 | """ 29 | Compute topics from a given list of ``papers`` 30 | """ 31 | if weighting == 'count': 32 | model = CountVectorizer(min_df=min_df, max_df=max_df, 33 | token_pattern=token_pattern, 34 | ngram_range=ngram_range, 35 | stop_words=stop_words) 36 | elif weighting == 'tfidf': 37 | model = TfidfVectorizer(min_df=min_df, max_df=max_df, 38 | lowercase=lowercase, norm=norm, 39 | token_pattern=token_pattern, 40 | ngram_range=ngram_range, 41 | use_idf=True, smooth_idf=True, sublinear_tf=True, 42 | stop_words=stop_words) 43 | elif weighting == 'entropy': 44 | model = LogEntropyVectorizer(min_df=min_df, max_df=max_df, 45 | lowercase=lowercase, 46 | token_pattern=token_pattern, 47 | ngram_range=ngram_range, 48 | stop_words=stop_words) 49 | elif weighting == 'bm25': 50 | model = BM25Vectorizer(min_df=min_df, max_df=max_df, 51 | lowercase=lowercase, 52 | token_pattern=token_pattern, 53 | ngram_range=ngram_range, 54 | stop_words=stop_words) 55 | else: 56 | print("select weighting scheme from ['count', 'tfidf', 'entropy', 'bm25']") 57 | 58 | X = model.fit_transform(papers) # weighting matrix 59 | 60 | # topic modeling 61 | if projection == 'svd': 62 | topic_model = TruncatedSVD(n_components=n_components, algorithm='arpack') 63 | X_topic = topic_model.fit_transform(X) 64 | elif projection == 'pca': 65 | topic_model = PCA(n_components=n_components) 66 | X_topic = topic_model.fit_transform(X.todense()) 67 | else: 68 | print("select projection from ['svd', 'pca']") 69 | return X_topic 70 | 71 | 72 | def calculate_affinity_distance(X1, X2, distance: str = "euclidean"): 73 | """ 74 | Calculate affinity matrix between matrix X1 and X2 75 | """ 76 | if distance == 'euclidean': 77 | D = - euclidean_distances(X1, X2) # dense affinity matrix 78 | elif distance == 'cosine': 79 | D = - cosine_distances(X1, X2) # dense affinity matrix 80 | else: 81 | D = None 82 | print("Distance function can only be selected from `euclidean` or `cosine`") 83 | return D 84 | 85 | 86 | def compute_affinity(papers, reviewers, 87 | weighting='tfidf', 88 | projection='svd', 89 | min_df=3, max_df=0.8, 90 | distance='euclidean', 91 | lowercase=True, norm='l2', 92 | token_pattern=r'\w{1,}', 93 | ngram_range=(1, 1), 94 | n_components=30, 95 | stop_words='english'): 96 | """ 97 | Create affinity matrix (or distance matrix) 98 | from given list of papers' abstract and reviewers' abstract 99 | 100 | Parameters 101 | ---------- 102 | papers: list, list of string (incoming paper for the conference) 103 | reviewers: list, list of string from reviewers (e.g. 
86 | def compute_affinity(papers, reviewers,
87 |                      weighting='tfidf',
88 |                      projection='svd',
89 |                      min_df=3, max_df=0.8,
90 |                      distance='euclidean',
91 |                      lowercase=True, norm='l2',
92 |                      token_pattern=r'\w{1,}',
93 |                      ngram_range=(1, 1),
94 |                      n_components=30,
95 |                      stop_words='english'):
96 |     """
97 |     Create an affinity matrix (a negative distance matrix)
98 |     from given lists of paper abstracts and reviewer abstracts
99 |
100 |     Parameters
101 |     ----------
102 |     papers: list, list of strings (incoming papers for the conference)
103 |     reviewers: list, list of strings from reviewers (e.g. papers that they prefer)
104 |     weighting: str, weighting scheme for the count matrix;
105 |         this can be ('count', 'tfidf', 'entropy', 'bm25')
106 |     projection: str, either 'svd' or 'pca' for topic modeling
107 |     distance: str, either 'euclidean' or 'cosine' distance
108 |
109 |     Returns
110 |     -------
111 |     A: ndarray, affinity array from given papers and reviewers
112 |     """
113 |     n_papers = len(papers)
114 |
115 |     X_topic = compute_topics(
116 |         papers + reviewers,
117 |         weighting=weighting,
118 |         projection=projection,
119 |         min_df=min_df, max_df=max_df,
120 |         lowercase=lowercase, norm=norm,
121 |         token_pattern=token_pattern,
122 |         ngram_range=ngram_range,
123 |         n_components=n_components,
124 |         stop_words=stop_words
125 |     )
126 |
127 |     # compute affinity matrix
128 |     paper_vectors = X_topic[:n_papers, :]
129 |     reviewer_vectors = X_topic[n_papers:, :]
130 |     A = calculate_affinity_distance(paper_vectors, reviewer_vectors, distance=distance)
131 |     return A
132 |
133 |
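Another editorial sketch, this time of the LP helpers defined next: a hand-written 3x3 affinity matrix, one hypothetical conflict masked with a large negative value (the same trick the conference scripts in this repo use), solved with the `linprog` wrapper those scripts import from this package.

```python
import numpy as np
from paper_reviewer_matcher import create_lp_matrix, create_assignment, linprog

# toy affinities (negative distances): 3 papers x 3 reviewers
A = np.array([[-1.0, -4.0, -9.0],
              [-5.0, -2.0, -8.0],
              [-7.0, -6.0, -3.0]])
A[2, 0] = -1000.0  # hypothetical conflict of interest: reviewer 0 must not get paper 2

# exactly one reviewer per paper and one paper per reviewer
v, K, d = create_lp_matrix(A,
                           min_reviewers_per_paper=1, max_reviewers_per_paper=1,
                           min_papers_per_reviewer=1, max_papers_per_reviewer=1)
x_sol = linprog(v, K, d)['x']
b = create_assignment(x_sol, A)
print(b)  # binary assignment matrix; here the diagonal pairing maximizes total affinity
```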
134 | def create_lp_matrix(A, min_reviewers_per_paper=0, max_reviewers_per_paper=10,
135 |                      min_papers_per_reviewer=0, max_papers_per_reviewer=10):
136 |     """
137 |     The paper-reviewer matching problem is formulated as follows:
138 |     we want to maximize the total affinity subject to the constraints
139 |
140 |         maximize   A.T * b
141 |         subject to N_p * b <= c_p  (c_p = maximum number of reviewers per paper)
142 |                    N_r * b <= c_r  (c_r = maximum number of papers per reviewer)
143 |                    b <= 1
144 |                    b >= 0
145 |
146 |     This problem can be reformulated as
147 |         maximize   A.T * b
148 |         subject to K * b <= d
149 |     where K stacks the per-paper/per-reviewer maximum constraints (N_e),
150 |     the negated minimum constraints (N_p, N_r), and the box constraints on b:
151 |         K = [N_e; N_p; N_r; I; -I]
152 |         d = [c_p, c_r, -m_p, -m_r, 1, 0]
153 |     with m_p, m_r the minimum numbers of reviewers per paper and papers per reviewer
154 |
155 |     Reference
156 |     ---------
157 |     Taylor, Camillo J. "On the optimal assignment of conference papers to reviewers." (2008).
158 |     """
159 |     n_papers, n_reviewers = A.shape
160 |     n_edges = np.count_nonzero(A)
161 |
162 |     i, j = A.nonzero()
163 |     v = A[i, j]
164 |
165 |     N_e = sp.dok_matrix((n_papers + n_reviewers, n_edges), dtype=float)  # np.float was removed in NumPy 1.24
166 |     N_e[i, range(n_edges)] = 1
167 |     N_e[j + n_papers, range(n_edges)] = 1
168 |
169 |     N_p = sp.dok_matrix((n_papers, n_edges), dtype=int)
170 |     N_p[i, range(n_edges)] = -1
171 |
172 |     N_r = sp.dok_matrix((n_reviewers, n_edges), dtype=int)
173 |     N_r[j, range(n_edges)] = -1
174 |
175 |     K = sp.vstack([N_e, N_p, N_r, sp.identity(n_edges), -sp.identity(n_edges)])
176 |
177 |     d = [max_reviewers_per_paper] * n_papers + [max_papers_per_reviewer] * n_reviewers + \
178 |         [-min_reviewers_per_paper] * n_papers + [-min_papers_per_reviewer] * n_reviewers + \
179 |         [1] * n_edges + [0] * n_edges
180 |     d = np.atleast_2d(d).T  # column constraint vector
181 |
182 |     return v, K, d
183 |
184 |
185 | def create_assignment(x_sol, A):
186 |     """
187 |     Given a solution of the linear program for paper assignment
188 |     with affinity matrix A, produce the actual assignment matrix b
189 |     """
190 |     n_papers, n_reviewers = A.shape
191 |     i, j = A.nonzero()
192 |     t = np.array(x_sol > 0.5).flatten()
193 |     b = np.zeros((n_papers, n_reviewers))
194 |     b[i[t], j[t]] = 1
195 |     return b
--------------------------------------------------------------------------------
/data/reviewer.csv:
--------------------------------------------------------------------------------
1 | PersonID,Abstract
2 | 1,"Perceptual events derive their significance to an animal from their meaning about the world, that is from the information they carry about their causes. The brain should thus be able to efficiently infer the causes underlying our sensory events. Here we use multisensory cue combination to study causal inference in perception. We formulate an ideal-observer model that infers whether two sensory cues originate from the same location and that also estimates their location(s). This model accurately predicts the nonlinear integration of cues by human subjects in two auditory-visual localization tasks. The results show that indeed humans can efficiently infer the causal structure as well as the location of causes. By combining insights from the study of causal inference with the ideal-observer approach to sensory cue combination, we show that the capacity to infer causal structure is not limited to conscious, high-level cognition; it is also performed continually and effortlessly in perception."
3 | 2,"We use graphical models and structure learning to explore how people learn policies in sequential decision making tasks. Studies of sequential decision-making in humans frequently find suboptimal performance relative to an ideal actor that knows the graph model that generates reward in the environment. We argue that the learning problem humans face also involves learning the graph structure for reward generation in the environment. We formulate the structure learning problem using mixtures of reward models, and solve the optimal action selection problem using Bayesian Reinforcement Learning. We show that structure learning in one and two armed bandit problems produces many of the qualitative behaviors deemed suboptimal in previous studies. Our argument is supported by the results of experiments that demonstrate humans rapidly learn and exploit new reward structure."
4 | 3,"A large number of experiments have asked to what degree human reaching movements can be understood as being close to optimal in a statistical sense. However, little is known about whether these principles are relevant for other classes of movements. Here we analyzed movement in a task that is similar to surfing or snowboarding. Human subjects stand on a force plate that measures their center of pressure. This center of pressure affects the acceleration of a cursor that is displayed in a noisy fashion (as a cloud of dots) on a projection screen while the subject is incentivized to keep the cursor close to a fixed position. We find that salient aspects of observed behavior are well-described by optimal control models where a Bayesian estimation model (Kalman filter) is combined with an optimal controller (either a Linear-Quadratic-Regulator or Bang-bang controller). We find evidence that subjects integrate information over time taking into account uncertainty. However, behavior in this continuous steering task appears to be a highly non-linear function of the visual feedback. While the nervous system appears to implement Bayes-like mechanisms for a full-body, dynamic task, it may additionally take into account the specific costs and constraints of the task." 5 | 4,"Rhythmic brain activity, measured by magnetoencephalography (MEG), is modulated during stimulation and task performance. Here, we introduce an oscillatory response function (ORF) to predict the dynamic suppression rebound modulation of brain rhythms during a stimulus sequence. We derived a class of parametric models for the ORF in a generalized convolution framework. The model parameters were estimated from MEG data acquired from 10 subjects during bilateral tactile stimulation of fingers (stimulus rates of 4 Hz and 10 Hz in blocks of 0.5, 1, 2, and 4 s). The envelopes of the 17 to 23 Hz rhythmic activity, computed for sensors above the rolandic region, correlated 25% to 43% better with the envelopes predicted by the models than by the stimulus time course (boxcar). A linear model with separate convolution kernels for onset and offset responses gave the best prediction. We studied the generalizability of this model with data from 5 different subjects during a separate bilateral tactile sequence by first identifying neural sources of the 17 to 23 Hz activity using cortically constrained minimum norm estimates.Both the model and the boxcar predicted strongest modulation in the primary motor cortex. For short-duration stimulus blocks, the model predicted the envelope of the cortical currents 20% better than the boxcar did. These results suggest that ORFs could concisely describe brain rhythms during different stimuli, tasks, and pathologies" 6 | 5,"A molecular device that records time-varying signals would enable new approaches in neuroscience. We have recently proposed such a device, termed a molecular ticker tape, in which an engineered DNA polymerase (DNAP) writes time-varying signals into DNA in the form of nucleotide misincorporation patterns. Here, we define a theoretical framework quantifying the expected capabilities of molecular ticker tapes as a function of experimental parameters. We present a decoding algorithm for estimating time-dependent input signals, and DNAP kinetic parameters, directly from misincorporation rates as determined by sequencing. 
We explore the requirements for accurate signal decoding, particularly the constraints on (1) the polymerase biochemical parameters, and (2) the amplitude, temporal resolution, and duration of the time-varying input signals. Our results suggest that molecular recording devices with kinetic properties similar to natural polymerases could be used to perform experiments in which neural activity is compared across several experimental conditions, and that devices engineered by combining favorable biochemical properties from multiple known polymerases could potentially measure faster phenomena such as slow synchronization of neuronal oscillations. Sophisticated engineering of DNAPs is likely required to achieve molecular recording of neuronal activity with single-spike temporal resolution over experimentally relevant timescales" 7 | 6,"Cancer and healthy cells have distinct distributions of molecular properties and thus respond differently to drugs. Cancer drugs ideally kill cancer cells while limiting harm to healthy cells. However, the inherent variance among cells in both cancer and healthy cell populations increases the difficulty of selective drug action. Here we formalize a classification framework based on the idea that an ideal cancer drug should maximally discriminate between cancer and healthy cells. More specifically, this discrimination should be performed on the basis of measurable cell markers. We divide the problem into three parts which we explore with examples. First, molecular markers should discriminate cancer cells from healthy cells at the single-cell level. Second, the effects of drugs should be statistically predicted by these molecular markers. Third, drugs should be optimized for classification performance. We find that expression levels of a handful of genes suffice to discriminate well between individual cells in cancer and healthy tissue. We also find that gene expression predicts the efficacy of some cancer drugs, suggesting that these cancer drugs act as suboptimal classifiers using gene profiles. Finally, we formulate a framework that defines an optimal drug, and predicts drug cocktails that may target cancer more accurately than the individual drugs alone. Conceptualizing cancer drugs as solving a discrimination problem in the high-dimensional space of molecular markers promises to inform the design of new cancer drugs and drug cocktails." 8 | 7,"For rehabilitation and diagnoses, an understanding of patient activities and movements is important. Modern smartphones have built in accelerometers which promise to enable quantifying minute-by-minute what patients do (e.g. walk or sit). Such a capability could inform recommendations of physical activities and improve medical diagnostics. However, a major problem is that during everyday life, we carry our phone in different ways, e.g. on our belt, in our pocket, in our hand, or in a bag. The recorded accelerations are not only affected by our activities but also by the phone's location. Here we develop a method to solve this kind of problem, based on the intuition that activities change rarely, and phone locations change even less often. A hidden Markov model (HMM) tracks changes across both activities and locations, enabled by a static support vector machine (SVM) classifier that probabilistically identifies activity–location pairs. We find that this approach improves tracking accuracy on healthy subjects as compared to a static classifier alone. The obtained method can be readily applied to patient populations. 
Our research enables the use of phones as activity tracking devices, without the need of previous approaches to instruct subjects to always carry the phone in the same location." -------------------------------------------------------------------------------- /data/article.csv: -------------------------------------------------------------------------------- 1 | PaperID,Title,Abstract,PersonIDList 2 | 1,"Causal Inference in Multisensory Perception","Perceptual events derive their significance to an animal from their meaning about the world, that is from the information they carry about their causes. The brain should thus be able to efficiently infer the causes underlying our sensory events. Here we use multisensory cue combination to study causal inference in perception. We formulate an ideal-observer model that infers whether two sensory cues originate from the same location and that also estimates their location(s). This model accurately predicts the nonlinear integration of cues by human subjects in two auditory-visual localization tasks. The results show that indeed humans can efficiently infer the causal structure as well as the location of causes. By combining insights from the study of causal inference with the ideal-observer approach to sensory cue combination, we show that the capacity to infer causal structure is not limited to conscious, high-level cognition; it is also performed continually and effortlessly in perception.",1 3 | 2,Bayesian Integration and Non-Linear Feedback Control in a Full-Body Motor Task,"A large number of experiments have asked to what degree human reaching movements can be understood as being close to optimal in a statistical sense. However, little is known about whether these principles are relevant for other classes of movements. Here we analyzed movement in a task that is similar to surfing or snowboarding. Human subjects stand on a force plate that measures their center of pressure. This center of pressure affects the acceleration of a cursor that is displayed in a noisy fashion (as a cloud of dots) on a projection screen while the subject is incentivized to keep the cursor close to a fixed position. We find that salient aspects of observed behavior are well-described by optimal control models where a Bayesian estimation model (Kalman filter) is combined with an optimal controller (either a Linear-Quadratic-Regulator or Bang-bang controller). We find evidence that subjects integrate information over time taking into account uncertainty. However, behavior in this continuous steering task appears to be a highly non-linear function of the visual feedback. While the nervous system appears to implement Bayes-like mechanisms for a full-body, dynamic task, it may additionally take into account the specific costs and constraints of the task.",3;1 4 | 3,Conceptualizing Cancer Drugs as Classifiers,"Cancer and healthy cells have distinct distributions of molecular properties and thus respond differently to drugs. Cancer drugs ideally kill cancer cells while limiting harm to healthy cells. However, the inherent variance among cells in both cancer and healthy cell populations increases the difficulty of selective drug action. Here we formalize a classification framework based on the idea that an ideal cancer drug should maximally discriminate between cancer and healthy cells. More specifically, this discrimination should be performed on the basis of measurable cell markers. We divide the problem into three parts which we explore with examples. 
First, molecular markers should discriminate cancer cells from healthy cells at the single-cell level. Second, the effects of drugs should be statistically predicted by these molecular markers. Third, drugs should be optimized for classification performance. We find that expression levels of a handful of genes suffice to discriminate well between individual cells in cancer and healthy tissue. We also find that gene expression predicts the efficacy of some cancer drugs, suggesting that these cancer drugs act as suboptimal classifiers using gene profiles. Finally, we formulate a framework that defines an optimal drug, and predicts drug cocktails that may target cancer more accurately than the individual drugs alone. Conceptualizing cancer drugs as solving a discrimination problem in the high-dimensional space of molecular markers promises to inform the design of new cancer drugs and drug cocktails.",6;1 5 | 4,Structure Learning in Human Sequential Decision-Making,"We use graphical models and structure learning to explore how people learn policies in sequential decision making tasks. Studies of sequential decision-making in humans frequently find suboptimal performance relative to an ideal actor that knows the graph model that generates reward in the environment. We argue that the learning problem humans face also involves learning the graph structure for reward generation in the environment. We formulate the structure learning problem using mixtures of reward models, and solve the optimal action selection problem using Bayesian Reinforcement Learning. We show that structure learning in one and two armed bandit problems produces many of the qualitative behaviors deemed suboptimal in previous studies. Our argument is supported by the results of experiments that demonstrate humans rapidly learn and exploit new reward structure.",2 6 | 5,"Hand, belt, pocket or bag: Practical activity tracking with mobile phones","For rehabilitation and diagnoses, an understanding of patient activities and movements is important. Modern smartphones have built in accelerometers which promise to enable quantifying minute-by-minute what patients do (e.g. walk or sit). Such a capability could inform recommendations of physical activities and improve medical diagnostics. However, a major problem is that during everyday life, we carry our phone in different ways, e.g. on our belt, in our pocket, in our hand, or in a bag. The recorded accelerations are not only affected by our activities but also by the phone's location. Here we develop a method to solve this kind of problem, based on the intuition that activities change rarely, and phone locations change even less often. A hidden Markov model (HMM) tracks changes across both activities and locations, enabled by a static support vector machine (SVM) classifier that probabilistically identifies activity location pairs. We find that this approach improves tracking accuracy on healthy subjects as compared to a static classifier alone. The obtained method can be readily applied to patient populations. Our research enables the use of phones as activity tracking devices, without the need of previous approaches to instruct subjects to always carry the phone in the same location.",7 7 | 6,Oscillatory Response Function: Towards a Parametric Model of Rhythmic Brain Activity,"Rhythmic brain activity, measured by magnetoencephalography (MEG), is modulated during stimulation and task performance. 
Here, we introduce an oscillatory response function (ORF) to predict the dynamic suppression rebound modulation of brain rhythms during a stimulus sequence. We derived a class of parametric models for the ORF in a generalized convolution framework. The model parameters were estimated from MEG data acquired from 10 subjects during bilateral tactile stimulation of fingers (stimulus rates of 4 Hz and 10 Hz in blocks of 0.5, 1, 2, and 4 s). The envelopes of the 17 to 23 Hz rhythmic activity, computed for sensors above the rolandic region, correlated 25% to 43% better with the envelopes predicted by the models than by the stimulus time course (boxcar). A linear model with separate convolution kernels for onset and offset responses gave the best prediction. We studied the generalizability of this model with data from 5 different subjects during a separate bilateral tactile sequence by first identifying neural sources of the 17 to 23 Hz activity using cortically constrained minimum norm estimates.Both the model and the boxcar predicted strongest modulation in the primary motor cortex. For short-duration stimulus blocks, the model predicted the envelope of the cortical currents 20% better than the boxcar did. These results suggest that ORFs could concisely describe brain rhythms during different stimuli, tasks, and pathologies",4 8 | 7,Statistical Analysis of Molecular Signal Recording,"A molecular device that records time-varying signals would enable new approaches in neuroscience. We have recently proposed such a device, termed a molecular ticker tape, in which an engineered DNA polymerase (DNAP) writes time-varying signals into DNA in the form of nucleotide misincorporation patterns. Here, we define a theoretical framework quantifying the expected capabilities of molecular ticker tapes as a function of experimental parameters. We present a decoding algorithm for estimating time-dependent input signals, and DNAP kinetic parameters, directly from misincorporation rates as determined by sequencing. We explore the requirements for accurate signal decoding, particularly the constraints on (1) the polymerase biochemical parameters, and (2) the amplitude, temporal resolution, and duration of the time-varying input signals. Our results suggest that molecular recording devices with kinetic properties similar to natural polymerases could be used to perform experiments in which neural activity is compared across several experimental conditions, and that devices engineered by combining favorable biochemical properties from multiple known polymerases could potentially measure faster phenomena such as slow synchronization of neuronal oscillations. Sophisticated engineering of DNAPs is likely required to achieve molecular recording of neuronal activity with single-spike temporal resolution over experimentally relevant timescales",5 -------------------------------------------------------------------------------- /ccn/ccn_mind_matching_2018.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code snippet for producing CCN Mind Matching session 2018. 
3 | We create an affinity matrix between people using topic modeling,
4 | then solve a linear programming problem and apply networkx to solve the scheduling problem
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from paper_reviewer_matcher import (
10 |     preprocess, compute_affinity,
11 |     create_lp_matrix, linprog,
12 |     create_assignment
13 | )
14 | import random
15 | import networkx as nx
16 |
17 |
18 | def build_line_graph(people):
19 |     """
20 |     Edge coloring and Vizing's theorem solution,
21 |     adapted from the Stack Overflow question below
22 |
23 |     ref: https://stackoverflow.com/questions/51758406/creating-time-schedule-from-list-of-people-and-who-they-have-to-meet
24 |     """
25 |     G = nx.Graph()
26 |     G.add_edges_from(((p, q) for p, L in people for q in L))
27 |     return nx.line_graph(G)
28 |
29 |
30 | def color_graph(G):
31 |     return nx.greedy_color(G)
32 |
33 |
34 | def format_answer(coloring):
35 |     res = {}
36 |     N = max(coloring.values()) + 1
37 |     for meeting in coloring:
38 |         time_slot = coloring[meeting]
39 |         for meeting_member in (0, 1):
40 |             if meeting[meeting_member] not in res:
41 |                 res[meeting[meeting_member]] = [None] * N
42 |             res[meeting[meeting_member]][time_slot] = meeting[1-meeting_member]
43 |     return res
44 |
45 |
46 | def nest_answer(people, formatted):
47 |     return [[p, formatted[p]] for p, v in people]
48 |
49 |
50 | def schedule_to_timeslot(schedule, n_timeslot=15):
51 |     """
52 |     Create a personal schedule from a list of schedules
53 |     """
54 |     schedule_df = pd.DataFrame(schedule, columns=['person', 'person_to_meet'])
55 |     person_to_meet_df = pd.DataFrame(schedule_df.person_to_meet.values.tolist(),
56 |                                      columns=range(1, n_timeslot))
57 |     # schedule to dataframe
58 |     schedule_df = pd.concat((schedule_df[['person']], person_to_meet_df), axis=1)
59 |
60 |     # create person list and map to row/ column
61 |     person_list = pd.unique(list(schedule_df['person']))
62 |     P_map = {v: k for k, v in enumerate(person_list)}
63 |
64 |
65 |     timeslot_list = []
66 |     for i in range(1, n_timeslot):
67 |         timeslot_df = schedule_df[['person', i]].dropna().astype(int).reset_index(drop=True)
68 |         P = np.zeros((len(person_list), len(person_list)), dtype=int)
69 |
70 |         # adding table number
71 |         count = 1
72 |         for _, r in schedule_df.iterrows():
73 |             if not pd.isnull(r['person']) and not pd.isnull(r[i]) and P[P_map[r['person']], P_map[r[i]]] == 0 and P[P_map[r[i]], P_map[r['person']]] == 0:
74 |                 P[P_map[r['person']], P_map[r[i]]] = count
75 |                 P[P_map[r[i]], P_map[r['person']]] = count
76 |                 count += 1
77 |
78 |         # fill in pairs of people (add random pairs of people)
79 |         left_person = list(set(person_list) - set(pd.unique(list(timeslot_df.person) + list(timeslot_df[i].dropna().astype(int)))))
80 |         random.shuffle(left_person)
81 |
82 |         random_pair = list(zip(left_person[0:int(len(left_person)/2)], left_person[int(len(left_person)/2)::]))
83 |         for p1, p2 in random_pair:
84 |             count += 1
85 |             P[P_map[p1], P_map[p2]] = count
86 |             P[P_map[p2], P_map[p1]] = count
87 |
88 |         additional_pair = \
89 |             [[p1, p2, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair] + \
90 |             [[p2, p1, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair]
91 |         left_person_df = pd.DataFrame(additional_pair, columns=['person', i, 'table_number'])
92 |
93 |         # concatenate
94 |         table_number = [int(P[P_map[r['person']], P_map[r[i]]]) for _, r in timeslot_df.iterrows()]
95 |         timeslot_df['table_number'] = table_number
96 |         timeslot_df = pd.concat((timeslot_df, left_person_df))
97 |         timeslot_list.append(timeslot_df)
98 |
99 |     # for all persons, make schedules
100 |     person_schedule_all = []
101 |     for p in person_list:
102 |         person_schedule = []
103 |         for t_df in timeslot_list:
104 |             person_schedule.append(t_df[t_df.person == p])
105 |         person_schedule_all.append(pd.concat(person_schedule))
106 |
107 |     return person_schedule_all  # list of dataframes, each containing one person's schedule
108 |
109 |
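Editorial toy run of the graph helpers above: meetings are edges of a person graph, and greedy coloring of its line graph assigns each meeting a time slot so nobody is double-booked. The person IDs below are made up.

```python
# each entry is [person, [people they have to meet]]
people = [[1, [2, 3]],
          [2, [3]],
          [3, []]]
coloring = color_graph(build_line_graph(people))  # meeting -> time slot
schedule = nest_answer(people, format_answer(coloring))
# one row per person; slot k holds that person's partner in time slot k
# (None means a free slot), so nobody appears twice in the same slot
print(schedule)
```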
110 | def create_dating_schedule(person_df, n_meeting=10):
111 |     """
112 |     Function to create the speed-dating schedule at the CCN 2018 conference
113 |
114 |     Parameters
115 |     ==========
116 |     person_df: pandas dataframe contains - PersonID, FullName, Abstract
117 |     n_meeting: int, number of meetings we would like to have
118 |
119 |     Output
120 |     ======
121 |     schedule: list, list of person id and person ids to meet in the
122 |         following format: [PersonID, [PersonID to meet]]
123 |     """
124 |     # linear programming
125 |     persons_1 = list(map(preprocess, list(person_df['Abstract'])))
126 |     persons_2 = list(map(preprocess, list(person_df['Abstract'])))  # same people on both sides: person-to-person affinity
127 |
128 |     A = compute_affinity(
129 |         persons_1, persons_2,
130 |         n_components=10, min_df=1, max_df=0.8,
131 |         weighting='tfidf', projection='pca'
132 |     )
133 |     # conflict of interest constraint: block self-matching on the diagonal
134 |     A[np.arange(len(A)), np.arange(len(A))] = -1000
135 |
136 |     # for dating at CCN
137 |     v, K, d = create_lp_matrix(
138 |         A,
139 |         min_reviewers_per_paper=n_meeting, max_reviewers_per_paper=n_meeting,
140 |         min_papers_per_reviewer=n_meeting, max_papers_per_reviewer=n_meeting
141 |     )
142 |     x_sol = linprog(v, K, d)['x']
143 |     b = create_assignment(x_sol, A)
144 |
145 |     output = []
146 |     for i in range(len(b)):
147 |         r = [list(person_df['PersonID'])[b_] for b_ in np.nonzero(b[i])[0]]
148 |         output.append([list(person_df.PersonID)[i], r])
149 |
150 |     # make optimal schedule
151 |     schedule = nest_answer(output, format_answer(color_graph(build_line_graph(output))))
152 |
153 |     return schedule
154 |
155 |
156 | def partition_cluster(D):
157 |     """
158 |     Given a distance matrix, perform hierarchical clustering to partition it
159 |     """
160 |     import fastcluster
161 |     import scipy.cluster.hierarchy as hierarchy
162 |     linkage = fastcluster.linkage(D,
163 |                                   method='centroid',
164 |                                   preserve_input=True)
165 |     partition = hierarchy.fcluster(linkage,
166 |                                    t=0.5,
167 |                                    criterion='distance')  # distance
168 |     return partition
169 |
170 |
171 | def convert_names_to_ids(names, person_id_map, threshold=85):
172 |     """
173 |     Convert a comma-separated string of names to a list of IDs using fuzzy string matching
174 |
175 |     Parameters
176 |     ==========
177 |     names: str, string in the following format 'FirstName1 LastName1, ...'
178 | person_id_map: dict, dictionary mapping id to name 179 | 180 | Example 181 | ======= 182 | >> convert_names_to_ids('Jone Doe, Sarah Doe', 183 | {1: 'Jone Doe', 2: 'Sarah Deo'}, threshold=85) # output [1, 2] 184 | """ 185 | from fuzzywuzzy import fuzz 186 | 187 | matched_ids = [] 188 | names = [name.strip() for name in names.split(',')] 189 | for name in names: 190 | matched_ids.extend([idx for (idx, n) in person_id_map.items() if fuzz.ratio(n, name) >= threshold]) 191 | return pd.unique(matched_ids) 192 | 193 | 194 | if __name__ == '__main__': 195 | """ 196 | Example script to create dating schedule for CCN 2018 conference 197 | """ 198 | person_df = pd.ExcelFile('CCN18_MindMatchData.xlsx').parse('Grid Results') 199 | person_df['FullName'] = person_df['NameFirst'] + ' ' + person_df['NameLast'] 200 | person_df['PersonID'] = np.arange(len(person_df)) 201 | person_id_map = {r['PersonID']: r['FullName'] for _, r in person_df.iterrows()} 202 | person_affil_map = {r['PersonID']: r['Affiliation'] for _, r in person_df.iterrows()} 203 | 204 | schedule = create_dating_schedule(person_df) 205 | n_timeslot = len(schedule[0][-1]) + 1 206 | person_schedule_all = schedule_to_timeslot(schedule, n_timeslot=n_timeslot) 207 | 208 | # print out 209 | n_meeting = 6 210 | output_text = [] 211 | for person_schedule_df in person_schedule_all: 212 | output_text.extend(['You are: ', str(person_id_map[person_schedule_df.person.unique()[0]])]) 213 | output_text.extend(['--------------------']) 214 | output_text.extend(['Dating schedule']) 215 | output_text.extend(['--------------------']) 216 | r = 0 217 | for i in range(1, n_meeting + 1): 218 | person_to_meet = [l for l in list(person_schedule_df[i]) if not pd.isnull(l)] 219 | if len(person_to_meet) > 0: 220 | table_number = person_schedule_df['table_number'].iloc[r] 221 | output_text.extend(['timeslot: %d, table number: %d, date: %s' % 222 | (i, table_number, person_id_map[person_to_meet[0]])]) 223 | r += 1 224 | else: 225 | output_text.extend(['timeslot: %d, Waiting area!' 
% i])
226 |         output_text.extend([''])
227 |
228 |     # save to text file
229 |     with open('output_date_schedule.txt', 'w') as f:
230 |         for l in output_text:
231 |             f.write("{}\n".format(l))
--------------------------------------------------------------------------------
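Editorial sanity sketch of the log-entropy weighting defined in the module that follows, reusing the toy documents from its own docstring (output values are printed, not asserted):

```python
from paper_reviewer_matcher import LogEntropyVectorizer

docs = ['this this this book',
        'this cat good',
        'cat good shit']
model = LogEntropyVectorizer(norm=None, ngram_range=(1, 1))
X = model.fit_transform(docs)
# 'this' appears in two of the three documents, so its global entropy
# weight g_i is small and its scores are damped; terms unique to one
# document keep g_i = 1 and score higher relative to their raw counts
print(sorted(model.vocabulary_))
print(X.toarray().round(3))
```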
/paper_reviewer_matcher/vectorizer.py:
--------------------------------------------------------------------------------
1 | # weighting function
2 | # from https://github.com/titipata/science_concierge/blob/master/science_concierge/vectorizer.py
3 |
4 | import numpy as np
5 | import scipy.sparse as sp
6 | from sklearn.preprocessing import normalize
7 | from sklearn.feature_extraction.text import CountVectorizer
8 | from sklearn.utils.validation import check_is_fitted
9 |
10 | class LogEntropyVectorizer(CountVectorizer):
11 |     """Log-entropy vectorizer
12 |     Convert a collection of raw documents to a matrix of log-entropy features.
13 |     Extends scikit-learn's CountVectorizer to
14 |     calculate the log-entropy term matrix
15 |     Log-entropy
16 |     -----------
17 |     The weight of term i in document j can be calculated as follows
18 |     Global entropy
19 |         p_ij = f_ij / sum_j(f_ij)
20 |         g_i = 1 + sum_j (p_ij * log p_ij / log n)
21 |     log-entropy of term i in document j is
22 |         l_ij = log(1 + f_ij) * g_i
23 |     where
24 |         f_ij is the number of times term i appears in document j
25 |         sum_j(f_ij) is the total number of times term i occurs in
26 |             the whole set of documents
27 |         n is the total number of documents
28 |         g_i is the entropy weight of term i across all documents j
29 |     Parameters
30 |     ----------
31 |     encoding : string, 'utf-8' by default.
32 |         If bytes or files are given to analyze, this encoding is used to
33 |         decode.
34 |     decode_error : {'strict', 'ignore', 'replace'}
35 |         Instruction on what to do if a byte sequence is given to analyze that
36 |         contains characters not of the given `encoding`. By default, it is
37 |         'strict', meaning that a UnicodeDecodeError will be raised. Other
38 |         values are 'ignore' and 'replace'.
39 |     ngram_range : tuple (min_n, max_n)
40 |         The lower and upper boundary of the range of n-values for different
41 |         n-grams to be extracted. All values of n such that min_n <= n <= max_n
42 |         will be used.
43 |     stop_words : string {'english'}, list, or None (default)
44 |     lowercase : boolean, default True
45 |         Convert all characters to lowercase before tokenizing.
46 |     token_pattern : string
47 |         Regular expression denoting what constitutes a "token", only used
48 |         if ``analyzer == 'word'``. The default regexp selects tokens of 2
49 |         or more alphanumeric characters (punctuation is completely ignored
50 |         and always treated as a token separator).
51 |     max_df : float in range [0, 1] or int, default=1.0
52 |     min_df : float in range [0, 1] or int, default=1
53 |     norm : 'l1', 'l2' or None, optional
54 |         Norm used to normalize term vectors. None for no normalization.
55 |     smooth_idf : boolean, default=False
56 |     See also
57 |     --------
58 |     CountVectorizer
59 |         Tokenize the documents, count the occurrences of tokens and return
60 |         them as a sparse matrix
61 |     TfidfTransformer
62 |         Apply Term Frequency Inverse Document Frequency normalization to a
63 |         sparse matrix of occurrence counts.
64 |     Example
65 |     -------
66 |     >> model = LogEntropyVectorizer(norm=None, ngram_range=(1,1))
67 |     >> docs = ['this this this book',
68 |                'this cat good',
69 |                'cat good shit']
70 |     >> X = model.fit_transform(docs)
71 |     References
72 |     ----------
73 |     - https://en.wikipedia.org/wiki/Latent_semantic_indexing
74 |     - http://webpages.ursinus.edu/akontostathis/KontostathisHICSSFinal.pdf
75 |     """
76 |     def __init__(self, encoding='utf-8', decode_error='strict',
77 |                  lowercase=True, preprocessor=None, tokenizer=None,
78 |                  analyzer='word', stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
79 |                  vocabulary=None, binary=False,
80 |                  ngram_range=(1, 1), max_df=1.0, min_df=1,
81 |                  max_features=None, norm='l2', smooth_idf=False):
82 |
83 |
84 |         super(LogEntropyVectorizer, self).__init__(
85 |             encoding=encoding,
86 |             decode_error=decode_error,
87 |             lowercase=lowercase,
88 |             preprocessor=preprocessor,
89 |             tokenizer=tokenizer,
90 |             analyzer=analyzer,
91 |             stop_words=stop_words,
92 |             token_pattern=token_pattern,
93 |             ngram_range=ngram_range,
94 |             max_df=max_df,
95 |             min_df=min_df,
96 |             max_features=max_features,
97 |             vocabulary=vocabulary,
98 |             binary=binary,
99 |         )
100 |
101 |         self.norm = norm
102 |         self.smooth_idf = smooth_idf
103 |
104 |
105 |     def fit(self, raw_documents, y=None):
106 |         """Learn vocabulary and log-entropy from the training set.
107 |         Parameters
108 |         ----------
109 |         raw_documents : iterable
110 |             an iterable which yields either str, unicode or file objects
111 |         Returns
112 |         -------
113 |         self : LogEntropyVectorizer
114 |         """
115 |         X = super(LogEntropyVectorizer, self).fit_transform(raw_documents)
116 |
117 |         n_samples, n_features = X.shape
118 |         gf = np.ravel(X.sum(axis=0))  # total count of each word across documents
119 |
120 |         if self.smooth_idf:
121 |             n_samples += int(self.smooth_idf)
122 |             gf += int(self.smooth_idf)
123 |
124 |         P = (X * sp.spdiags(1./gf, diags=0, m=n_features, n=n_features))  # probability of word occurrence
125 |         p = P.data
126 |         P.data = (p * np.log2(p) / np.log2(n_samples))
127 |         g = 1 + np.ravel(P.sum(axis=0))
128 |         f = np.log2(1 + X.data)
129 |         X.data = f
130 |         # global weights
131 |         self._G = sp.spdiags(g, diags=0, m=n_features, n=n_features)
132 |         return self
133 |
134 |
135 |     def fit_transform(self, raw_documents, y=None):
136 |         self.fit(raw_documents)
137 |         return self.transform(raw_documents)
138 |
139 |
140 |     def transform(self, raw_documents):
141 |         X = super(LogEntropyVectorizer, self).transform(raw_documents)
142 |         check_is_fitted(self, '_G', msg='global weight vector is not fitted')  # msg is keyword-only in recent scikit-learn
143 |         L = X * self._G  # sparse entropy matrix
144 |
145 |         if self.norm is not None:
146 |             L = normalize(L, norm=self.norm, copy=False)
147 |         return L
148 |
149 |
150 | class BM25Vectorizer(CountVectorizer):
151 |     """
152 |     Implementation of Okapi BM25
153 |     Parameters
154 |     ----------
155 |     encoding : string, 'utf-8' by default.
156 |         If bytes or files are given to analyze, this encoding is used to
157 |         decode.
158 |     decode_error : {'strict', 'ignore', 'replace'}
159 |         Instruction on what to do if a byte sequence is given to analyze that
160 |         contains characters not of the given `encoding`. By default, it is
161 |         'strict', meaning that a UnicodeDecodeError will be raised. Other
162 |         values are 'ignore' and 'replace'.
163 |     ngram_range : tuple (min_n, max_n)
164 |         The lower and upper boundary of the range of n-values for different
165 |         n-grams to be extracted. All values of n such that min_n <= n <= max_n
166 |         will be used.
167 | stop_words : string {'english'}, list, or None (default) 168 | lowercase : boolean, default True 169 | Convert all characters to lowercase before tokenizing. 170 | token_pattern : string 171 | Regular expression denoting what constitutes a "token", only used 172 | if ``analyzer == 'word'``. The default regexp selects tokens of 2 173 | or more alphanumeric characters (punctuation is completely ignored 174 | and always treated as a token separator). 175 | max_df : float in range [0, 1] or int, default=1.0 176 | min_df : float in range [0, 1] or int, default=1 177 | b : float, default 0.75 178 | parameter for Okapi BM25 179 | k1 : float, suggested value from [1.2, 2.0] 180 | parameter for Okapi BM25 181 | References 182 | ---------- 183 | - Okapi BM25 https://en.wikipedia.org/wiki/Okapi_BM25 184 | - Introduction to Information Retrieval http://nlp.stanford.edu/IR-book/essir2011/pdf/11prob.pdf 185 | """ 186 | def __init__(self, encoding='utf-8', decode_error='strict', 187 | lowercase=True, preprocessor=None, tokenizer=None, 188 | analyzer='word', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", 189 | vocabulary=None, binary=False, 190 | ngram_range=(1, 1), max_df=1.0, min_df=1, 191 | max_features=None, b=0.75, k1=1.5): 192 | 193 | super(BM25Vectorizer, self).__init__( 194 | encoding=encoding, 195 | decode_error=decode_error, 196 | lowercase=lowercase, 197 | preprocessor=preprocessor, 198 | tokenizer=tokenizer, 199 | analyzer=analyzer, 200 | stop_words=stop_words, 201 | token_pattern=token_pattern, 202 | ngram_range=ngram_range, 203 | max_df=max_df, 204 | min_df=min_df, 205 | max_features=max_features, 206 | vocabulary=vocabulary, 207 | binary=binary, 208 | ) 209 | 210 | self.b = b 211 | self.k1 = k1 212 | 213 | def fit_transform(self, raw_documents, y=None): 214 | 215 | X = super(BM25Vectorizer, self).fit_transform(raw_documents) 216 | X = X.tocoo() 217 | n_samples, n_features = X.shape 218 | doc_len = np.ravel(X.sum(axis=1)) 219 | avg_len = doc_len.mean() 220 | len_norm = 1.0 - self.b + (self.b * doc_len / avg_len) 221 | idf = np.log(float(n_samples) / (1 + np.bincount(X.col))) 222 | X.data = X.data * (self.k1 + 1.0) / (self.k1 * len_norm[X.row] + X.data) * idf[X.col] 223 | return X.tocsr() 224 | -------------------------------------------------------------------------------- /nma/pod_grouping_2020.py: -------------------------------------------------------------------------------- 1 | import re 2 | from math import atan2 3 | import numpy as np 4 | import pandas as pd 5 | import paper_reviewer_matcher as pp 6 | from paper_reviewer_matcher import ( 7 | preprocess, compute_affinity, 8 | create_lp_matrix, create_assignment 9 | ) 10 | from scipy.cluster.hierarchy import linkage 11 | from sklearn.preprocessing import MinMaxScaler 12 | 13 | from itertools import product 14 | from tqdm import tqdm, tqdm_notebook 15 | 16 | from sklearn.manifold import MDS 17 | from copkmeans.cop_kmeans import cop_kmeans 18 | 19 | selected_cols = [ 20 | 'index', 'gender', 'institution', 'home_country', 21 | 'institute_city', 'residence_country', 22 | 'timezone', 'second_timezone', 'third_timezone', 23 | 'Statement' 24 | ] 25 | 26 | 27 | def remove_text_parentheses(text): 28 | """ 29 | Remove text inside parentheses 30 | """ 31 | return re.sub(r"[\(\[].*?[\)\]]", "", text).strip() 32 | 33 | 34 | def compute_tz_distance(node_1, node_2): 35 | """ 36 | Compute timezone distance 37 | 38 | TODO: tweak distance between timezone 39 | """ 40 | if node_1[0] == node_2[0] and node_1[1] == node_2[1]: 41 | return 0 42 | if 
node_1[0] == node_2[0] and node_1[1] != node_2[1]:
43 |         return 5
44 |     else:
45 |         return 20
46 |
47 |
48 | def compute_tz_distance_dict(d1, d2):
49 |     """
50 |     Compute timezone distance
51 |     """
52 |     idx1 = d1['idx']
53 |     idx2 = d2['idx']
54 |     if d1['timezone'] == d2['timezone'] and d1['second_timezone'] == d2['second_timezone']:
55 |         return (idx1, idx2, 0.0)
56 |     elif d1['timezone'] == d2['timezone'] and d1['second_timezone'] != d2['second_timezone']:
57 |         return (idx1, idx2, 0.3)
58 |     elif d1['timezone'] == d2['timezone'] or d1['second_timezone'] == d2['second_timezone']\
59 |         or d1['second_timezone'] == d2['timezone'] or d1['timezone'] == d2['second_timezone']:
60 |         return (idx1, idx2, 0.3)
61 |     else:
62 |         return (idx1, idx2, 1.0)
63 |
64 |
65 | def calculate_timezone_distance(preferred_tz):
66 |     """
67 |     Send an array through the distance function
68 |     and calculate the distance matrix as an output
69 |     """
70 |     D_preferred_tz = []
71 |     for tz1 in preferred_tz:
72 |         D_preferred_tz.append([compute_tz_distance(tz1, tz2) for tz2 in preferred_tz])
73 |     D_preferred_tz = np.array(D_preferred_tz)
74 |     return D_preferred_tz
75 |
76 |
77 | def generate_pod_numbers(n_students=2157, n_per_group=18):
78 |     """
79 |     Generate pod numbers in sequence
80 |     """
81 |     groups = []
82 |     for i in range(1, int(n_students / n_per_group) + 2):
83 |         groups.extend([i] * n_per_group)
84 |     groups = groups[:n_students]
85 |     return groups
86 |
87 |
88 | def calculate_geo_distance(d1, d2, R=6373.0):
89 |     """
90 |     Calculate the great-circle (haversine) distance in kilometers between two geolocations
91 |     """
92 |     lat1, lng1 = d1['lat'], d1['lng']
93 |     lat2, lng2 = d2['lat'], d2['lng']
94 |     try:
95 |         d_lng = np.radians(lng1) - np.radians(lng2)  # degrees -> radians; haversine assumes radians
96 |         d_lat = np.radians(lat1) - np.radians(lat2)
97 |         a = np.sin(d_lat / 2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(d_lng / 2)**2
98 |         c = 2 * atan2(np.sqrt(a), np.sqrt(1 - a))
99 |         distance = R * c
100 |         return (d1['idx'], d2['idx'], distance)
101 |     except Exception:  # malformed or missing coordinates yield NaN
102 |         return (d1['idx'], d2['idx'], np.nan)
103 |
104 |
105 | def calculate_geo_distance_matrix(df):
106 |     """
107 |     Calculate geo distance matrix from a given dataframe
108 |     """
109 |     n_users = len(df)
110 |     lat_lng_df = df[['idx', 'index', 'institute_longitude', 'institute_latitude']].rename(
111 |         columns={'institute_longitude': 'lng', 'institute_latitude': 'lat'}
112 |     )
113 |     lat_lng_list = lat_lng_df.to_dict(orient='records')
114 |     distance_df = pd.DataFrame(list(product(lat_lng_list, lat_lng_list)), columns=['loc1', 'loc2']).apply(
115 |         lambda r: calculate_geo_distance(r['loc1'], r['loc2']), axis=1
116 |     )
117 |     d_fill = np.nanmean([d for _, _, d in distance_df.values])
118 |     D_lat_lng = np.zeros((n_users, n_users))
119 |     for idx1, idx2, d in distance_df.values:
120 |         if not pd.isnull(d):
121 |             D_lat_lng[idx1, idx2] = d
122 |         else:
123 |             D_lat_lng[idx1, idx2] = d_fill
124 |     return D_lat_lng
125 |
126 |
127 | def calculate_language_distance_matrix(df):
128 |     """
129 |     Calculate language distance matrix from a given dataframe
130 |
131 |     The distance will be -0.5 if they have the same language preference
132 |     """
133 |     n_users = len(df)
134 |     language_list = df[['idx', 'language']].to_dict(orient='records')
135 |     D_language = np.zeros((n_users, n_users))
136 |     for d1, d2 in product(language_list, language_list):
137 |         if (d1['language'] or '') == (d2['language'] or '') and d1['idx'] != d2['idx']:
138 |             D_language[d1['idx'], d2['idx']] = -0.5
139 |     return D_language
140 |
141 |
142 | def calculate_timezone_distance_matrix(df):
143 |     """
144 |     Calculate timezone distance matrix from a given dataframe
145 |     """
146 |     n_users = len(df)
147 |     timezone_df = df[['idx', 'timezone', 'second_timezone']].copy()  # copy to avoid SettingWithCopyWarning
148 |     timezone_df.loc[:, 'timezone'] = timezone_df.timezone.map(
149 |         lambda t: remove_text_parentheses(t).split(' ')[-1]
150 |     )
151 |     timezone_df.loc[:, 'second_timezone'] = timezone_df.second_timezone.map(
152 |         lambda t: remove_text_parentheses(t).split(' ')[-1].replace('me', ' ')
153 |     )
154 |     timezone_list = timezone_df.to_dict(orient='records')
155 |     D_tz = np.zeros((n_users, n_users))
156 |     for d1, d2 in product(timezone_list, timezone_list):
157 |         idx1, idx2, tz_dist = compute_tz_distance_dict(d1, d2)
158 |         D_tz[idx1, idx2] = tz_dist
159 |     return D_tz
160 |
161 |
162 | def check_if_overlap(r1, r2,
163 |                      cols_tz=['timezone', 'second_timezone', 'third_timezone']):
164 |     """
165 |     Check if slots have overlap:
166 |     if some of the slots overlap, return True,
167 |     else return False
168 |     """
169 |     r1_ = [e for e in r1[cols_tz].fillna('').values
170 |            if 'Slot' in e]
171 |     r2_ = [e for e in r2[cols_tz].fillna('').values
172 |            if 'Slot' in e]
173 |     return any([tz1 == tz2 for tz1, tz2 in product(r1_, r2_)])
174 |
175 |
176 | def check_if_timezone_overlap(d1, d2):
177 |     """
178 |     Check if two dictionaries have an overlap in timezone;
179 |     if not, return the pair of indices of the two dictionaries
180 |     """
181 |     tz_avail_1 = set([v for k, v in d1.items()
182 |                       if (k != 'idx' and not pd.isnull(v) and v != '')])
183 |     tz_avail_2 = set([v for k, v in d2.items()
184 |                       if (k != 'idx' and not pd.isnull(v) and v != '')])
185 |     if len(tz_avail_1.intersection(tz_avail_2)) == 0:
186 |         return (d1['idx'], d2['idx'])
187 |     else:
188 |         return None
189 |
190 |
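The overlap checks above feed the cannot-link lists built below, which the main block passes to COP-k-means. An editorial sketch of that constrained-clustering call; the points and `k=2` are made up:

```python
import numpy as np
from copkmeans.cop_kmeans import cop_kmeans

X = np.array([[0.0, 0.0],
              [0.1, 0.0],
              [5.0, 5.0],
              [5.1, 5.0]])

# points 0 and 1 must not share a pod, even though they are close together
cannot_link = [(0, 1)]
clusters, centers = cop_kmeans(dataset=X, k=2, cl=cannot_link)
print(clusters)  # e.g. [0, 1, 1, 1]; indices 0 and 1 land in different clusters
```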
191 | def generate_cannot_link_list(df, cols_tz=['timezone', 'second_timezone', 'third_timezone']):
192 |     """
193 |     Return a list of cannot-link tuples between indices, e.g.
194 |     [(1, 10), (10, 1), ...]
195 |     """
196 |     cols = ['index', 'timezone', 'second_timezone', 'third_timezone']
197 |     cannot_link = []
198 |     for i, r1 in tqdm_notebook(df[cols].iterrows()):
199 |         for j, r2 in df[cols].iterrows():
200 |             if not check_if_overlap(r1, r2):
201 |                 cannot_link.append((i, j))
202 |     return cannot_link
203 |
204 |
205 | def generate_cannot_link_list_fast(df):
206 |     """
207 |     A more efficient way to generate the cannot-link list
208 |     """
209 |     cols_tz = ['idx', 'timezone', 'second_timezone', 'third_timezone']
210 |     tz_df = df[cols_tz].copy()  # copy to avoid mutating the caller's dataframe
211 |     tz_df.fillna('', inplace=True)
212 |     tz_df['timezone'] = tz_df.timezone.map(lambda t: remove_text_parentheses(t).split(' ')[-1].replace('me', ' '))
213 |     tz_df['second_timezone'] = tz_df.second_timezone.map(lambda t: remove_text_parentheses(t).split(' ')[-1].replace('me', ' '))
214 |     tz_df['third_timezone'] = tz_df.third_timezone.map(lambda t: remove_text_parentheses(t).split(' ')[-1].replace('me', ' '))
215 |     tz_list = tz_df.to_dict(orient='records')
216 |     tz_pair_df = pd.DataFrame(list(product(tz_list, tz_list)), columns=['tz1', 'tz2'])
217 |     cannot_link = list(tz_pair_df.apply(lambda r: check_if_timezone_overlap(r['tz1'], r['tz2']), axis=1).dropna())
218 |     return cannot_link
219 |
220 |
221 | if __name__ == '__main__':
222 |     # starter
223 |     scaler = MinMaxScaler()
224 |     df = pd.read_csv('nma_applicants.csv')  # read_csv has no 'index' keyword argument
225 |
226 |     # calculate timezone distance
227 |     D_tz = calculate_timezone_distance_matrix(df)
228 |
229 |     # calculate geolocation distance
230 |     D_lat_lng = calculate_geo_distance_matrix(df)
231 |     D_lat_lng_scale = scaler.fit_transform(D_lat_lng)
232 |     D_lat_lng_scale = pd.DataFrame(D_lat_lng_scale).fillna(np.nanmean(D_lat_lng_scale)).values
233 |
234 |     # calculate topic distance between statements
235 |     persons_1 = list(map(preprocess, list(df['Statement'])))
236 |     persons_2 = list(map(preprocess, list(df['Statement'])))
237 |     D_statement = - compute_affinity(persons_1, persons_2,
238 |                                      n_components=30, min_df=2, max_df=0.8,
239 |                                      weighting='tfidf', projection='svd')
240 |     std_topic = D_statement.std()
241 |
242 |     # list of cannot-link constraints
243 |     cannot_link = generate_cannot_link_list_fast(df)
244 |
245 |     # clustering
246 |     D_final = (D_statement) + (10 * std_topic * D_tz) + (std_topic * D_lat_lng_scale)  # final distance
247 |     X_mds = MDS(n_components=30).fit_transform(D_final)
248 |     clusters_kmean, centers_kmean = cop_kmeans(dataset=X_mds, k=200, cl=cannot_link)
249 |     output_df = df[selected_cols].copy()
250 |     output_df['pod_number'] = clusters_kmean
251 |
252 |     # rearrange
253 |     df_rearrange = []
254 |     pod_num = 1
255 |     for _, df_tz in output_df.groupby('timezone'):
256 |         for _, df_pod_num in df_tz.groupby('pod_number'):
257 |             df_pod_num['pod_number'] = pod_num
258 |             df_rearrange.append(df_pod_num)
259 |             pod_num += 1
260 |     df_rearrange = pd.concat(df_rearrange)[selected_cols + ['pod_number']]  # keep the rearranged pod assignment in the output
261 |     df_rearrange.to_csv('pod_matching_rearrange_mds.csv', index=False)
--------------------------------------------------------------------------------
/mm_feedback_site/static/css/normalize.css:
--------------------------------------------------------------------------------
1 | /*! normalize.css v7.0.0 | MIT License | github.com/necolas/normalize.css */
2 |
3 | /* Document
4 |    ========================================================================== */
5 |
6 | /**
7 |  * 1. Correct the line height in all browsers.
8 |  * 2. Prevent adjustments of font size after orientation changes in
9 |  *    IE on Windows Phone and in iOS.
10 | */ 11 | 12 | html { 13 | line-height: 1.15; /* 1 */ 14 | -ms-text-size-adjust: 100%; /* 2 */ 15 | -webkit-text-size-adjust: 100%; /* 2 */ 16 | } 17 | 18 | /* Sections 19 | ========================================================================== */ 20 | 21 | /** 22 | * Remove the margin in all browsers (opinionated). 23 | */ 24 | 25 | body { 26 | margin: 0; 27 | } 28 | 29 | /** 30 | * Add the correct display in IE 9-. 31 | */ 32 | 33 | article, 34 | aside, 35 | footer, 36 | header, 37 | nav, 38 | section { 39 | display: block; 40 | } 41 | 42 | /** 43 | * Correct the font size and margin on `h1` elements within `section` and 44 | * `article` contexts in Chrome, Firefox, and Safari. 45 | */ 46 | 47 | h1 { 48 | font-size: 2em; 49 | margin: 0.67em 0; 50 | } 51 | 52 | /* Grouping content 53 | ========================================================================== */ 54 | 55 | /** 56 | * Add the correct display in IE 9-. 57 | * 1. Add the correct display in IE. 58 | */ 59 | 60 | figcaption, 61 | figure, 62 | main { /* 1 */ 63 | display: block; 64 | } 65 | 66 | /** 67 | * Add the correct margin in IE 8. 68 | */ 69 | 70 | figure { 71 | margin: 1em 40px; 72 | } 73 | 74 | /** 75 | * 1. Add the correct box sizing in Firefox. 76 | * 2. Show the overflow in Edge and IE. 77 | */ 78 | 79 | hr { 80 | box-sizing: content-box; /* 1 */ 81 | height: 0; /* 1 */ 82 | overflow: visible; /* 2 */ 83 | } 84 | 85 | /** 86 | * 1. Correct the inheritance and scaling of font size in all browsers. 87 | * 2. Correct the odd `em` font sizing in all browsers. 88 | */ 89 | 90 | pre { 91 | font-family: monospace, monospace; /* 1 */ 92 | font-size: 1em; /* 2 */ 93 | } 94 | 95 | /* Text-level semantics 96 | ========================================================================== */ 97 | 98 | /** 99 | * 1. Remove the gray background on active links in IE 10. 100 | * 2. Remove gaps in links underline in iOS 8+ and Safari 8+. 101 | */ 102 | 103 | a { 104 | background-color: transparent; /* 1 */ 105 | -webkit-text-decoration-skip: objects; /* 2 */ 106 | } 107 | 108 | /** 109 | * 1. Remove the bottom border in Chrome 57- and Firefox 39-. 110 | * 2. Add the correct text decoration in Chrome, Edge, IE, Opera, and Safari. 111 | */ 112 | 113 | abbr[title] { 114 | border-bottom: none; /* 1 */ 115 | text-decoration: underline; /* 2 */ 116 | text-decoration: underline dotted; /* 2 */ 117 | } 118 | 119 | /** 120 | * Prevent the duplicate application of `bolder` by the next rule in Safari 6. 121 | */ 122 | 123 | b, 124 | strong { 125 | font-weight: inherit; 126 | } 127 | 128 | /** 129 | * Add the correct font weight in Chrome, Edge, and Safari. 130 | */ 131 | 132 | b, 133 | strong { 134 | font-weight: bolder; 135 | } 136 | 137 | /** 138 | * 1. Correct the inheritance and scaling of font size in all browsers. 139 | * 2. Correct the odd `em` font sizing in all browsers. 140 | */ 141 | 142 | code, 143 | kbd, 144 | samp { 145 | font-family: monospace, monospace; /* 1 */ 146 | font-size: 1em; /* 2 */ 147 | } 148 | 149 | /** 150 | * Add the correct font style in Android 4.3-. 151 | */ 152 | 153 | dfn { 154 | font-style: italic; 155 | } 156 | 157 | /** 158 | * Add the correct background and color in IE 9-. 159 | */ 160 | 161 | mark { 162 | background-color: #ff0; 163 | color: #000; 164 | } 165 | 166 | /** 167 | * Add the correct font size in all browsers. 168 | */ 169 | 170 | small { 171 | font-size: 80%; 172 | } 173 | 174 | /** 175 | * Prevent `sub` and `sup` elements from affecting the line height in 176 | * all browsers. 
177 | */ 178 | 179 | sub, 180 | sup { 181 | font-size: 75%; 182 | line-height: 0; 183 | position: relative; 184 | vertical-align: baseline; 185 | } 186 | 187 | sub { 188 | bottom: -0.25em; 189 | } 190 | 191 | sup { 192 | top: -0.5em; 193 | } 194 | 195 | /* Embedded content 196 | ========================================================================== */ 197 | 198 | /** 199 | * Add the correct display in IE 9-. 200 | */ 201 | 202 | audio, 203 | video { 204 | display: inline-block; 205 | } 206 | 207 | /** 208 | * Add the correct display in iOS 4-7. 209 | */ 210 | 211 | audio:not([controls]) { 212 | display: none; 213 | height: 0; 214 | } 215 | 216 | /** 217 | * Remove the border on images inside links in IE 10-. 218 | */ 219 | 220 | img { 221 | border-style: none; 222 | } 223 | 224 | /** 225 | * Hide the overflow in IE. 226 | */ 227 | 228 | svg:not(:root) { 229 | overflow: hidden; 230 | } 231 | 232 | /* Forms 233 | ========================================================================== */ 234 | 235 | /** 236 | * 1. Change the font styles in all browsers (opinionated). 237 | * 2. Remove the margin in Firefox and Safari. 238 | */ 239 | 240 | button, 241 | input, 242 | optgroup, 243 | select, 244 | textarea { 245 | font-family: sans-serif; /* 1 */ 246 | font-size: 100%; /* 1 */ 247 | line-height: 1.15; /* 1 */ 248 | margin: 0; /* 2 */ 249 | } 250 | 251 | /** 252 | * Show the overflow in IE. 253 | * 1. Show the overflow in Edge. 254 | */ 255 | 256 | button, 257 | input { /* 1 */ 258 | overflow: visible; 259 | } 260 | 261 | /** 262 | * Remove the inheritance of text transform in Edge, Firefox, and IE. 263 | * 1. Remove the inheritance of text transform in Firefox. 264 | */ 265 | 266 | button, 267 | select { /* 1 */ 268 | text-transform: none; 269 | } 270 | 271 | /** 272 | * 1. Prevent a WebKit bug where (2) destroys native `audio` and `video` 273 | * controls in Android 4. 274 | * 2. Correct the inability to style clickable types in iOS and Safari. 275 | */ 276 | 277 | button, 278 | html [type="button"], /* 1 */ 279 | [type="reset"], 280 | [type="submit"] { 281 | -webkit-appearance: button; /* 2 */ 282 | } 283 | 284 | /** 285 | * Remove the inner border and padding in Firefox. 286 | */ 287 | 288 | button::-moz-focus-inner, 289 | [type="button"]::-moz-focus-inner, 290 | [type="reset"]::-moz-focus-inner, 291 | [type="submit"]::-moz-focus-inner { 292 | border-style: none; 293 | padding: 0; 294 | } 295 | 296 | /** 297 | * Restore the focus styles unset by the previous rule. 298 | */ 299 | 300 | button:-moz-focusring, 301 | [type="button"]:-moz-focusring, 302 | [type="reset"]:-moz-focusring, 303 | [type="submit"]:-moz-focusring { 304 | outline: 1px dotted ButtonText; 305 | } 306 | 307 | /** 308 | * Correct the padding in Firefox. 309 | */ 310 | 311 | fieldset { 312 | padding: 0.35em 0.75em 0.625em; 313 | } 314 | 315 | /** 316 | * 1. Correct the text wrapping in Edge and IE. 317 | * 2. Correct the color inheritance from `fieldset` elements in IE. 318 | * 3. Remove the padding so developers are not caught out when they zero out 319 | * `fieldset` elements in all browsers. 320 | */ 321 | 322 | legend { 323 | box-sizing: border-box; /* 1 */ 324 | color: inherit; /* 2 */ 325 | display: table; /* 1 */ 326 | max-width: 100%; /* 1 */ 327 | padding: 0; /* 3 */ 328 | white-space: normal; /* 1 */ 329 | } 330 | 331 | /** 332 | * 1. Add the correct display in IE 9-. 333 | * 2. Add the correct vertical alignment in Chrome, Firefox, and Opera. 
334 | */ 335 | 336 | progress { 337 | display: inline-block; /* 1 */ 338 | vertical-align: baseline; /* 2 */ 339 | } 340 | 341 | /** 342 | * Remove the default vertical scrollbar in IE. 343 | */ 344 | 345 | textarea { 346 | overflow: auto; 347 | } 348 | 349 | /** 350 | * 1. Add the correct box sizing in IE 10-. 351 | * 2. Remove the padding in IE 10-. 352 | */ 353 | 354 | [type="checkbox"], 355 | [type="radio"] { 356 | box-sizing: border-box; /* 1 */ 357 | padding: 0; /* 2 */ 358 | } 359 | 360 | /** 361 | * Correct the cursor style of increment and decrement buttons in Chrome. 362 | */ 363 | 364 | [type="number"]::-webkit-inner-spin-button, 365 | [type="number"]::-webkit-outer-spin-button { 366 | height: auto; 367 | } 368 | 369 | /** 370 | * 1. Correct the odd appearance in Chrome and Safari. 371 | * 2. Correct the outline style in Safari. 372 | */ 373 | 374 | [type="search"] { 375 | -webkit-appearance: textfield; /* 1 */ 376 | outline-offset: -2px; /* 2 */ 377 | } 378 | 379 | /** 380 | * Remove the inner padding and cancel buttons in Chrome and Safari on macOS. 381 | */ 382 | 383 | [type="search"]::-webkit-search-cancel-button, 384 | [type="search"]::-webkit-search-decoration { 385 | -webkit-appearance: none; 386 | } 387 | 388 | /** 389 | * 1. Correct the inability to style clickable types in iOS and Safari. 390 | * 2. Change font properties to `inherit` in Safari. 391 | */ 392 | 393 | ::-webkit-file-upload-button { 394 | -webkit-appearance: button; /* 1 */ 395 | font: inherit; /* 2 */ 396 | } 397 | 398 | /* Interactive 399 | ========================================================================== */ 400 | 401 | /* 402 | * Add the correct display in IE 9-. 403 | * 1. Add the correct display in Edge, IE, and Firefox. 404 | */ 405 | 406 | details, /* 1 */ 407 | menu { 408 | display: block; 409 | } 410 | 411 | /* 412 | * Add the correct display in all browsers. 413 | */ 414 | 415 | summary { 416 | display: list-item; 417 | } 418 | 419 | /* Scripting 420 | ========================================================================== */ 421 | 422 | /** 423 | * Add the correct display in IE 9-. 424 | */ 425 | 426 | canvas { 427 | display: inline-block; 428 | } 429 | 430 | /** 431 | * Add the correct display in IE. 432 | */ 433 | 434 | template { 435 | display: none; 436 | } 437 | 438 | /* Hidden 439 | ========================================================================== */ 440 | 441 | /** 442 | * Add the correct display in IE 10-. 443 | */ 444 | 445 | [hidden] { 446 | display: none; 447 | } -------------------------------------------------------------------------------- /mm_feedback_site/flask_templates/feedback.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Mind Matching Feedback 5 | {% include 'head.html' %} 6 | 7 | 8 | 9 |
10 |

Feedback Form: CCN 2019 Mind-Match

11 | This is a feedback form for CCN Mind-Matching 2019. 12 | For each match, you can rate:
13 | (1) research relevance on a scale of 1 to 5
14 | (2) your satisfaction with the match on a scale of 1 to 5,
15 | where 1 means least relevant or least satisfying and 5 means highly relevant or highly satisfying.
16 | We also list their email addresses in case you would like to follow up on your discussions. 17 | 18 |
19 |

20 | Your information:
21 | {{ full_name }}
22 | {{ affiliation }}
23 | {{ registrant_id }} (registration id) 24 |


25 |

26 | 27 |
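<!-- One rating block below is rendered for each matched partner in matches_info -->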
28 | {% for i, match_dict in enumerate(matches_info) %} 29 | Mind Match {{i + 1}}: 30 | {{match_dict.full_name}}, {{match_dict.affiliation}}, 31 | {{match_dict.email}} 32 |
33 |
34 |
35 | Research Relevance: 36 |
37 |
38 | 1 39 | 40 | 41 | 42 | 43 | 44 | 5 45 |
46 |
47 |
48 |
49 | Satisfaction: 50 |
51 |
52 | 1 53 | 54 | 55 | 56 | 57 | 58 | 5 59 |
60 |
61 |
62 |
63 | Did you already know this person so well that the meeting was less meaningful? (Check the box if you knew the person too well.) 64 |
65 |
66 | 67 | 68 | 69 | 70 |
71 |
72 |
73 | {% endfor %} 74 | 75 |
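<!-- End of per-match blocks; questions about the overall session follow -->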

Additional feedback for the session

76 | 77 |
78 |
79 | How useful was the Mind-Matching session to you? (1=worst, 10=best) 80 |
81 |
82 | 1 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 10 94 |
95 |
96 | 97 |
98 |
99 | How enjoyable was the Mind-Matching session to you? (1=worst, 10=best) 100 |
101 |
102 | 1 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 10 114 |
115 |
116 | 117 | 124 |
125 | 126 |
127 |
128 | Do you think it would be more beneficial to hold the mind-matching session before, or early during, the conference? (Check the box if yes.) 129 |
130 | 131 |
132 | 133 | 134 | 135 | 136 |
137 |
138 | 139 |
140 | 141 | 142 |
143 |
144 |
145 |
146 | 147 | {% include 'footer.html' %} 148 | 149 | 195 | 196 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | Copyright 2013-2018 Docker, Inc. 
180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | https://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /ccn/ccn_mind_matching_2019.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code snippet for producing the CCN 2019 Mind Matching session. 3 | We create a people-to-people affinity matrix using topic modeling, 4 | then solve a linear programming problem and apply networkx to solve the scheduling problem. 5 | 6 | The given data includes the following columns: 7 | - RegistrantID 8 | - NameFirst, first name of the attendee 9 | - NameLast, last name of the attendee 10 | - Affiliation 11 | - Email 12 | - mindMatchPersons, list of people the attendee wants to meet (not used) 13 | - RepresentativeWork 14 | - mindMatchExclude 15 | """ 16 | 17 | import itertools 18 | import numpy as np 19 | import pandas as pd 20 | import random 21 | import networkx as nx 22 | from itertools import chain 23 | from fuzzywuzzy import fuzz 24 | from paper_reviewer_matcher import ( 25 | preprocess, compute_affinity, 26 | create_lp_matrix, linprog, 27 | create_assignment 28 | ) 29 | from docx import Document 30 | 31 | 32 | def build_line_graph(people): 33 | """ 34 | Build the line graph used for edge coloring (Vizing's theorem); 35 | the approach follows the Stack Overflow answer below 36 | ref: https://stackoverflow.com/questions/51758406/creating-time-schedule-from-list-of-people-and-who-they-have-to-meet 37 | """ 38 | G = nx.Graph() 39 | G.add_edges_from(((p, q) for p, L in people for q in L)) 40 | return nx.line_graph(G) 41 | 42 | 43 | def color_graph(G): 44 | return nx.greedy_color(G) 45 | 46 | 47 | def format_answer(coloring): 48 | res = {} 49 | N = max(coloring.values()) + 1 50 | for meeting in coloring: 51 | time_slot = coloring[meeting] 52 | for meeting_member in (0, 1): 53 | if meeting[meeting_member] not in res: 54 | res[meeting[meeting_member]] = [None] * N 55 | res[meeting[meeting_member]][time_slot] = meeting[1-meeting_member] 56 | return res 57 | 58 | 59 | def nest_answer(people, formatted): 60 | return [[p, formatted[p]] for p, v in people] 61 | 62 | 63 | def split_exclude_string(people): 64 | """ 65 | Split a given text of comma-separated names that a person wants to exclude, 66 | e.g.
``Konrad, Titipat`` 67 | """ 68 | people = people.replace('Mentor: ', '').replace('Lab-mates: ', '').replace('\r\n', ',').replace(';', ',') 69 | people_list = people.split(',') 70 | return [p.strip() for p in people_list if p.strip() != ''] 71 | 72 | 73 | def create_coi_dataframe(df, people_maps, threshold=85, coreferred=True): 74 | """ 75 | For a given dataframe of mind-match people with 76 | ``full_name`` and ``mindMatchExcludeList`` columns, and 77 | a list of dictionaries that map ``full_name`` to person_id, 78 | create a conflict-of-interest dataframe 79 | 80 | Parameters 81 | ========== 82 | df: dataframe, original mind matching dataset 83 | people_maps: list, list of dictionaries mapping each person to their person_id, full_name, and affiliation 84 | threshold: int, fuzzy string match ratio for matching names in ``mindMatchExcludeList`` against ``full_name`` 85 | coreferred: bool, if True, add extra conflicts of interest for people who mentioned the same person 86 | 87 | Output 88 | ====== 89 | coi_df: dataframe, conflict-of-interest pairs 90 | """ 91 | coi_list = [] 92 | for i, r in df.iterrows(): 93 | if len(r['mindMatchExcludeList']) > 0: 94 | exclude_list = [] 95 | for exclude in r['mindMatchExcludeList']: 96 | exclude_list.extend([ 97 | p['person_id'] for p in people_maps if 98 | exclude in p['full_name'] or 99 | fuzz.ratio(p['full_name'], exclude) >= threshold or 100 | fuzz.ratio(p['affiliation'], exclude) >= threshold 101 | ]) 102 | exclude_list = sorted(pd.unique(exclude_list)) 103 | if len(exclude_list) > 0: 104 | for e in exclude_list: 105 | coi_list.append([i, e]) 106 | coi_df = pd.DataFrame(coi_list, columns=['person_id', 'person_id_exclude']) 107 | 108 | # add extra co-referred COI for people who refer to the same person 109 | if coreferred: 110 | coi_coreferred = [[g, list(g_df.person_id)] for g, g_df in coi_df.groupby(['person_id_exclude']) 111 | if len(list(g_df.person_id)) >= 2] 112 | 113 | coi_coreferred_list = [] 114 | for _, exclude_list in coi_coreferred: 115 | coi_coreferred_list.extend(list(itertools.combinations(exclude_list, 2))) 116 | coi_coreferred_df = pd.DataFrame(coi_coreferred_list, columns=['person_id', 'person_id_exclude']) 117 | coi_df = pd.concat((coi_df, coi_coreferred_df)) 118 | return coi_df 119 | else: 120 | return coi_df 121 | 122 | 123 | def convert_mind_match_to_document(mind_matching_df, table_map=None, file_name='ccn_mindmatch_2019.docx'): 124 | """ 125 | Create the full mind-matching schedule in Word document format, 126 | printing each person's name, affiliation, registration ID, and the list of people to meet 127 | """ 128 | pages = [] 129 | for person_id, mind_matching_schedule_df in mind_matching_df.groupby('person_id'): 130 | page = [] 131 | page.extend([ 132 | person_id_map[person_id], 133 | person_affil_map[person_id], 134 | 'RegID: {}'.format(registration_id_map[person_id]) 135 | ]) 136 | page.extend([ 137 | '----------------------', 138 | 'Mind Matching Schedule', 139 | '----------------------' 140 | ]) 141 | for _, r in mind_matching_schedule_df.iterrows(): 142 | if table_map is not None: 143 | table_number = table_map[r['table_number']] 144 | else: 145 | table_number = r['table_number'] 146 | page.extend([ 147 | 'timeslot: {}, table number: {}, mind-match: {} ({})'.\ 148 | format(r['timeslot'], table_number, person_id_map[r['person_to_meet_id']], person_affil_map[r['person_to_meet_id']]) 149 | ]) 150 | pages.append('\n'.join(page)) 151 | 152 | # save to word document 153 | document = Document() 154 | for page in pages: 155 | document.add_paragraph(page) 156 |
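# page break so that each attendee's schedule starts on its own page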
document.add_page_break() 157 | document.save(file_name) 158 | 159 | 160 | def convert_mind_match_to_minimized_format(mind_matching_df, table_map=None, file_name='ccn_mindmatch_2019_minimized.csv'): 161 | """ 162 | Convert the full mind-matching schedule into a CSV file with 2 columns, 163 | ``RegistrantID`` and ``ScheduleTables`` e.g. 1013, 1a|32a|1a|1a|1a|1a 164 | """ 165 | # output CSV for CCN mind-matching with 2 columns RegistrantID, ScheduleTables e.g. 1013, 1a|32a|1a|1a|1a|1a 166 | minimized_mind_matching = [] 167 | for person_id, mind_matching_schedule_df in mind_matching_df.groupby('person_id'): 168 | if table_map is not None: 169 | minimized_mind_matching.append({ 170 | 'RegistrantID': registration_id_map[person_id], 171 | 'ScheduleTables': '|'.join([table_map[e] for e in list(mind_matching_schedule_df.sort_values('timeslot').table_number.values)]) 172 | }) 173 | else: 174 | minimized_mind_matching.append({ 175 | 'RegistrantID': registration_id_map[person_id], 176 | 'ScheduleTables': '|'.join([e for e in list(mind_matching_schedule_df.sort_values('timeslot').table_number.values)]) 177 | }) 178 | minimized_mind_matching_df = pd.DataFrame(minimized_mind_matching) 179 | minimized_mind_matching_df.to_csv(file_name, index=False) 180 | 181 | 182 | if __name__ == '__main__': 183 | df = pd.read_csv('CN19_MindMatchData_20190903-A.csv', encoding='iso-8859-1') 184 | df['full_name'] = df['NameFirst'] + ' ' + df['NameLast'] 185 | df['person_id'] = list(range(len(df))) 186 | 187 | people_maps = [{'person_id': r['person_id'], 188 | 'full_name': r['full_name'], 189 | 'affiliation': r['Affiliation']} 190 | for i, r in df.iterrows()] 191 | person_id_map = {r['person_id']: r['full_name'] for _, r in df.iterrows()} 192 | person_affil_map = {r['person_id']: r['Affiliation'] for _, r in df.iterrows()} 193 | registration_id_map = {r['person_id']: r['RegistrantID'] for _, r in df.iterrows()} 194 | if 'mindMatchExclude' in df.columns: 195 | df['mindMatchExcludeList'] = df.mindMatchExclude.fillna(',').map(split_exclude_string) 196 | coi_df = create_coi_dataframe(df, people_maps, threshold=85, coreferred=True) 197 | 198 | # create assignment matrix 199 | n_meeting = 6 200 | persons_1 = list(map(preprocess, list(df['RepresentativeWork']))) 201 | persons_2 = list(map(preprocess, list(df['RepresentativeWork']))) 202 | A = compute_affinity(persons_1, persons_2, 203 | n_components=10, min_df=2, max_df=0.8, 204 | weighting='tfidf', projection='pca') 205 | # add constraints: conflict of interest 206 | A[np.arange(len(A)), np.arange(len(A))] = -1000 # set diagonal so that people cannot be matched with themselves 207 | for _, r in coi_df.iterrows(): 208 | A[r['person_id'], r['person_id_exclude']] = -1000 209 | A[r['person_id_exclude'], r['person_id']] = -1000 210 | 211 | # trim the affinity matrix to reduce the problem size 212 | n_trim = 2 213 | A_trim = [] 214 | for r in range(len(A)): 215 | a = A[r, :] 216 | a[np.argsort(a)[0:n_trim]] = 0 217 | A_trim.append(a) 218 | A_trim = np.vstack(A_trim) 219 | 220 | print('Solving linear programming for Mind-Matching session...') 221 | v, K, d = create_lp_matrix(A_trim, 222 | min_reviewers_per_paper=6, max_reviewers_per_paper=6, 223 | min_papers_per_reviewer=6, max_papers_per_reviewer=6) 224 | x_sol = linprog(v, K, d)['x'] 225 | b = create_assignment(x_sol, A_trim) 226 | print('Done!') 227 | 228 | output = [] 229 | for i in range(len(b)): 230 | r = [list(df['person_id'])[b_] for b_ in np.nonzero(b[i])[0]] 231 | output.append([list(df.person_id)[i], r]) 232 | 233 | # make optimal schedule
[[person_id, [match_id_1, match_id_2, ...]], ...] 234 | schedule = nest_answer(output, format_answer(color_graph(build_line_graph(output)))) 235 | 236 | # make the document from calculated schedule 237 | schedule_df = pd.DataFrame(schedule, columns=['person_id', 'match_id']) 238 | schedule_df['match_id'] = schedule_df.match_id.map(lambda x: x[0: n_meeting]) 239 | 240 | # create a full mind-matching dataframe 241 | mind_matching_df = [] 242 | for i in range(n_meeting): 243 | schedule_df['match'] = schedule_df.match_id.map(lambda x: x[i]) 244 | match_pairs = list(pd.unique([frozenset((r['person_id'], int(r['match']))) 245 | for _, r in schedule_df.iterrows() if not pd.isnull(r['match'])])) 246 | 247 | r = list(set(schedule_df.person_id) - set(schedule_df['match'].dropna().unique().astype(int))) 248 | random.shuffle(r) 249 | match_pairs.extend(list(map(frozenset, zip(r[0:int(len(r)/2)], r[int(len(r)/2):])))) 250 | match_lookup = [(list(k), v) for v, k in enumerate(match_pairs, start=1)] 251 | person_lookup = {} 252 | for k, v in match_lookup: 253 | person_lookup[k[0]] = k[1] 254 | person_lookup[k[1]] = k[0] 255 | match_df = pd.DataFrame(list(chain.from_iterable([[[k[0], v], [k[1], v]] for k, v in match_lookup])), 256 | columns=['person_id', 'table_number']) 257 | match_df['person_to_meet_id'] = match_df.person_id.map(lambda x: person_lookup[x]) 258 | match_df['timeslot'] = i + 1 259 | mind_matching_df.append(match_df) 260 | mind_matching_df = pd.concat(mind_matching_df) 261 | 262 | # For CCN, we have table each for 4 pairs and we need to have 32 tables for the session i.e. 4 pairs per table, 32 tables for 250 people 263 | table_map = {k: v for k, v in enumerate([str(i) + c 264 | for i in range(1, 33) 265 | for c in 'abcd'], start=1)} 266 | 267 | # create full schedule for mind matching in word document format and minimized CSV format (for organizers) 268 | convert_mind_match_to_document(mind_matching_df, table_map, file_name='ccn_mindmatch_2019.docx') # output for organizer to see 269 | convert_mind_match_to_minimized_format(mind_matching_df, table_map, file_name='ccn_mindmatch_2019_minimized.csv') 270 | print('Saved matched files into CSV and DOCX format.') -------------------------------------------------------------------------------- /data/output_match.csv: -------------------------------------------------------------------------------- 1 | user_id,match_ids 2 | 1,26;41;134;181;310;463 3 | 2,133;141;145;151;152;263 4 | 3,4;5;19;91;135;166 5 | 4,3;52;117;135;193;401 6 | 5,3;4;10;19;41;483 7 | 6,30;52;135;395;402;449 8 | 7,91;171;330;396;400;486 9 | 8,108;113;162;254;462;469 10 | 9,102;103;200;272;374;445 11 | 10,3;5;41;42;157;284 12 | 11,410;415;419;421;424;500 13 | 12,65;90;124;168;387;494 14 | 13,211;222;296;422;437;472 15 | 14,138;160;169;294;330;457 16 | 15,69;139;167;217;292;380 17 | 16,18;26;122;336;347;407 18 | 17,48;70;105;167;260;397 19 | 18,16;106;205;267;408;500 20 | 19,3;5;91;135;396;400 21 | 20,128;155;279;380;459;460 22 | 21,37;39;194;349;359;368 23 | 22,110;236;246;273;362;410 24 | 23,79;102;130;156;272;489 25 | 24,121;178;328;388;426;427 26 | 25,104;165;257;289;290;372 27 | 26,1;162;249;341;397;469 28 | 27,159;171;260;391;395;492 29 | 28,27;47;294;391;392;399 30 | 29,30;69;127;279;401;402 31 | 30,6;29;135;171;398;400 32 | 31,74;78;143;230;413;414 33 | 32,86;131;136;147;322;471 34 | 33,73;74;140;351;364;464 35 | 34,35;79;189;367;380;463 36 | 35,73;83;85;323;348;365 37 | 36,38;275;307;339;346;367 38 | 37,21;40;206;297;405;466 39 | 38,36;89;275;321;346;388 40 | 
39,21;40;253;349;359;466 41 | 40,37;39;203;253;307;466 42 | 41,1;10;181;373;377;389 43 | 42,119;138;154;190;241;315 44 | 43,126;144;150;413;455;498 45 | 44,46;182;305;306;334;451 46 | 45,32;296;313;314;434;435 47 | 46,44;182;305;306;334;451 48 | 47,28;80;101;300;301;327 49 | 48,17;47;80;191;217;300 50 | 49,97;113;166;387;452;461 51 | 50,38;203;283;308;321;350 52 | 51,204;237;303;390;417;485 53 | 52,6;116;228;392;447;448 54 | 53,88;198;387;391;392;399 55 | 54,186;276;325;370;473;497 56 | 55,110;277;284;302;345;373 57 | 56,57;163;250;269;384;453 58 | 57,45;58;59;201;418;478 59 | 58,57;59;185;188;331;386 60 | 59,57;58;200;331;418;478 61 | 60,97;158;166;228;261;397 62 | 61,280;361;369;381;405;494 63 | 62,78;218;245;317;347;375 64 | 63,64;184;220;238;249;470 65 | 64,203;214;234;383;432;484 66 | 65,12;80;125;159;300;327 67 | 66,104;141;151;441;487;496 68 | 67,222;271;287;326;482;499 69 | 68,94;123;173;226;318;381 70 | 69,15;292;330;394;402;456 71 | 70,81;161;167;223;329;491 72 | 71,72;75;89;235;297;348 73 | 72,71;75;83;235;353;356 74 | 73,33;35;83;85;140;142 75 | 74,31;33;140;351;364;464 76 | 75,71;72;76;235;352;353 77 | 76,75;153;235;253;349;465 78 | 77,89;309;322;352;358;497 79 | 78,31;62;197;237;464;473 80 | 79,34;81;93;101;329;491 81 | 80,47;65;159;300;301;327 82 | 81,5;156;187;242;412;436 83 | 82,120;170;196;199;200;289 84 | 83,71;72;73;85;140;352 85 | 84,145;153;197;351;355;423 86 | 85,35;73;83;323;348;365 87 | 86,32;131;147;298;322;358 88 | 87,74;140;235;354;359;423 89 | 88,34;53;142;237;350;355 90 | 89,38;71;77;235;321;466 91 | 90,12;65;216;379;406;486 92 | 91,7;19;171;260;396;400 93 | 92,154;202;241;336;345;417 94 | 93,79;101;102;220;240;494 95 | 94,68;196;226;318;363;381 96 | 95,100;106;120;205;284;386 97 | 96,2;109;248;286;472;493 98 | 97,49;60;183;217;240;261 99 | 98,100;120;257;273;289;290 100 | 99,220;221;287;326;490;499 101 | 100,25;95;98;108;205;257 102 | 101,79;81;93;102;272;445 103 | 102,9;23;93;101;272;445 104 | 103,9;94;200;278;362;382 105 | 104,123;165;173;226;372;407 106 | 105,17;167;172;173;223;291 107 | 106,18;119;120;205;269;289 108 | 107,183;217;267;291;304;446 109 | 108,8;100;168;257;383;444 110 | 109,96;132;186;248;472;493 111 | 110,55;275;277;302;408;498 112 | 111,195;261;293;323;324;365 113 | 112,164;227;339;369;374;454 114 | 113,49;66;428;461;462;491 115 | 114,133;256;261;277;406;408 116 | 115,116;264;265;448;449;483 117 | 116,52;115;166;264;265;448 118 | 117,4;265;399;401;447;462 119 | 118,13;185;200;207;250;478 120 | 119,18;42;121;178;190;205 121 | 120,82;95;98;106;257;289 122 | 121,24;178;328;388;426;427 123 | 122,16;130;172;193;202;205 124 | 123,16;104;226;266;372;407 125 | 124,158;198;225;301;387;401 126 | 125,12;65;224;266;288;494 127 | 126,43;188;316;455;474;498 128 | 127,29;128;157;180;398;469 129 | 128,20;127;333;380;480;485 130 | 129,139;177;198;455;458;460 131 | 130,79;122;134;168;206;489 132 | 131,86;147;276;322;370;495 133 | 132,109;186;248;325;376;493 134 | 133,2;145;256;324;367;406 135 | 134,1;130;206;379;429;489 136 | 135,3;4;6;30;166;449 137 | 136,77;146;358;370;471;475 138 | 137,208;210;213;215;267;324 139 | 138,14;42;160;392;444;457 140 | 139,15;28;129;172;191;410 141 | 140,33;73;74;83;87;359 142 | 141,2;229;230;233;480;488 143 | 142,71;73;143;350;355;356 144 | 143,142;350;351;356;364;371 145 | 144,55;56;270;312;422;431 146 | 145,2;84;133;299;351;357 147 | 146,136;298;358;370;471;475 148 | 147,32;86;131;194;276;495 149 | 148,132;232;325;357;411;467 150 | 149,194;277;285;366;411;495 151 | 150,43;144;231;354;463;498 152 | 151,2;96;141;344;354;371 
153 | 152,145;259;282;332;403;429 154 | 153,76;84;413;414;423;465 155 | 154,42;92;202;241;303;376 156 | 155,20;179;279;335;398;468 157 | 156,23;41;102;130;256;500 158 | 157,10;29;127;162;395;402 159 | 158,104;156;242;254;439;443 160 | 159,239;403;411;436;437;442 161 | 160,14;138;294;393;394;457 162 | 161,66;70;223;329;381;491 163 | 162,8;26;157;168;389;469 164 | 163,56;170;290;342;384;453 165 | 164,9;112;240;272;445;446 166 | 165,10;22;23;251;271;313 167 | 166,3;49;60;116;264;452 168 | 167,17;70;81;105;223;319 169 | 168,12;108;130;162;255;383 170 | 169,14;138;192;279;444;457 171 | 170,82;163;196;199;342;384 172 | 171,7;27;91;260;396;400 173 | 172,105;139;174;319;337;397 174 | 173,187;219;254;309;416;440 175 | 174,62;93;139;172;240;280 176 | 175,130;412;416;419;421;424 177 | 176,234;262;295;319;458;459 178 | 177,17;129;158;262;293;455 179 | 178,24;119;121;328;426;427 180 | 179,24;382;403;430;481;483 181 | 180,127;239;280;320;344;485 182 | 181,1;41;310;332;480;488 183 | 182,44;46;183;305;306;451 184 | 183,15;97;107;182;217;304 185 | 184,61;63;220;238;278;361 186 | 185,57;58;118;415;418;478 187 | 186,109;132;248;299;472;493 188 | 187,26;127;183;304;469;500 189 | 188,58;126;270;284;331;362 190 | 189,34;262;323;380;442;463 191 | 190,42;119;234;268;315;338 192 | 191,48;139;240;300;397;500 193 | 192,169;268;291;337;346;410 194 | 193,4;51;134;286;363;417 195 | 194,21;78;147;149;285;470 196 | 195,111;227;244;259;263;293 197 | 196,82;94;170;250;289;407 198 | 197,78;84;147;237;364;464 199 | 198,53;124;129;225;301;327 200 | 199,82;164;170;291;363;368 201 | 200,9;82;103;118;196;386 202 | 201,45;57;58;59;386;415 203 | 202,209;311;312;313;440;441 204 | 203,40;50;88;281;308;350 205 | 204,231;324;366;417;474;495 206 | 205,18;95;100;106;119;122 207 | 206,37;134;283;307;379;489 208 | 207,118;250;269;386;418;478 209 | 208,137;210;212;213;215;252 210 | 209,211;212;213;214;252;338 211 | 210,137;208;215;252;267;304 212 | 211,137;209;213;247;338;369 213 | 212,137;208;209;211;213;252 214 | 213,137;208;209;211;212;252 215 | 214,50;109;209;252;275;321 216 | 215,88;137;208;210;216;267 217 | 216,90;107;210;215;242;379 218 | 217,15;48;66;97;191;304 219 | 218,62;248;285;317;373;375 220 | 219,128;192;333;452;477;482 221 | 220,15;63;93;99;184;454 222 | 221,99;287;311;326;490;499 223 | 222,13;67;185;242;271;287 224 | 223,70;81;105;161;329;491 225 | 224,12;61;66;288;454;494 226 | 225,124;159;198;292;387;456 227 | 226,68;94;123;363;407;450 228 | 227,112;195;268;288;346;442 229 | 228,52;60;233;424;480;488 230 | 229,141;255;444;480;485;488 231 | 230,31;87;197;359;409;414 232 | 231,144;150;204;366;367;495 233 | 232,148;357;371;376;409;467 234 | 233,45;68;156;174;251;416 235 | 234,176;190;293;295;315;492 236 | 235,71;72;75;76;87;140 237 | 236,22;114;246;273;347;360 238 | 237,51;78;88;197;303;485 239 | 238,63;247;278;361;378;405 240 | 239,174;180;258;263;320;344 241 | 240,48;97;101;164;288;365 242 | 241,92;154;303;340;368;417 243 | 242,103;183;222;224;278;334 244 | 243,284;303;390;470;474;498 245 | 244,64;247;249;310;343;378 246 | 245,62;317;336;343;347;375 247 | 246,23;96;186;422;472;481 248 | 247,211;238;244;361;369;405 249 | 248,13;32;96;187;401;437 250 | 249,64;244;340;368;378;470 251 | 250,56;118;196;207;269;453 252 | 251,30;157;179;279;402;420 253 | 252,208;209;210;212;213;214 254 | 253,39;40;76;349;355;466 255 | 254,8;113;271;316;461;462 256 | 255,168;229;256;316;383;444 257 | 256,114;133;245;255;404;406 258 | 257,25;98;100;108;120;342 259 | 258,286;320;343;344;470;493 260 | 259,152;195;282;332;403;404 261 | 
260,8;22;151;236;431;442 262 | 261,60;97;114;348;380;408 263 | 262,158;177;189;295;319;458 264 | 263,2;180;195;239;343;404 265 | 264,52;115;116;265;448;449 266 | 265,115;116;117;264;448;449 267 | 266,123;125;328;407;426;427 268 | 267,18;107;125;266;304;408 269 | 268,190;227;314;315;323;338 270 | 269,56;106;207;250;384;453 271 | 270,43;188;331;362;374;446 272 | 271,67;222;254;287;326;490 273 | 272,151;270;308;309;403;441 274 | 273,98;233;236;255;257;487 275 | 274,296;378;382;438;472;476 276 | 275,36;38;110;203;302;345 277 | 276,10;98;309;434;435;476 278 | 277,110;114;149;285;296;366 279 | 278,103;184;224;238;242;382 280 | 279,29;155;179;251;335;429 281 | 280,22;222;254;273;416;441 282 | 281,22;187;201;218;236;404 283 | 282,152;195;259;332;403;404 284 | 283,50;206;379;412;465;489 285 | 284,95;188;243;258;373;498 286 | 285,149;194;277;296;343;376 287 | 286,96;258;320;343;344;470 288 | 287,67;99;221;222;311;490 289 | 288,125;224;365;408;454;494 290 | 289,25;82;95;106;120;196 291 | 290,25;98;163;342;384;453 292 | 291,105;107;192;199;331;363 293 | 292,15;28;69;225;341;387 294 | 293,111;129;177;234;295;459 295 | 294,14;27;28;330;394;457 296 | 295,158;176;262;293;458;459 297 | 296,201;314;430;434;435;483 298 | 297,37;77;281;321;352;356 299 | 298,86;146;309;322;471;475 300 | 299,148;186;357;371;425;467 301 | 300,47;48;65;80;191;301 302 | 301,47;80;124;198;300;327 303 | 302,55;110;188;275;345;406 304 | 303,51;154;237;241;243;340 305 | 304,107;183;210;217;225;267 306 | 305,44;46;182;306;334;451 307 | 306,44;46;182;305;334;451 308 | 307,36;40;206;345;367;379 309 | 308,50;54;203;377;473;497 310 | 309,77;89;131;297;298;370 311 | 310,1;181;244;360;378;463 312 | 311,63;221;335;474;482;499 313 | 312,63;184;220;246;313;390 314 | 313,184;220;246;278;312;361 315 | 314,20;42;155;268;315;460 316 | 315,190;234;268;314;420;492 317 | 316,114;126;169;255;444;462 318 | 317,62;218;245;336;373;375 319 | 318,68;173;374;381;461;492 320 | 319,17;167;260;262;458;492 321 | 320,180;258;286;333;344;479 322 | 321,38;50;89;297;377;388 323 | 322,86;131;298;471;473;475 324 | 323,35;85;111;164;189;348 325 | 324,111;133;144;204;377;390 326 | 325,54;132;148;276;340;495 327 | 326,67;99;221;271;490;499 328 | 327,47;65;80;198;301;456 329 | 328,24;121;178;388;426;427 330 | 329,70;159;161;167;223;491 331 | 330,7;69;160;294;395;456 332 | 331,59;188;270;291;445;446 333 | 332,152;181;259;282;479;484 334 | 333,10;174;239;309;436;476 335 | 334,44;46;242;305;306;451 336 | 335,199;219;234;383;431;433 337 | 336,16;202;245;317;347;375 338 | 337,172;192;341;385;410;450 339 | 338,119;190;227;268;339;346 340 | 339,36;112;227;338;406;410 341 | 340,241;249;303;325;389;425 342 | 341,165;292;337;385;428;450 343 | 342,25;163;170;290;384;453 344 | 343,180;244;258;263;285;286 345 | 344,8;156;201;236;296;431 346 | 345,55;92;275;302;307;347 347 | 346,36;38;192;227;338;339 348 | 347,16;62;92;245;336;345 349 | 348,35;85;189;261;323;365 350 | 349,21;39;76;253;353;465 351 | 350,50;88;142;203;321;356 352 | 351,33;74;84;143;145;364 353 | 352,75;77;83;281;297;358 354 | 353,72;75;349;355;465;466 355 | 354,43;87;150;413;414;423 356 | 355,84;88;142;253;353;356 357 | 356,72;142;143;297;350;355 358 | 357,145;148;232;299;409;425 359 | 358,77;86;136;146;352;370 360 | 359,21;39;87;230;281;413 361 | 360,202;246;274;310;389;390 362 | 361,61;184;238;247;405;454 363 | 362,55;68;122;274;432;439 364 | 363,94;165;193;199;226;368 365 | 364,33;74;143;197;351;464 366 | 365,35;85;111;240;280;348 367 | 366,176;438;439;440;441;476 368 | 367,34;36;111;144;231;307 369 | 
368,21;199;249;362;363;374 370 | 369,61;103;112;247;374;405 371 | 370,54;131;136;146;352;358 372 | 371,143;151;232;299;376;464 373 | 372,25;104;123;165;428;450 374 | 373,41;55;154;284;317;375 375 | 374,9;105;112;173;318;368 376 | 375,202;218;245;317;336;373 377 | 376,132;149;285;371;467;493 378 | 377,109;248;308;324;473;497 379 | 378,63;64;238;244;249;310 380 | 379,90;134;206;283;307;489 381 | 380,19;151;274;314;383;432 382 | 381,66;68;94;161;318;454 383 | 382,103;161;200;246;278;362 384 | 383,90;100;108;168;255;316 385 | 384,56;163;269;290;342;388 386 | 385,337;341;397;428;450;452 387 | 386,95;118;161;207;418;446 388 | 387,12;49;53;124;225;292 389 | 388,96;201;401;433;438;477 390 | 389,162;193;340;360;425;469 391 | 390,231;243;274;324;360;479 392 | 391,28;53;392;394;399;448 393 | 392,201;219;270;274;438;439 394 | 393,160;394;395;456;457;486 395 | 394,28;69;160;294;391;399 396 | 395,6;27;330;393;447;449 397 | 396,7;19;91;171;400;486 398 | 397,17;60;172;173;191;385 399 | 398,30;155;179;335;402;468 400 | 399,53;117;391;392;394;447 401 | 400,7;19;30;91;171;396 402 | 401,4;27;29;53;117;124 403 | 402,6;29;69;155;157;398 404 | 403,152;195;259;263;282;404 405 | 404,256;259;263;282;398;485 406 | 405,37;61;247;280;361;369 407 | 406,90;114;133;256;261;339 408 | 407,16;104;123;226;266;372 409 | 408,212;214;312;441;481;496 410 | 409,230;232;357;414;422;425 411 | 410,11;139;192;337;339;346 412 | 411,148;149;194;232;422;467 413 | 412,175;283;416;419;421;465 414 | 413,31;43;153;354;359;423 415 | 414,31;153;230;354;409;423 416 | 415,11;185;419;420;421;424 417 | 416,92;175;193;283;412;417 418 | 417,51;92;193;241;367;416 419 | 418,57;59;185;207;386;478 420 | 419,11;175;412;415;421;424 421 | 420,14;169;251;315;429;500 422 | 421,11;175;412;415;419;424 423 | 422,31;230;409;411;419;420 424 | 423,84;87;153;354;413;414 425 | 424,11;175;189;228;415;421 426 | 425,299;340;357;389;409;467 427 | 426,24;121;178;266;328;427 428 | 427,24;121;178;266;328;426 429 | 428,113;341;372;385;450;452 430 | 429,134;152;279;282;420;463 431 | 430,218;378;411;443;483;496 432 | 431,214;314;431;434;435;476 433 | 432,56;58;219;435;436;443 434 | 433,32;45;211;236;443;496 435 | 434,216;277;291;433;437;440 436 | 435,216;313;430;434;440;477 437 | 436,187;411;430;481;487;496 438 | 437,122;313;429;432;434;436 439 | 438,109;186;187;422;432;439 440 | 439,174;216;311;430;432;437 441 | 440,13;274;431;433;438;440 442 | 441,5;45;239;436;438;468 443 | 442,8;23;176;218;312;437 444 | 443,64;141;215;251;382;435 445 | 444,108;138;169;229;316;393 446 | 445,9;101;102;164;272;446 447 | 446,34;107;164;270;331;445 448 | 447,13;212;439;442;443;483 449 | 448,52;115;264;391;399;447 450 | 449,6;115;264;265;395;447 451 | 450,165;337;341;372;385;428 452 | 451,44;46;182;305;306;334 453 | 452,49;60;233;385;428;486 454 | 453,163;170;250;269;290;342 455 | 454,112;125;224;288;369;381 456 | 455,126;129;144;150;177;460 457 | 456,225;292;327;330;393;486 458 | 457,14;138;160;169;294;393 459 | 458,158;176;262;295;319;492 460 | 459,20;176;177;293;295;460 461 | 460,20;129;177;455;459;468 462 | 461,49;66;113;318;329;462 463 | 462,113;117;166;254;316;461 464 | 463,1;34;150;189;310;429 465 | 464,33;78;197;231;364;371 466 | 465,76;153;175;283;349;353 467 | 466,37;39;40;89;253;353 468 | 467,148;232;299;325;376;425 469 | 468,155;179;335;460;474;476 470 | 469,26;127;128;157;162;389 471 | 470,64;132;194;243;258;286 472 | 471,54;136;146;298;322;475 473 | 472,156;174;219;239;382;443 474 | 473,54;276;281;308;377;497 475 | 474,51;126;204;243;366;468 476 | 475,136;146;147;298;471;497 477 | 
476,20;204;398;459;468;474 478 | 477,219;333;479;482;484;487 479 | 478,13;59;118;185;207;418 480 | 479,332;333;390;477;482;484 481 | 480,181;228;229;233;452;488 482 | 481,273;333;360;477;487;496 483 | 482,67;311;320;477;479;484 484 | 483,5;115;116;117;135;265 485 | 484,228;320;332;479;482;488 486 | 485,51;128;180;229;237;280 487 | 486,7;90;233;393;396;456 488 | 487,32;45;128;302;430;433 489 | 488,141;181;228;229;480;484 490 | 489,23;214;251;312;433;442 491 | 490,99;221;271;287;326;499 492 | 491,70;79;81;223;329;461 493 | 492,27;159;260;318;319;458 494 | 493,22;122;215;216;302;481 495 | 494,61;93;125;154;224;288 496 | 495,149;204;231;276;325;366 497 | 496,179;273;335;360;481;487 498 | 497,54;281;308;377;473;475 499 | 498,43;110;126;150;243;455 500 | 499,67;99;221;311;326;490 501 | 500,11;18;26;48;191;420 502 | --------------------------------------------------------------------------------