├── .gitignore ├── .idea ├── .gitignore ├── vcs.xml ├── sqldialects.xml ├── other.xml ├── modules.xml ├── misc.xml ├── libraries │ ├── Maven__com_google_code_gson_gson_2_8_0.xml │ ├── Maven__org_projectlombok_lombok_1_18_8.xml │ └── Maven__org_apache_commons_commons_lang3_3_4.xml ├── compiler.xml ├── dataSources.xml ├── jarRepositories.xml └── uiDesigner.xml ├── src ├── model │ ├── available_model.py │ ├── nn.py │ ├── regression.py │ └── classification.py ├── eutilities │ ├── customized_print.py │ ├── metric.py │ ├── MAGdata │ │ ├── parse_absract_from_mag_kg.py │ │ └── parse_fos_from_mag_kg.py │ ├── preprocessor.py │ ├── name │ │ ├── name_parser.py │ │ ├── name_parser_by_socket.py │ │ └── name_parser_by_localscript.py │ ├── train_utils.py │ └── string_utils.py ├── myconfig.py ├── comparison │ ├── block │ │ ├── batch_runner.sh │ │ ├── clustering_metrics_MAG_AID.py │ │ └── clustering_metrics_other_baselines.py │ └── pairwise │ │ └── classification_metrics.py ├── statistics │ ├── orcid_doi_number_each_year.py │ └── last_name_variation_considering_transliterating.py ├── feature │ ├── cluster │ │ ├── sparse_tfidf_feature.py │ │ ├── doc2vec_feature.py │ │ ├── fast_feature.py │ │ └── network_feature.py │ ├── doc2vec_trainer.py │ ├── pairwise │ │ └── our_dataset_to_feature.py │ └── simple_matching_network_trainer_evaluator.py └── datacheck │ └── checking_multi_facets.py ├── pubmed-paper-author-link.iml ├── dataset ├── DBLP-CiteSeerX │ └── check.py ├── PubMed-GS │ └── check.py ├── dataset-urls.txt ├── DBLP-GESIS │ └── check.py ├── REXA-Culotta │ └── check.py ├── Aminer-Simple │ └── check.py ├── Aminer-Rich │ └── check.py ├── SCAD-zbMATH-Muller │ └── check.py ├── DBLP-Kim │ └── check.py ├── PubMed-Kim │ ├── check.py │ └── Kim_Authority_ORCID_linkage_dataset.sql ├── BDBComp-Cota │ └── check.py ├── DBLP-Qian │ └── check.py ├── Aminer-Zhang │ └── check.py └── Aminer-WhoisWho │ └── to_table_and_check.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /and/target/ 2 | ./.idea -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml 3 | 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/sqldialects.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/model/available_model.py: 
-------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ModelName(Enum): 5 | linear = 'Linear' 6 | logistic = 'Logistic' 7 | dt = 'DecisionTree' 8 | randomforest = 'RandomForest' 9 | 10 | @classmethod 11 | def available_modes(self): 12 | return [self.randomforest] 13 | 14 | @classmethod 15 | def get_short_name(self, model_name): 16 | return \ 17 | dict(zip( 18 | [self.linear, self.logistic, self.dt, self.randomforest], 19 | ['Linear', 'LR', 'DecisionTree', 'RF']))[model_name] 20 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_code_gson_gson_2_8_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_projectlombok_lombok_1_18_8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_lang3_3_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /pubmed-paper-author-link.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 14 | -------------------------------------------------------------------------------- /dataset/DBLP-CiteSeerX/check.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ds_dir = './nameset_author-disamb' 4 | ds = [n for n in os.listdir(ds_dir)] 5 | print('num_block_size: %d' % len(ds)) 6 | 7 | num_citation = 0 8 | num_author_group = 0 9 | for n in ds: 10 | fn = os.path.join(ds_dir, n) 11 | author_idx_arr = [] 12 | for line in open(fn, encoding='iso8859-1'): 13 | author_idx_citation_idx = line[:line.index(' ')] 14 | num_citation += 1 15 | author_idx, citation_idx = author_idx_citation_idx.split('_') 16 | author_idx_arr.append(author_idx) 17 | num_author_group += len(set(author_idx_arr)) 18 | 19 | print('num_author_group_size: %d' % num_author_group) 20 | print('num_citation: %d' % num_citation) 21 | -------------------------------------------------------------------------------- /dataset/PubMed-GS/check.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df = pd.read_csv('./1900_pairs_train_test.csv', sep=';') 4 | last_name_variation_cases = 0 5 | for i, (ln1, ln2) in df[['Last_name1', 'Last_name2']].iterrows(): 6 | if str(ln1).lower() != str(ln2).lower(): 7 | last_name_variation_cases += 1 8 | print(ln1, ln2) 9 | 10 | print('last_name_variation_cases: %d' % last_name_variation_cases) 11 | 12 | num_paired_records = df.shape[0] 13 | print('num_paired_records: %d' % num_paired_records) 14 | 15 | pmid_arr = list(df['PMID1'].values) + list(df['PMID2'].values) 16 | num_citation = len(set(pmid_arr)) 17 | # print('num_author_group: %d' % num_author_group) 18 | print('num_citation: %d' % num_citation) 19 | 20 | -------------------------------------------------------------------------------- /dataset/dataset-urls.txt: 
-------------------------------------------------------------------------------- 1 | DBLP-Muller https://github.com/yaya213/DBLP-Name-Disambiguation-Dataset 2 | DBLP-CiteSeerX http://clgiles.ist.psu.edu/data/nameset_author-disamb.tar.zip 3 | DBLP-KIM(PENN) https://doi.org/10.6084/m9.figshare.6840281.v2 4 | KISTI http://www.lbd.dcc.ufmg.br/lbd/collections/disambiguation/DBLP.tar.gz/at_download/file 5 | 6 | Aminer: 7 | rich: http://arnetminer.org/lab-datasets/disambiguation/rich-author-disambiguation-data.zip or 8 | rich: https://lfs.aminer.cn/lab-datasets/disambiguation/rich-author-disambiguation-data.zip 9 | and 10 | simple: https://lfs.aminer.cn/lab-datasets/disambiguation/author-disambiguation-data.zip 11 | 12 | Aminer-ZHANG: https://static.aminer.cn/misc/na-data-kdd18.zip 13 | 14 | -------------------------------------------------------------------------------- /dataset/DBLP-GESIS/check.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pandas as pd 4 | 5 | names = pd.read_csv('Dataset/sigir/Gold Dataset/disambiguatedNames.csv', sep=';', encoding='iso8859-1') 6 | print(names.head()) 7 | pubs = pd.read_csv('Dataset/sigir/Gold Dataset/goldstandardPublications.csv', sep=';', encoding='iso8859-1') 8 | print(pubs.head()) 9 | 10 | author_names = names['name'].apply(lambda x: ' '.join(x.split(' ')[:-1])).values 11 | 12 | counter = Counter(author_names) 13 | print(counter) 14 | 15 | print('num_block: %d' % len(set(author_names))) 16 | num_citation = pubs.shape[0] 17 | num_author_group = len(set(names['fk_authorid'].values)) 18 | print('num_author_group: %d' % num_author_group) 19 | print('num_citation: %d' % num_citation) 20 | -------------------------------------------------------------------------------- /src/eutilities/customized_print.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | pd.set_option('display.unicode.ambiguous_as_wide', True) # align column names (ambiguous-width characters) 4 | pd.set_option('display.unicode.east_asian_width', True) # align column names (East Asian characters) 5 | pd.set_option('display.max_rows', None) # show all rows 6 | pd.set_option('display.max_columns', None) # show all columns 7 | pd.set_option('expand_frame_repr', False) # do not wrap wide frames 8 | 9 | 10 | def pprint(kv: list, decimal=2, pctg=False, sep=None): 11 | k = [item[0] for item in kv] 12 | if pctg: 13 | v = [round(item[1] * 100.0, decimal) for item in kv] 14 | else: 15 | v = [round(item[1], decimal) for item in kv] 16 | if not sep: 17 | df = pd.DataFrame(data=[v], columns=k) 18 | print(df.head()) 19 | else: 20 | print(sep.join([str(s) for s in v])) 21 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /dataset/REXA-Culotta/check.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | base_path = './rexa_author_coref/rexa' 4 | blocks = [n for n in os.listdir(base_path)] 5 | 6 | num_blocks = 0 7 | num_author_group = 0 8 | num_citation = 0 9 | for n in blocks: 10 | path = os.path.join(base_path, n) 11 | if os.path.isfile(path): 12 | continue 13 | num_blocks += 1 14 | block_authors = [n for n in os.listdir(path)] 15 | num_author_group += len(block_authors) 16 | for m in block_authors: 17 | path1 = os.path.join(path, m)
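# within a block, each author entry is a directory whose files are that author's citations; plain files at this level are skipped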
18 | if os.path.isfile(path1): 19 | continue 20 | citations = [n for n in os.listdir(path1)] 21 | num_citation += len(citations) 22 | 23 | print('num_block: %d' % num_blocks) 24 | print('num_author_group: %d' % num_author_group) 25 | print('num_citation: %d' % num_citation) 26 | -------------------------------------------------------------------------------- /dataset/Aminer-Simple/check.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | base_path = './author-disambiguation-data/data/Answer' 4 | blocks = [n for n in os.listdir(base_path)] 5 | 6 | num_blocks = 0 7 | num_author_group = 0 8 | citations = [] 9 | for n in blocks: 10 | path = os.path.join(base_path, n) 11 | if not os.path.isfile(path): 12 | continue 13 | num_blocks += 1 14 | for line in open(path): 15 | if ':' not in line: 16 | continue 17 | id = line[:line.index(':')] 18 | papers = [m.strip() for m in line[line.index(':') + 1:].split(' ') if len(m.strip()) > 0] 19 | print(id, papers) 20 | num_author_group += 1 21 | citations.extend(papers) 22 | 23 | num_citation = len(set(citations)) 24 | print('num_block: %d' % num_blocks) 25 | print('num_author_group: %d' % num_author_group) 26 | print('num_citation: %d' % num_citation) 27 | -------------------------------------------------------------------------------- /dataset/Aminer-Rich/check.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | base_path = './rich-author-disambiguation-data/experimental-results' 4 | blocks = [n for n in os.listdir(base_path)] 5 | 6 | num_blocks = 0 7 | num_author_group = 0 8 | citations = [] 9 | for n in blocks: 10 | path = os.path.join(base_path, n) 11 | if not os.path.isfile(path) or 'classify' not in n: 12 | continue 13 | 14 | num_blocks += 1 15 | for line in open(path): 16 | if ':' not in line: 17 | continue 18 | id = line[:line.index(':')] 19 | papers = [m.strip() for m in line[line.index(':') + 1:].split(' ') if len(m.strip()) > 0] 20 | print(id, papers) 21 | num_author_group += 1 22 | citations.extend(papers) 23 | 24 | num_citation = len(set(citations)) 25 | print('num_block: %d' % num_blocks) 26 | print('num_author_group: %d' % num_author_group) 27 | print('num_citation: %d' % num_citation) 28 | -------------------------------------------------------------------------------- /dataset/SCAD-zbMATH-Muller/check.py: -------------------------------------------------------------------------------- 1 | import xmltodict 2 | 3 | data = open('SCAD-zbMATH/scad-zbmath-01-open-access.xml').read() 4 | data = xmltodict.parse(data) 5 | data_instance = [] 6 | for n in data['publications']['publication']: 7 | title = n['title'] 8 | authors = n['authors']['author'] 9 | if type(authors) != list: 10 | authors = [authors] 11 | # print(authors) 12 | for au in authors: 13 | name, shortname, id = au['@name'], au['@shortname'], au['@id'], 14 | print(name, shortname, id, title) 15 | last_name_first_initial = shortname 16 | data_instance.append([name, last_name_first_initial, id, title]) 17 | 18 | num_blocks = len(set([n[1] for n in data_instance])) 19 | num_author_group = len(set([n[2] for n in data_instance])) 20 | num_citation = len(data_instance) 21 | 22 | print('num_block: %d' % num_blocks) 23 | print('num_author_group: %d' % num_author_group) 24 | print('num_citation: %d' % num_citation) 25 | -------------------------------------------------------------------------------- /.idea/dataSources.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | clickhouse 6 | true 7 | ru.yandex.clickhouse.ClickHouseDriver 8 | jdbc:clickhouse://localhost:8124 9 | 10 | 11 | 12 | 13 | 14 | clickhouse 15 | true 16 | ru.yandex.clickhouse.ClickHouseDriver 17 | jdbc:clickhouse://202.114.70.54:8123 18 | 19 | 20 | -------------------------------------------------------------------------------- /dataset/DBLP-Kim/check.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pandas as pd 4 | 5 | # author name: full name string extracted from DBLP 6 | # unique author id: labels assigned manually by Dr. C. Lee Giles's team 7 | # paper id: assigned by Dr. Jinseok Kim 8 | # author list: names of authors in the byline of the paper 9 | # year: publication year 10 | # venue: conference or journal names 11 | # title: stopwords removed and stemmed by the Porter's stemmer 12 | 13 | df = pd.read_csv('./DBLP_labeled_data.txt', sep='\t', 14 | names=['author name', 'unique author id', 'paper id', 'author list', 'year', 'venue', 'title', 'null'], index_col=None) 15 | print(df.head()) 16 | 17 | 18 | author_names = df['unique author id'].apply(lambda x: ' '.join(x.split('-')[:-1])).values 19 | 20 | counter = Counter(author_names) 21 | print(counter) 22 | 23 | print('num_block: %d' % len(set(author_names))) 24 | 25 | num_citation = len(set(df['paper id'].values)) 26 | num_author_group = len(set(df['unique author id'].values)) 27 | print('num_author_group: %d' % num_author_group) 28 | print('num_citation: %d' % num_citation) 29 | -------------------------------------------------------------------------------- /src/myconfig.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | gpu_id = 0 5 | device = "cuda:%d" % gpu_id 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--which_model", type=int, default=5) 9 | cli_args = parser.parse_args() 10 | best_hac_clustering_parameters = [0.45, 0.25, 0.2, 0.2, 0.25, 0.2] 11 | # best_hac_clustering_parameters = [None, None, None, None, None, None] 12 | tuned_best_cluster_setting = best_hac_clustering_parameters[cli_args.which_model] 13 | 14 | # resource config 15 | latex_doc_base_dir = '/home/zhangli/ssd-1t/repo/manuscripts/ongoning-works/and-dataset/src/' 16 | src_base_path = os.path.dirname(os.path.abspath(__file__)) 17 | cached_dir = os.path.join(src_base_path, 'cached') 18 | 19 | pretrained_model_path = proj_base_path = os.path.abspath('/home/zhangli/pre-trained-models/') 20 | glove6b_path = os.path.join(pretrained_model_path, 'glove.6B/') 21 | glove840b300d_path = os.path.join(pretrained_model_path, 'glove.840B/') 22 | fasttextcrawl300d2m_path = os.path.join(pretrained_model_path, 'fastText/crawl-300d-2M.vec') 23 | infersent_based_path = os.path.join(pretrained_model_path, 'infersent') 24 | -------------------------------------------------------------------------------- /src/comparison/block/batch_runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #conda activate rapids-21.06 3 | #which python 4 | #python clustering_metrics_other_baselines.py 5 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=0 >r1_trimmed_model_trimmed_dataset0.log 2>1& 6 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u 
clustering_metrics_other_baselines.py --which_model=1 >r1_trimmed_model_trimmed_dataset1.log 2>1& 7 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=2 >r1_trimmed_model_trimmed_dataset2.log 2>1& 8 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=3 >r1_trimmed_model_trimmed_dataset3.log 2>1& 9 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=4 >r1_trimmed_model_trimmed_dataset4.log 2>1& 10 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=5 >r1_trimmed_model_trimmed_dataset5.log 2>1& -------------------------------------------------------------------------------- /src/eutilities/metric.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score 2 | 3 | metric_names = ['acc', 'p', 'r', 'f1', 'macro_f1', 'macro_weighted_f1', 'micro_f1', 'auc'] 4 | 5 | 6 | def calc_metrics(test_y, pred_y, average='macro_f1', search_cut_off=True): 7 | prob = 0.5 8 | pred_y_label = [1 if i > prob else 0 for i in pred_y] 9 | 10 | acc = accuracy_score(test_y, pred_y_label) 11 | p = precision_score(test_y, pred_y_label) 12 | r = recall_score(test_y, pred_y_label) 13 | 14 | macro_f1 = f1_score(test_y, pred_y_label, average='macro') 15 | macro_weighted_f1 = f1_score(test_y, pred_y_label, average='weighted') 16 | micro_f1 = f1_score(test_y, pred_y_label, average='micro') 17 | 18 | pos_label_f1 = f1_score(test_y, pred_y_label, average='binary') 19 | rocauc = roc_auc_score(y_true=test_y, y_score=pred_y) 20 | # neg_label_f1 = f1_score(test_y, pred_y_label, pos_label=0, average='binary') 21 | # print(confusion_matrix(test_y, pred_y_label)) 22 | return dict( 23 | zip(metric_names, 24 | [acc, p, r, pos_label_f1, macro_f1, macro_weighted_f1, micro_f1, rocauc])) 25 | -------------------------------------------------------------------------------- /.idea/jarRepositories.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 9 | 10 | 14 | 15 | 19 | 20 | 24 | 25 | -------------------------------------------------------------------------------- /dataset/PubMed-Kim/check.py: -------------------------------------------------------------------------------- 1 | from unidecode import unidecode 2 | 3 | from myio.data_reader import DBReader 4 | 5 | df = DBReader.tcp_model_cached_read("cached/YYYY", 6 | """select PMID, 7 | trimBoth(splitByString(',', MEDLINE_Name)[1]) as medline_lastname, 8 | splitByChar('_', AINI)[1] as block_lastname 9 | from and_ds.AUT_ORC 10 | where medline_lastname != block_lastname""", 11 | cached=False) 12 | df['medline_lastname_parsed'] = df['medline_lastname'].apply(unidecode) 13 | df['block_lastname_parsed'] = df['block_lastname'].apply(unidecode) 14 | 15 | all = df.shape[0] 16 | cnt = 0 17 | for i, (pmid, medline_lastname, medline_lastname_parsed, block_lastname, block_lastname_parsed) in df[ 18 | ['PMID', 'medline_lastname', 'medline_lastname_parsed', 'block_lastname', 'block_lastname_parsed']].iterrows(): 19 | # medline_lastname_parsed = medline_lastname_parsed.lower().replace('-', '').replace(' ', '').replace('\'', '').replace('?', '') 20 | medline_lastname_parsed = ''.join([n for n in medline_lastname_parsed.lower() if n not 
in ('-',' ','\'', '?')]) 21 | block_lastname_parsed = block_lastname_parsed.lower() 22 | if medline_lastname_parsed != block_lastname_parsed: 23 | print(pmid, medline_lastname_parsed, block_lastname_parsed) 24 | cnt += 1 25 | 26 | print(cnt, all, cnt / all) 27 | -------------------------------------------------------------------------------- /dataset/BDBComp-Cota/check.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pandas as pd 4 | 5 | # font_bdbcomp.txt and title_bdbcomp.txt. 6 | # 7 | # The records of the file font_bdbcomp.txt are in the format of: 8 | # citationId<>clusterId_sequential<>coauthor:coauthor:...:coauthor<>publicationVenueTitle<>author 9 | # 10 | # The records of the file title_bdbcomp.txt are in the format of: 11 | # citationId<>workTitle 12 | 13 | name = pd.read_csv('./bdbcomp/font_bdbcomp.txt', sep='<>', 14 | names=['citationId', 'clusterId_sequential', 'authorList', 'publicationVenueTitle', 'author', 15 | 'null']) 16 | print(name.head()) 17 | pub = pd.read_csv('./bdbcomp/title_bdbcomp.txt', sep='<>', names=['citationId', 'workTitle'], error_bad_lines=None) 18 | print(pub.head()) 19 | print(pub.shape) 20 | 21 | author_names = name['author'].apply(lambda x: '_'.join([ 22 | x.split(' ')[-1], # last name 23 | x.split(' ')[0][0] # first initial 24 | ])).values 25 | 26 | counter = Counter(author_names) 27 | print(counter) 28 | 29 | 30 | author_group_idx = [int(n) for n in set(name['clusterId_sequential'].apply(lambda x: str(x)[:str(x).index('_')]).values)] 31 | for n in range(214): 32 | if n not in author_group_idx: 33 | print(n) 34 | print(sorted(author_group_idx)) 35 | num_author_group = len(author_group_idx) 36 | print('num_block: %d' % len(set(author_names))) 37 | num_citation = len(set(name['citationId'].values)) 38 | print('num_author_group: %d' % num_author_group) 39 | print('num_citation: %d' % num_citation) 40 | -------------------------------------------------------------------------------- /dataset/DBLP-Qian/check.py: -------------------------------------------------------------------------------- 1 | ds_fn = './DBLP name disambiguation dataset' 2 | lines = [line for line in open(ds_fn)] 3 | header = lines[0] 4 | print('headers: %s' % header) 5 | lines = ''.join(lines[1:]) 6 | 7 | blocks = lines.split('\n\n') 8 | 9 | print('num_block: %d' % len(blocks)) 10 | num_citation = 0 11 | num_author_group = 0 12 | for n in blocks: 13 | if '\n' not in n: 14 | # print(n) 15 | continue 16 | block_name = n[:n.index('\n')] 17 | fields = n[n.index('\n') + 1:].split('\n') 18 | # assert len(fields) % 9 ==0 19 | # num_names = int(len(fields) / 9) 20 | author_idx_arr = [] 21 | 22 | for m in fields: 23 | try: 24 | if '\t' in m: 25 | author_idx = int(m[:m.index('\t')]) 26 | author_idx_arr.append(author_idx) 27 | except Exception as e: 28 | # print(e) 29 | pass 30 | num_author_group += len(set(author_idx_arr)) 31 | num_citation += len(author_idx_arr) 32 | 33 | # 34 | # num_citation = 0 35 | # num_author_group = 0 36 | # for n in ds: 37 | # fn = os.path.join(ds_dir, n) 38 | # author_idx_arr = [] 39 | # for line in open(fn, encoding='iso8859-1'): 40 | # author_idx_citation_idx = line[:line.index(' ')] 41 | # num_citation += 1 42 | # author_idx, citation_idx = author_idx_citation_idx.split('_') 43 | # author_idx_arr.append(author_idx) 44 | # num_author_group += len(set(author_idx_arr)) 45 | 46 | print('num_author_group: %d' % num_author_group) 47 | print('num_citation: %d' % num_citation) 48 | 
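# note: each block in the raw file begins with a header line holding the shared name; the records that follow are tab-separated, with the author index as the first field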
-------------------------------------------------------------------------------- /src/eutilities/MAGdata/parse_absract_from_mag_kg.py: -------------------------------------------------------------------------------- 1 | # download MAG KG from https://zenodo.org/record/3930398#.X9YvjnYzY5ll 2 | import traceback 3 | 4 | temp_line = '' 5 | closed = True 6 | 7 | file_name = 'PaperAbstractsInvertedIndex.nt' 8 | orcid_mag_matched_paper_id = set([line.strip() for line in open('orcid_mag_matched_paper_id.txt')]) 9 | all_need_matched = len(orcid_mag_matched_paper_id) 10 | print(all_need_matched) 11 | matched_cnt = 0 12 | fw = open('orcid_mag_matched_paper_abstract.txt', 'w') 13 | for line in open(file_name): 14 | line = line.strip() 15 | temp_line = temp_line + line + ' ' 16 | if line.endswith('string> .'): 17 | # if 'string> .' in line: 18 | closed = True 19 | else: 20 | closed = False 21 | 22 | if closed: 23 | # print(temp_line) 24 | try: 25 | front = temp_line[:temp_line.index('terms/abstract>') + 16] 26 | back = temp_line[temp_line.index('terms/abstract>') + 16: temp_line.index('^^')] 27 | pid = front[front.index('entity/') + 7:front.index('> <')].strip() 28 | abstract = back 29 | # print(pid, abstract) 30 | if pid in orcid_mag_matched_paper_id: 31 | matched_cnt += 1 32 | if matched_cnt % 10000 == 0: 33 | print('matched_cnt: ', matched_cnt, matched_cnt * 100.0 / all_need_matched) 34 | fw.write(pid + '\t' + abstract + '\n') 35 | # print(temp_line) 36 | except: 37 | traceback.print_exc() 38 | # print('-' * 100) 39 | temp_line = '' 40 | -------------------------------------------------------------------------------- /src/eutilities/preprocessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 4 | 5 | 6 | def drop_missing_items(df): 7 | df = df.dropna(how='any') 8 | return df 9 | 10 | 11 | def down_sample(df, percent=1): 12 | ''' 13 | percent: ratio of the down-sampled majority-class size to the number of minority-class samples 14 | ''' 15 | neg_samples_num = df[df['same_author'] == 0].shape[0] 16 | pos_samples_num = df[df['same_author'] == 1].shape[0] 17 | if neg_samples_num < pos_samples_num: 18 | data1 = df[df['same_author'] == 0] # negatives are the minority class here; keep them in data1 19 | data0 = df[df['same_author'] == 1] # positives are the majority class here; put them in data0 20 | else: 21 | data0 = df[df['same_author'] == 0] # negatives are the majority class here; put them in data0 22 | data1 = df[df['same_author'] == 1] # positives are the minority class here; keep them in data1 23 | index = np.random.randint( 24 | len(data0), size=percent * (len(df) - len(data0))) # randomly draw the indices of the majority-class samples to keep 25 | lower_data1 = data0.iloc[list(index)] # the down-sampled majority class 26 | # print(lower_data1.shape) 27 | # print(data1.shape) 28 | return (pd.concat([lower_data1, data1])) 29 | 30 | 31 | def scale(df): 32 | mm_scaler = MinMaxScaler() 33 | df = mm_scaler.fit_transform(df) 34 | std_scaler = StandardScaler() 35 | df = std_scaler.fit_transform(df) 36 | return df 37 | 38 | 39 | def select_features(): 40 | # #SelectKBest (chi-squared) 41 | # ch2 = SelectKBest(chi2,k=3)  # in this case, SelectKBest picks 3 of the 4 original feature attributes 42 | # x_train = ch2.fit_transform(x_train, y_train)  # fit and transform 43 | # select_name_index = ch2.get_support(indices=True) 44 | # print("the three feature attributes with the greatest influence on the class decision are:", ch2.get_support(indices=False)) 45 | # print(select_name_index) 46 | pass 47 | 48 | 49 | def preprocess(df): 50 | print('original shape: ', df.shape) 51 | df = drop_missing_items(df) 52 | print('after dropping shape: ', df.shape) 53 | df = scale(df) 54 | print('after scaling shape: ', df.shape) 55 | df = down_sample(df) 56 | print('after sampling shape: ', 
df.shape) 57 | return df 58 | -------------------------------------------------------------------------------- /src/model/nn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class MatchGRU(nn.Module): 6 | def __init__(self, glove, hidden_dim=64, num_layers=2, num_hand_craft_feature=5, bidirectional=True, output_dim=1): 7 | super(MatchGRU, self).__init__() 8 | embedding_dim = len(glove.vectors[0]) 9 | self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True) 10 | self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, 11 | batch_first=True, dropout=0.5, bidirectional=bidirectional) 12 | # self.match_fc = nn.Linear(2 * num_layers * hidden_dim * (2 if bidirectional else 1), 5) 13 | self.match_fc = nn.Sequential( 14 | nn.Linear(2 * num_layers * hidden_dim * (2 if bidirectional else 1), 128), 15 | nn.ReLU(), 16 | nn.BatchNorm1d(128), 17 | nn.Dropout(p=0.5), 18 | 19 | nn.Linear(128, 64), 20 | nn.ReLU(), 21 | nn.BatchNorm1d(64), 22 | nn.Dropout(p=0.5), 23 | 24 | nn.Linear(64, 64), 25 | nn.ReLU(), 26 | nn.Linear(64, 16), 27 | nn.ReLU(), 28 | nn.Linear(16, 1), 29 | ) 30 | 31 | def forward(self, input): 32 | XL, XR = input 33 | 34 | # output: [batch-size, Sequence-len, embedding-dim] 35 | XL = self.embedding(XL) 36 | XL, hl = self.gru(XL) 37 | # print(XL.shape, hl.shape) 38 | hl = torch.cat([hl[i] for i in range(len(hl))], dim=1) 39 | # print(hl.shape) 40 | 41 | # output: [batch-size, Sequence-len, embedding-dim] 42 | XR = self.embedding(XR) 43 | XR, hr = self.gru(XR) 44 | hr = torch.cat([hr[i] for i in range(len(hr))], dim=1) 45 | 46 | res = torch.cat([hl, hr], dim=1) 47 | res = self.match_fc(res) 48 | 49 | # convert to 0-1 possibility distribution 50 | # res = torch.softmax(res, dim=1) 51 | 52 | # add hand-craft features 53 | # res = torch.cat([HF, res], dim=1) 54 | # res = self.ml_hidden_fc(res) 55 | # print(res.shape) 56 | return res 57 | -------------------------------------------------------------------------------- /src/statistics/orcid_doi_number_each_year.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import seaborn as sb 4 | 5 | from myconfig import cached_dir, latex_doc_base_dir 6 | 7 | # sb.set_style("darkgrid") 8 | custom_params = {"axes.spines.right": False, "axes.spines.top": False} 9 | sb.set_theme(style="ticks", rc=custom_params) 10 | 11 | from matplotlib import pyplot as plot 12 | 13 | plot.rcParams['font.family'] = 'serif' 14 | plot.rcParams['font.serif'] = ['Times New Roman'] + plot.rcParams['font.serif'] 15 | 16 | from mytookit.data_reader import DBReader 17 | 18 | colors = ['green', 'red', 'gold', 'black', 'cyan', 'blue', 'magenta', 'purple', 'gray', 'fuchsia', 'orange', 'yellow'] 19 | linestyles = ['--', '-.', '--', '--'] 20 | line_markers = ['<', '>', '^', 'v'] 21 | linewidth = 5 22 | 23 | df_orcid = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "num_orcid_each_year.pkl"), 24 | "", cached=True) 25 | print('df_orcid: ', df_orcid.values) 26 | 27 | df_doi = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "num_doi_each_year.pkl"), "", cached=True) 28 | 29 | print('df_doi: ', df_doi.values) 30 | 31 | plot.figure() 32 | # plot.grid(which='major', axis='y') 33 | idx = 0 34 | plot.plot(df_doi.values[:, 0].astype('int'), df_doi.values[:, 1], linestyle=linestyles[idx], 35 | # marker=line_markers[idx], markersize=8, markevery=0.2, 36 | color=colors[idx], label='DOI', 
linewidth=linewidth) 37 | 38 | idx = 1 39 | plot.plot(df_orcid.values[:, 0].astype('int'), df_orcid.values[:, 1], linestyle=linestyles[idx], 40 | # marker=line_markers[idx], markersize=8, markevery=0.2, 41 | color=colors[idx], label='ORCID', linewidth=linewidth) 42 | 43 | # plot.yscale('log') 44 | # plot.title('num of instance each year') 45 | # plot.xlabel('year', loc='right') 46 | 47 | plot.ylabel('# Records', loc='center', fontsize=18) # 'top' 48 | plot.legend(loc='best') # 'lower right' 49 | plot.xticks(fontsize=18) 50 | plot.yticks(fontsize=18) 51 | 52 | plot.tight_layout() 53 | plot.savefig(os.path.join(cached_dir, 'doi_orcid_each_year.png'), dpi=600) 54 | plot.savefig(os.path.join(latex_doc_base_dir, 'figs/doi_orcid_each_year.png'), dpi=600) 55 | 56 | plot.show() 57 | -------------------------------------------------------------------------------- /src/comparison/block/clustering_metrics_MAG_AID.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from beard import metrics 5 | from mytookit.data_reader import DBReader 6 | from tqdm import tqdm 7 | 8 | from myconfig import cached_dir 9 | 10 | minimal_uniq_author_in_block = 0 11 | 12 | df_blocks = DBReader.tcp_model_cached_read(cached_file_path='xxx', 13 | sql=r'''select block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock from and_ds.our_and_dataset_block_test_set_mag_prediction;''', 14 | cached=False) 15 | 16 | num_instances = len(df_blocks) 17 | df_blocks = df_blocks[df_blocks['num_unique_author_inblock'] > minimal_uniq_author_in_block] 18 | num_instances1 = len(df_blocks) 19 | print('removed %d instances, enable each block containing more than one unique authors' % (num_instances - num_instances1)) 20 | 21 | print(df_blocks.shape) 22 | 23 | 24 | # Note ############################################################################################# 25 | # Note test the performance of MAG author identifier 26 | # Note the clustering evaluation can not provide the Random baseline because it can not generate the ``labels_pred`` 27 | 28 | def data_precision_round(arr, precision=2, pctg=True): 29 | return [round(x * 100 if pctg else x, precision) for x in arr] 30 | 31 | 32 | mag_metrics = [] 33 | for index, row in tqdm(df_blocks.iterrows(), total=df_blocks.shape[0]): 34 | block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock = row 35 | # print('block-size: %d' % len(pm_aos)) 36 | 37 | # note calculate the paired-F1 and the B3-F1 score 38 | mag_metrics_b3 = metrics.b3_precision_recall_fscore(labels_true=ground_truths, labels_pred=mag_preds) 39 | mag_metrics_pairedf = metrics.paired_precision_recall_fscore(labels_true=ground_truths, labels_pred=mag_preds) 40 | 41 | mag_metrics.append([block_name] + data_precision_round(list(mag_metrics_pairedf + mag_metrics_b3))) 42 | 43 | # Note using block_name as the index row 44 | result_file = os.path.join(cached_dir, 'clustering-results-lagos-and-MAG-AID.tsv') 45 | columns = ['Block', 'pP', 'pR', 'pF', 'bP', 'bR', 'bF'] 46 | df = pd.DataFrame(mag_metrics, columns=columns) 47 | df.to_csv(result_file, sep='\t') 48 | mean_metrics = df._get_numeric_data().mean() 49 | print(mean_metrics) 50 | print(columns) 51 | print('minimal_uniq_author_in_block: ', minimal_uniq_author_in_block, 52 | data_precision_round(mean_metrics.values.tolist(), pctg=False)) 53 | -------------------------------------------------------------------------------- /src/eutilities/name/name_parser.py: 
-------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import personnamenorm as pnn 4 | from nameparser import HumanName 5 | 6 | 7 | def derek73_nameparser(name_str): 8 | name = HumanName(name_str) 9 | # name.as_dict() 10 | return [name.first, name.middle, name.last] 11 | 12 | 13 | threadLocal = threading.local() 14 | 15 | 16 | def thread_local_init(): 17 | initialized = getattr(threadLocal, 'initialized', None) 18 | if initialized is None: 19 | print('init thread local and loaded pickle data') 20 | threadLocal.personnamenorm = pnn.namenorm('cached/p_firstname.p') 21 | threadLocal.initialized = True 22 | else: 23 | # print('has inited thread local') 24 | pass 25 | 26 | 27 | class NameProcessor(): 28 | def __init__(self): 29 | pass 30 | 31 | def __call__(self, au): 32 | thread_local_init() 33 | if au is None or len(au) == 0: 34 | return [] 35 | else: 36 | splited_au = [] 37 | for pos, au_name in au: 38 | # print(id(threadLocal.personnamenorm)) 39 | personnamenorm = threadLocal.personnamenorm 40 | personnamenorm.unify(au_name) 41 | splited_au.append([pos, ' '.join(personnamenorm.name['firstname']).lower(), 'merged_to_fn', 42 | ' '.join(personnamenorm.name['lastname']).lower()]) 43 | # print(current_thread().name, splited_au) 44 | return splited_au 45 | 46 | 47 | personnamenorm = pnn.namenorm('cached/p_firstname.p') 48 | 49 | 50 | def klauslippert_personnamenorm(name_str): 51 | personnamenorm.unify(name_str) 52 | # print(name) 53 | return [' '.join(personnamenorm.name['firstname']).lower(), 'merged_to_fn', 54 | ' '.join(personnamenorm.name['lastname']).lower()] 55 | 56 | 57 | if __name__ == '__main__': 58 | names = ['Douglas H. Keefe', 'Carolina Abdala', 'Ram C. Naidu', 'David C. Mountain', 'Christopher A. Shera', 59 | 'John J. Guinan', 'Bernhard Ross', 'Kelly L. Tremblay', 'Terence W. Picton', 'Manfred Mauermann', 60 | 'Volker Hohmann', 'Richard L. Freyman', 'Karen S. Helfer', 'Uma Balakrishnan', 'Soha N. Garadat', 61 | 'Ruth Y. Litovsky', 'Michael A. Akeroyd', 'John Chambers', 'David Bullock', 'Alan R. Palmer', 62 | 'A. 
Quentin Summerfield'] 63 | for n in names: 64 | print(n.lower(), derek73_nameparser(n), derek73_nameparser(n.lower())) 65 | print(n.lower(), klauslippert_personnamenorm(n), klauslippert_personnamenorm(n.lower())) 66 | -------------------------------------------------------------------------------- /src/model/regression.py: -------------------------------------------------------------------------------- 1 | import time 2 | import warnings 3 | 4 | import numpy as np 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.linear_model import LogisticRegression, LinearRegression 7 | from sklearn.tree import DecisionTreeRegressor 8 | 9 | from model.available_model import ModelName 10 | 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | def use_regression(X_train, Y_train, X_test, model_switch: str): 15 | if model_switch == ModelName.linear: 16 | pred_y, feature_importance = linear_regressor(X_train, Y_train, X_test) 17 | elif model_switch == ModelName.logistic: 18 | pred_y, coefs, _ = logistic_regressor(X_train, Y_train, X_test) 19 | feature_importance = np.array(coefs[0]) 20 | elif model_switch == ModelName.dt: 21 | pred_y, feature_importance = dt_regressor(X_train, Y_train, X_test) 22 | elif model_switch == ModelName.randomforest: 23 | pred_y, feature_importance = randomforest_regressor(X_train, Y_train, X_test) 24 | else: 25 | pass 26 | return pred_y, feature_importance 27 | 28 | 29 | def linear_regressor(X_train, Y_train, X_test): 30 | model = LinearRegression() 31 | model.fit(X_train, Y_train) 32 | s = time.time() 33 | y_pred = model.predict(X_test) 34 | print('used time: ', time.time() - s) 35 | return y_pred, model.coef_ 36 | 37 | 38 | def logistic_regressor(X_train, Y_train, X_test): 39 | # model = LogisticRegression(max_iter=1000, solver='newton-cg', tol=1e-5) 40 | model = LogisticRegression(max_iter=1000, tol=1e-4, class_weight='balanced', C=2) 41 | model.fit(X_train, Y_train) 42 | s = time.time() 43 | y_pred = model.predict_proba(X_test) 44 | print('used time: ', time.time() - s) 45 | # metrics = calc_metrics(Y_train, [p1 for (p0, p1) in y_pred]) 46 | # pprint(metrics, pctg=True) 47 | y_pred = [p1 for (p0, p1) in y_pred] 48 | print(model.coef_, model.intercept_) 49 | return y_pred, model.coef_, model.intercept_ 50 | 51 | 52 | def dt_regressor(X_train, Y_train, X_test): 53 | model = DecisionTreeRegressor(max_depth=6) # max_depth=, 54 | model.fit(X_train, Y_train) 55 | # depth = model.get_depth() 56 | # for i in range(depth): 57 | # print(model.get_params(i+1)) 58 | s = time.time() 59 | y_pred = model.predict(X_test) 60 | print('used time: ', time.time() - s) 61 | return y_pred, model.feature_importances_ 62 | 63 | 64 | def randomforest_regressor(X_train, Y_train, X_test): 65 | model = RandomForestRegressor(n_estimators=100) 66 | model.fit(X_train, Y_train) 67 | # s = time.time() 68 | y_pred = model.predict(X_test) 69 | # print('used time: ', time.time() - s) 70 | return y_pred, model.feature_importances_ 71 | -------------------------------------------------------------------------------- /src/model/classification.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.linear_model import LinearRegression, LogisticRegression 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | from model.available_model import ModelName 9 | 10 | warnings.filterwarnings('ignore') 11 | 12 | 13 | def use_classifier(X_train, Y_train, 
X_test, model_switch: str): 14 | if model_switch == ModelName.linear: 15 | pred_y, feature_importance = linear_classifier(X_train, Y_train, X_test) 16 | elif model_switch == ModelName.logistic: 17 | pred_y, coefs, _ = logistic_classifier(X_train, Y_train, X_test) 18 | feature_importance = np.array(coefs[0]) 19 | elif model_switch == ModelName.dt: 20 | pred_y, feature_importance = dt_classifier(X_train, Y_train, X_test) 21 | elif model_switch == ModelName.randomforest: 22 | pred_y, feature_importance = randomforest_classifier(X_train, Y_train, X_test) 23 | else: 24 | pass 25 | return pred_y, feature_importance 26 | 27 | 28 | def linear_classifier(X_train, Y_train, X_test): 29 | model = LinearRegression() 30 | model.fit(X_train, Y_train) 31 | y_pred = model.predict(X_test) 32 | y_pred = [1 if y >= 0.5 else 0 for y in y_pred] 33 | return y_pred, model.coef_ 34 | 35 | 36 | def logistic_classifier(X_train, Y_train, X_test): 37 | model = LogisticRegression(max_iter=1000, tol=1e-4, class_weight='balanced', C=2) 38 | model.fit(X_train, Y_train) 39 | y_pred = model.predict_proba(X_test) 40 | y_pred = [p1 for (p0, p1) in y_pred] 41 | y_pred = [1 if y >= 0.5 else 0 for y in y_pred] 42 | return y_pred, model.coef_, model.intercept_ 43 | 44 | 45 | def dt_classifier(X_train, Y_train, X_test): 46 | model = DecisionTreeClassifier(ccp_alpha=0, 47 | criterion='gini', 48 | max_depth=5, 49 | max_features=None) 50 | 51 | model.fit(X_train, Y_train) 52 | y_pred = model.predict(X_test) 53 | return y_pred, model.feature_importances_ 54 | 55 | 56 | def randomforest_classifier(X_train, Y_train, X_test): 57 | model = RandomForestClassifier(n_estimators=100, 58 | criterion="gini", 59 | max_depth=None, 60 | min_samples_split=2, 61 | min_samples_leaf=1, 62 | max_features='auto', # "auto" class_weight='balanced' 63 | ) 64 | model.fit(X_train, Y_train) 65 | y_pred = model.predict_proba(X_test) 66 | y_pred = y_pred[:, 1] 67 | return y_pred, model 68 | -------------------------------------------------------------------------------- /dataset/Aminer-Zhang/check.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | d1 = json.load(open('na-data-kdd18/data/global/name_to_pubs_train_500.json')) 4 | print(len(d1), d1.keys()) 5 | 6 | d2 = json.load(open('na-data-kdd18/data/global/name_to_pubs_test_100.json')) 7 | print(len(d2), d2.keys()) 8 | 9 | d = {} 10 | d.update(d1) 11 | d.update(d2) 12 | print(len(d), d.keys()) 13 | assert len(d) == len(d1) + len(d2) 14 | 15 | num_blocks = len(d) 16 | # for k,v in d.items(): 17 | # print(k, v) 18 | num_author_group = sum([len(list(v.keys())) for k, v in d.items()]) 19 | citation_with_author_order = sum([sum([b for a, b in v.items()], []) for k, v in d.items()], []) 20 | citations = [n[:n.index('-')] for n in citation_with_author_order] 21 | num_citation = len(set(citations)) 22 | 23 | print('num_block: %d' % num_blocks) 24 | print('num_author_group: %d' % num_author_group) 25 | print('num_citation: %d' % num_citation) 26 | 27 | block_author_papers = sum([[(k, n) for n in sum([b for _, b in v.items()], [])] for k, v in d.items()], []) 28 | print(block_author_papers[:10]) 29 | 30 | # read bibliographic author name 31 | pubs = json.load(open('na-data-kdd18/data/global/pubs_raw.json')) 32 | pubs_author_name = [] 33 | for k, v in pubs.items(): 34 | if 'authors' in v: 35 | for i, a in enumerate(v['authors']): 36 | pubs_author_name.append((k + '-' + str(i), a['name'].lower())) 37 | print(len(pubs_author_name), pubs_author_name[:10]) 38 | 
pubs_author_name = dict(pubs_author_name) 39 | print(len(pubs_author_name)) 40 | 41 | matched_names = [] 42 | from collections import Counter 43 | 44 | lastnames = [] 45 | for a, b in block_author_papers: 46 | biblio_name = pubs_author_name.get(b) 47 | if biblio_name is None: 48 | continue 49 | lastname = a.split('_')[-1] 50 | lastnames.append(lastname) 51 | a = a.replace('_', ' ').replace(' ', '') 52 | biblio_name = biblio_name.replace('.', '').replace('-', ' ').replace(' ', '') 53 | # have verified there is no case of full name variation in this dataset 54 | if a != biblio_name: 55 | print(a, biblio_name) 56 | 57 | matched_names.append([a, biblio_name]) 58 | 59 | print(Counter(lastnames)) 60 | # convert json to csv 61 | res = [] 62 | pubs_author_name = [] 63 | for k, v in pubs.items(): 64 | authors, title, venue, year, abstract = v.get('authors'), v.get('title'), v.get('venue'), v.get('year'), v.get( 65 | 'abstract') 66 | if abstract is not None: 67 | abstract = abstract.replace('\t', ' ').replace('\n', '') 68 | res.append([authors, title, venue, year, abstract]) 69 | 70 | import pandas as pd 71 | 72 | pd.DataFrame(res, columns=['authors', 'title', 'venue', 'year', 'abstract']).to_csv('aminer-zhang-csv.csv', sep='\t', 73 | index=None) 74 | -------------------------------------------------------------------------------- /dataset/Aminer-WhoisWho/to_table_and_check.py: -------------------------------------------------------------------------------- 1 | import json 2 | from itertools import groupby 3 | 4 | import pandas as pd 5 | 6 | train_author = json.load(open('./train_author.json')) 7 | # print(train_author) 8 | author_name_aid_pid = [] 9 | for author_name in train_author.keys(): 10 | for aid, pids in train_author[author_name].items(): 11 | # print(author_name, aid, pids) 12 | # author_name_aid_pids.append([author_name, aid, '|'.join(pids)]) 13 | for pid in pids: 14 | assert len(pid) > 0 15 | author_name_aid_pid.append([author_name, aid, pid]) 16 | 17 | print(len(author_name_aid_pid)) 18 | train_pub = json.load(open('./train_pub.json')) 19 | pubs = {} 20 | for pid in train_pub.keys(): 21 | paper_info = train_pub[pid] 22 | id = paper_info['id'] if paper_info.get('id') is not None else '' 23 | authors = paper_info['authors'] if paper_info.get('authors') is not None else '' 24 | title = paper_info['title'] if paper_info.get('title') is not None else '' 25 | abstract = paper_info['abstract'].replace('\t', ' ').replace('\n', ' ') if paper_info.get( 26 | 'abstract') is not None else '' 27 | keywords = paper_info['keywords'] if paper_info.get('keywords') is not None else '' 28 | venue = paper_info['venue'] if paper_info.get('venue') is not None else '' 29 | year = paper_info['year'] if paper_info.get('year') is not None else '' 30 | assert pid == id 31 | # print([pid, id, authors, title, abstract, keywords, venue, year]) 32 | assert pubs.get(pid) == None 33 | pubs[pid] = [pid, authors, title, abstract, keywords, venue, year] 34 | 35 | author_name_aid_pid_pub = [item + pubs.get(item[-1])[1:] if pubs.get(item[-1]) is not None else [''] * 6 for item in 36 | author_name_aid_pid] 37 | 38 | 39 | def convert_to_author_list(n): 40 | res = [m['name'].lower().replace('-', '').replace('.', '').replace(' ', '') for m in n[3]] 41 | return res 42 | 43 | 44 | num_variation = 0 45 | num_all = 0 46 | for k, g in groupby(author_name_aid_pid_pub, lambda s: s[1]): 47 | for n in g: 48 | num_all += 1 49 | author_list = convert_to_author_list(n) 50 | split = n[0].split('_') 51 | au_name0 = split[-1] + split[0] 52 | 
au_name = n[0].replace('_', '') 53 | # last_name = n[0].split('_')[-1].strip() 54 | # TODO 55 | # if au_name not in author_list: 56 | if au_name not in author_list and au_name0 not in author_list: 57 | print(n[0], author_list) 58 | num_variation += 1 59 | 60 | print(num_variation, num_all, num_variation / num_all) 61 | # last name frequency 62 | from collections import Counter 63 | 64 | print(Counter([n[0].split('_')[-1] for n in author_name_aid_pid_pub])) 65 | 66 | pd.DataFrame(author_name_aid_pid_pub, 67 | columns=['author_name', 'aid', 'pid', 'authors', 'title', 'abstract', 'keywords', 'venue', 'year']).to_csv( 68 | 'train_author_pub.tsv', sep='\t', 69 | index=None) 70 | -------------------------------------------------------------------------------- /src/eutilities/name/name_parser_by_socket.py: -------------------------------------------------------------------------------- 1 | import json 2 | import socket 3 | import traceback 4 | 5 | from mytookit.data_reader import DBReader 6 | 7 | 8 | def recvall(sock_cli): 9 | BUFF_SIZE = 4096 # 4 KiB 10 | data = b'' 11 | while True: 12 | part = sock_cli.recv(BUFF_SIZE) 13 | data += part 14 | if len(part) < BUFF_SIZE: 15 | # either 0 or end of data 16 | break 17 | return data 18 | 19 | 20 | def insert_batch_data(batch_insert_data): 21 | paper_names = [[m[1] for m in n[1]] for n in batch_insert_data] 22 | json_str = json.dumps({'names': paper_names}, ensure_ascii=False) 23 | 24 | client = socket.socket() 25 | client.connect(('localhost', 38081)) 26 | # print('connect successfully') 27 | 28 | client.send(json_str.encode("utf-8")) 29 | res = recvall(client) 30 | process_names = json.loads(res.decode()) 31 | client.close() 32 | print(len(process_names), len(batch_insert_data)) 33 | assert len(process_names) == len(batch_insert_data) 34 | batch_insert_data = [n + [process_names[i]] for i, n in enumerate(batch_insert_data)] 35 | 36 | v = DBReader.tcp_client.execute( 37 | query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES", 38 | params=batch_insert_data) 39 | print('has inserted %d instances' % v) 40 | 41 | 42 | df_s2 = DBReader.tcp_model_cached_read("XXXX", 43 | """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_s2_paper_linkage""", 44 | cached=False) 45 | print(df_s2.shape) 46 | batch_insert_data = [] 47 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_s2.iterrows(): 48 | if i > 0 and i % 10000 == 0: 49 | # trigger inserting data 50 | insert_batch_data(batch_insert_data) 51 | batch_insert_data = [] 52 | batch_insert_data.append([pid, biblio_authors, doi, orcid, orcid_names, 'S2', 'joshfraser-NameParser']) 53 | 54 | if len(batch_insert_data) != 0: 55 | insert_batch_data(batch_insert_data) 56 | print('inserted completed!') 57 | 58 | # delete this obj for saving RAM 59 | if df_s2 is not None: 60 | del df_s2 61 | 62 | df_mag = DBReader.tcp_model_cached_read("XXXX", 63 | """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_mag_paper_linkage""", 64 | cached=False) 65 | print(df_mag.shape) 66 | batch_insert_data = [] 67 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_mag.iterrows(): 68 | if i > 0 and i % 10000 == 0: 69 | # trigger insert here 70 | try: 71 | insert_batch_data(batch_insert_data) 72 | except: 73 | traceback.print_exc() 74 | 75 | batch_insert_data = [] 76 | batch_insert_data.append([str(pid), biblio_authors, doi, orcid, orcid_names, 'MAG', 'joshfraser-NameParser']) 77 | 78 | if len(batch_insert_data) != 0: 79 | 
insert_batch_data(batch_insert_data) 80 | print('inserted completed!') 81 | -------------------------------------------------------------------------------- /src/eutilities/name/name_parser_by_localscript.py: -------------------------------------------------------------------------------- 1 | from eutilities.name.name_parser import derek73_nameparser, klauslippert_personnamenorm 2 | from mytookit.data_reader import DBReader 3 | 4 | name_parser_method = 'derek73' 5 | # name_parser_method = 'klauslippert' 6 | 7 | def split_multiple_biblio_authors(au): 8 | if au is None or len(au) == 0: 9 | return [] 10 | else: 11 | splited_au = [] 12 | for pos, au_name in au: 13 | if name_parser_method == 'derek73': 14 | name_parts = derek73_nameparser(au_name) 15 | else: 16 | name_parts = klauslippert_personnamenorm(au_name) 17 | splited_au.append([pos, name_parts[0], name_parts[1], name_parts[2]]) 18 | return splited_au 19 | 20 | 21 | df_s2 = DBReader.tcp_model_cached_read("XXXX", 22 | """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_s2_paper_linkage""", 23 | cached=False) 24 | print(df_s2.shape) 25 | batch_insert_data = [] 26 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_s2.iterrows(): 27 | if i > 0 and i % 100000 == 0: 28 | # trigger insert here 29 | v = DBReader.tcp_client.execute( 30 | query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES", 31 | params=batch_insert_data) 32 | print('has inserted %d instances' % v) 33 | batch_insert_data = [] 34 | batch_insert_data.append( 35 | [pid, biblio_authors, doi, orcid, orcid_names, 'S2', name_parser_method, 36 | split_multiple_biblio_authors(biblio_authors)]) 37 | 38 | if len(batch_insert_data) != 0: 39 | v = DBReader.tcp_client.execute(query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES", 40 | params=batch_insert_data, types_check=True) 41 | print('has inserted the last %d instances' % v) 42 | print('inserted completed!') 43 | 44 | # delete this obj for saving RAM 45 | if df_s2 is not None: 46 | del df_s2 47 | 48 | df_mag = DBReader.tcp_model_cached_read("XXXX", 49 | """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_mag_paper_linkage""", 50 | cached=False) 51 | print(df_mag.shape) 52 | batch_insert_data = [] 53 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_mag.iterrows(): 54 | if i > 0 and i % 100000 == 0: 55 | # trigger insert here 56 | v = DBReader.tcp_client.execute( 57 | query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES", 58 | params=batch_insert_data) 59 | print('has inserted %d instances' % v) 60 | batch_insert_data = [] 61 | batch_insert_data.append( 62 | [str(pid), biblio_authors, doi, orcid, orcid_names, 'MAG', name_parser_method, 63 | split_multiple_biblio_authors(biblio_authors)]) 64 | 65 | if len(batch_insert_data) != 0: 66 | v = DBReader.tcp_client.execute(query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES", 67 | params=batch_insert_data, types_check=True) 68 | print('has inserted the last %d instances' % v) 69 | print('inserted completed!') 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Dataset 2 | 3 | The dataset is available from [here](https://zenodo.org/record/7313380) and is provided in two forms.
The first is the full-name-block form, which arranges the dataset by ORCID iD and shared full name. The second is the pairwise form, in which ambiguous authors are arranged in pairs so that classifier-based models can use it to capture the similarity between two authors. 4 | 5 | #### Our dataset vs. existing datasets 6 | Creating a new dataset is painful. In AND research, all existing datasets were created by human annotators, and most are either limited in scale or biased. Our dataset overcomes these problems: no human intervention is needed to build it. Moreover, by using the two comprehensive resources, the publishing history of a specific author (querying DOIs by ORCID iD) and the authors of a specific paper (querying ORCID iDs by DOI) can be identified easily and credibly. Thus, with the large number of records in these credible resources, a large-scale dataset can be built. More importantly, the dataset reflects more realistic aspects than existing datasets. It passed a series of rigorous gold-standard validations, among which the two most important concern synonym patterns and domains: the dataset shows a degree of last-name variation and a coverage of research areas similar to those of the entire MAG. 7 | 8 | #### Dataset Structure 9 | The block-based dataset contains the following fields: 10 | 11 | | Field | Data Type | 12 | |------------------------------|---------------| 13 | | block_fullname | String | 14 | | author_group_orcid | String | 15 | | author_group_idx_in_block | Int | 16 | | citation_idx_in_author_group | Int | 17 | | doi | String | 18 | | pid | Int | 19 | | author_position | Int | 20 | | author_name | String | 21 | | author_affiliation | String | 22 | | coauthors | String Array | 23 | | coauthor_affliations | String Array | 24 | | venue | String | 25 | | pub_year | Int | 26 | | paper_title | String | 27 | | paper_abstract | String | 28 | 29 | "block_fullname" is the credible full name (CFN) taken from the ORCID system and is used to represent the block. Because more than one author can exist in a block, "author_group_orcid" gives the ORCID iD of a specific author in the block; it is used to represent the group of citations (CG) authored by that author, and "author_group_idx_in_block" denotes the order of CGs within a block. Similarly, "citation_idx_in_author_group" denotes the order of citations within a CG. "pid" is the paper ID in Microsoft Academic and Microsoft Academic Graph. "author_position" is identified by heuristics. 30 | 31 | ### Citation 32 | If you use the dataset, method, or model, please consider citing it.
33 | ```bibtex 34 | @article{zhang2021lagos, 35 | title={LAGOS-AND: A Large, Gold Standard Dataset for Scholarly Author Name Disambiguation}, 36 | author={Zhang, Li and Lu, Wei and Yang, Jinqing}, 37 | journal={arXiv preprint arXiv:2104.01821}, 38 | year={2021} 39 | } 40 | ``` 41 | -------------------------------------------------------------------------------- /src/eutilities/train_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import torch 5 | from sklearn import metrics 6 | from tqdm import tqdm 7 | 8 | 9 | def train(model, train_loader, criterion, optimizer, epoch, epochs, train_vector, logs_per_epoch=10, 10 | device=torch.device('cuda')): 11 | model.train() 12 | train_loss = 0 13 | num_batches = len(train_loader) 14 | start = time.time() 15 | for batch_idx, (MT, XL, XR, Y) in enumerate(train_loader): 16 | # HF, XL, XR, Y = HF.to(device), XL.to(device), XR.to(device), Y.to(device) 17 | XL, XR, Y = XL.to(device), XR.to(device), Y.to(device) 18 | optimizer.zero_grad() 19 | # output = model([HF, XL, XR]) 20 | output = model([XL, XR]) 21 | loss = criterion(output, Y) 22 | train_loss += loss.item() 23 | loss.backward() 24 | optimizer.step() 25 | 26 | if batch_idx % (num_batches // logs_per_epoch) == 0 and batch_idx > 0: 27 | now = time.time() 28 | batch_size = len(Y) 29 | inputs_per_sec = ((batch_idx + 1) * batch_size) / (now - start) 30 | eta_min = (epochs * num_batches - (epoch - 1) * num_batches - ( 31 | batch_idx + 1)) * batch_size / inputs_per_sec / 60 32 | print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tInputs/s: {:.1f}\tRemaining: {:.1f} min'.format( 33 | epoch, batch_idx * len(Y), len(train_loader.dataset), 34 | 100. * batch_idx / len(train_loader), loss.data.item(), inputs_per_sec, eta_min)) 35 | 36 | train_loss /= len(train_loader) 37 | train_vector.append(train_loss) 38 | 39 | 40 | def validate(model, test_loader, criterion, loss_vector, f1_vector=[], device=torch.device('cuda'), switch_input=False): 41 | model.eval() 42 | val_loss = 0 43 | metadata = [] 44 | prediction = torch.tensor([], device=device) 45 | true_labels = torch.tensor([], device=device) 46 | print('\nValidating...') 47 | with torch.no_grad(): 48 | for (MT, XL, XR, Y) in tqdm(test_loader): 49 | metadata.append(np.array([n.cpu().numpy() for n in MT])) 50 | # HF, XL, XR, Y = HF.to(device), XL.to(device), XR.to(device), Y.to(device) 51 | XL, XR, Y = XL.to(device), XR.to(device), Y.to(device) 52 | # output = model([HF, XL, XR]) 53 | if switch_input: 54 | output = model([XR, XL]) 55 | else: 56 | output = model([XL, XR]) 57 | 58 | val_loss += criterion(output, Y).data.item() 59 | 60 | if isinstance(criterion, torch.nn.BCEWithLogitsLoss): 61 | pred = output.sigmoid() 62 | 63 | prediction = torch.cat((prediction, pred)) 64 | true_labels = torch.cat((true_labels, Y)) 65 | 66 | if output.size(-1) == 2: 67 | true_label_numpy = [int(n[1]) for n in true_labels.cpu().numpy()] 68 | pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()] 69 | pred_prob = [n[1] for n in prediction.cpu().numpy()] 70 | else: 71 | true_label_numpy = [int(n) for n in true_labels.cpu().numpy()] 72 | pred_label_numpy = [1 if n > 0.5 else 0 for n in prediction.cpu().numpy()] 73 | pred_prob = [n[-1] for n in prediction.cpu().numpy()] 74 | print(pred_prob[:100]) 75 | accuracy = metrics.accuracy_score(true_label_numpy, pred_label_numpy) 76 | f1_score = metrics.f1_score(true_label_numpy, pred_label_numpy) 77 | macro_f1_score = 
metrics.f1_score(true_label_numpy, pred_label_numpy, average='macro') 78 | val_loss /= len(test_loader) 79 | loss_vector.append(val_loss) 80 | f1_vector.append(f1_score) 81 | print('Validation set: Average loss: {:.4f}\t Accuracy: {:.4f}\t F1-score: {:.4f}\t Macro-F1-score: {:.4f}\n'.format(val_loss, 82 | accuracy, 83 | f1_score, 84 | macro_f1_score)) 85 | metadata = np.hstack(metadata) 86 | return metadata, true_label_numpy, pred_label_numpy, pred_prob 87 | -------------------------------------------------------------------------------- /src/eutilities/MAGdata/parse_fos_from_mag_kg.py: -------------------------------------------------------------------------------- 1 | # download MAG KG from https://zenodo.org/record/3930398#.X9YvjnYzY5ll 2 | import traceback 3 | 4 | fos_names = set(map(lambda x: x.lower(), 5 | ['Medicine', 'Biology', 'Chemistry', 'Computer Science', 'Engineering', 'Physics', 6 | 'Materials Science', 7 | 'Psychology', 'Mathematics', 'History', 'Sociology', 'Art', 'Political Science', 'Geography', 8 | 'Economics', 9 | 'Business', 'Geology', 'Philosophy', 'Environmental Science'])) 10 | file_name = 'FieldsOfStudy.nt' 11 | fos_id_name_dict = {} 12 | for line in open(file_name): 13 | splt = line.strip().split('>') 14 | assert len(splt) == 4 15 | if 'name' in splt[1]: 16 | temp = splt[2] 17 | fos = temp[:temp.index('^')].replace('\"', '').strip().lower() 18 | if fos in fos_names: 19 | fos_id = splt[0][splt[0].index('entity/') + 7:].strip() 20 | fos_id_name_dict[fos_id] = fos 21 | 22 | # cat FieldsOfStudy.nt | grep 'level> "0"' 23 | # "0"^^ . 24 | # "0"^^ . 25 | # "0"^^ . 26 | # "0"^^ . 27 | # "0"^^ . 28 | # "0"^^ . 29 | # "0"^^ . 30 | # "0"^^ . 31 | # "0"^^ . 32 | # "0"^^ . 33 | # "0"^^ . 34 | # "0"^^ . 35 | # "0"^^ . 36 | # "0"^^ . 37 | # "0"^^ . 38 | # "0"^^ . 39 | # "0"^^ . 40 | # "0"^^ . 41 | # "0"^^ . 
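# Added illustrative note (the exact triple layout below is an assumption, not taken from the dump itself):
# a name triple in FieldsOfStudy.nt is expected to look roughly like
#   <http://ma-graph.org/entity/41008148> <http://xmlns.com/foaf/0.1/name> "Computer Science"^^<http://www.w3.org/2001/XMLSchema#string> .
# so line.strip().split('>') yields 4 parts; the entity id sits after 'entity/' in part 0 and the
# literal value sits before the '^^' type marker in part 2, which is what the parsing loop above relies on:
#   parts = example.strip().split('>')
#   parts[0][parts[0].index('entity/') + 7:].strip()                  # -> '41008148'
#   parts[2][:parts[2].index('^')].replace('"', '').strip().lower()   # -> 'computer science'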
42 | 43 | # assert len(fos_id_name_dict) == 19 44 | # {'95457728': 'history', '127313418': 'geology', '162324750': 'economics', '205649164': 'geography', '185592680': 'chemistry', 45 | # '138885662': 'philosophy', '144024400': 'sociology', '192562407': 'materials science', '33923547': 'mathematics', 46 | # '86803240': 'biology', '41008148': 'computer science', '17744445': 'political science', '127413603': 'engineering', 47 | # '15744967': 'psychology', '39432304': 'environmental science', '144133560': 'business', '121332964': 'physics', 48 | # '71924100': 'medicine', '142362112': 'art'} 49 | 50 | file_name1 = 'paper_fos_parsed_using_awk.txt' 51 | with open('mag_paper_top_level_fos.tsv.1', 'w') as fw: 52 | for line in open(file_name1): 53 | splt = line.strip().split(' ') 54 | assert len(splt) == 2 55 | pid, fos_id = splt 56 | if fos_id in fos_id_name_dict: 57 | fw.write('\t'.join([pid, fos_id_name_dict[fos_id]]) + '\n') 58 | traceback.print_exc() 59 | 60 | # file_name1 = 'mag_kg/PaperFieldsOfStudy.nt' 61 | # with open('mag_paper_top_level_fos.tsv', 'w') as fw: 62 | # for line in open(file_name1): 63 | # try: 64 | # splt = line.strip().split('>') 65 | # assert len(splt) == 4 66 | # pid = splt[0][splt[0].index('entity/') + 7:].strip() 67 | # fos_id = splt[2][splt[2].index('entity/') + 7:].strip() 68 | # if fos_id in fos_id_name_dict: 69 | # fw.write('\t'.join([pid, fos_id_name_dict[fos_id]]) + '\n') 70 | # except Exception as e: 71 | # traceback.print_exc() -------------------------------------------------------------------------------- /src/feature/cluster/sparse_tfidf_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | import numpy as np 5 | import torch 6 | from mytookit.data_reader import DBReader 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from tqdm import tqdm 9 | 10 | from myconfig import device, cached_dir 11 | 12 | sql_block = r''' 13 | select block_fullname as block_name, 14 | arrayMap(x->x[1], 15 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos, 16 | arrayMap(x->x[2], tmp) as ground_truths, 17 | arrayMap(x->x[3], tmp) as mag_preds 18 | from (select block_fullname, 19 | author_group_orcid, 20 | -- Note has verified all mag_author_id is successfully matched 21 | toString(aid) as mag_author_id, 22 | concat(toString(pid), '_', toString(author_position)) as pid_ao 23 | from and_ds.our_and_dataset_block any 24 | left join ( 25 | select pid, aid, author_position 26 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 27 | from mag.paper_author_affiliation) any 28 | inner join and_ds.our_and_dataset_block using pid, author_position 29 | ) using pid, author_position) 30 | group by block_name 31 | having xxHash32(block_name) %% 10=%d 32 | order by length(pid_aos) desc; 33 | ;''' 34 | 35 | sql_metadata = r''' 36 | select concat(toString(pid), '_', toString(author_position)) as pid_ao, 37 | block_fullname, 38 | author_group_orcid as orcid, 39 | -- -- Note has verified all mag_author_id is successfully matched 40 | -- lowerUTF8(author_name) as author_name, 41 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation, 42 | -- coauthors, 43 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue, 44 | -- pub_year, 45 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content 
46 | from and_ds.our_and_dataset_block any 47 | left join ( 48 | select pid, aid, author_position 49 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 50 | from mag.paper_author_affiliation) any 51 | inner join and_ds.our_and_dataset_block using pid, author_position 52 | ) using pid, author_position 53 | where xxHash32(block_fullname) %% 10=%d 54 | ''' 55 | 56 | 57 | def sparse_tensor_tfidf_similarity(documents): 58 | tfidf_csr = vectorizer.transform(documents) 59 | 60 | if len(documents) < 300: 61 | # Note computer by CPU 62 | m = tfidf_csr * tfidf_csr.T 63 | similarity = m.A 64 | else: 65 | # Note computer by GPU 66 | coo = tfidf_csr.tocoo() 67 | indices = np.vstack((coo.row, coo.col)) 68 | st = torch.sparse.FloatTensor(torch.LongTensor(indices), 69 | torch.FloatTensor(coo.data), 70 | torch.Size(coo.shape)).to(device) 71 | # Note this feature require a high version of pytorch 72 | multipled_st = torch.sparse.mm(st, torch.transpose(st, 0, 1)) 73 | similarity = multipled_st.to_dense().cpu().numpy() 74 | 75 | return similarity 76 | 77 | 78 | for seg in range(0, 10, 1): 79 | sql = sql_metadata % seg 80 | print(sql) 81 | # Note prepare the paper metadata dict 82 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False) 83 | print(df_metadata.shape) 84 | print(df_metadata.head()) 85 | 86 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values)) 87 | md_content_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].values)) 88 | 89 | del df_metadata 90 | 91 | # Note generate the pairwise similarity 92 | documents = md_content_dict.values() 93 | vectorizer = TfidfVectorizer() # tokenizer=normalize, stop_words='english' 94 | print('fit tfidf model') 95 | vectorizer = vectorizer.fit(documents) 96 | 97 | all_block_feature = {} 98 | sql = sql_block % seg 99 | print(sql) 100 | df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False) 101 | print(df_block.shape) 102 | 103 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]): 104 | block_name, pid_aos, ground_truths, mag_preds = row 105 | documents = [md_content_dict[pid_ao] for pid_ao in pid_aos] 106 | tfidf_similarity = sparse_tensor_tfidf_similarity(documents) 107 | tfidf_similarity = np.array(tfidf_similarity, dtype=np.float16) 108 | all_block_feature[block_name] = tfidf_similarity 109 | 110 | joblib.dump(all_block_feature, 111 | filename=os.path.join(cached_dir, 'cluster_feature/tfidf-feature-%d.pkl' % seg)) 112 | -------------------------------------------------------------------------------- /src/feature/doc2vec_trainer.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | import os 4 | 5 | from mytookit.data_reader import DBReader 6 | from sklearn.model_selection import train_test_split 7 | 8 | from myconfig import cached_dir 9 | 10 | logging.basicConfig( 11 | format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 12 | base_file_path = inspect.getframeinfo(inspect.currentframe()).filename 13 | project_dir_path = os.path.dirname(os.path.abspath(base_file_path)) 14 | data_path = project_dir_path 15 | 16 | import inspect 17 | import logging 18 | import os 19 | import random 20 | 21 | import numpy as np 22 | from gensim.models import doc2vec 23 | 24 | logging.basicConfig( 25 | format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 26 | base_file_path = 
inspect.getframeinfo(inspect.currentframe()).filename 27 | base_path = os.path.dirname(os.path.abspath(base_file_path)) 28 | project_dir_path = os.path.dirname(os.path.abspath(base_path)) 29 | classifiers_path = os.path.join(project_dir_path, 'classifiers') 30 | 31 | 32 | class doc2VecModel(): 33 | 34 | def __init__(self): 35 | super().__init__() 36 | 37 | def initialize_model(self, corpus): 38 | logging.info("Building Doc2Vec vocabulary") 39 | self.corpus = corpus 40 | self.model = doc2vec.Doc2Vec( 41 | epochs=8, 42 | min_count=1, 43 | # Ignores all words with 44 | # total frequency lower than this 45 | window=10, 46 | # The maximum distance between the current 47 | # and predicted word within a sentence 48 | vector_size=200, # Dimensionality of the 49 | # generated feature vectors 50 | workers=12, # Number of worker threads to 51 | # train the model 52 | alpha=0.025, # The initial learning rate 53 | min_alpha=0.00025, 54 | # Learning rate will linearly drop to 55 | # min_alpha as training progresses 56 | dm=1) 57 | # dm defines the training algorithm. 58 | # If dm=1 means 'distributed memory' (PV-DM) 59 | # and dm =0 means 'distributed bag of words' (PV-DBOW) 60 | self.model.build_vocab(self.corpus) 61 | 62 | def train_model(self): 63 | logging.info("Training Doc2Vec model") 64 | for epoch in range(2): 65 | logging.info('Training iteration #{0}'.format(epoch)) 66 | self.model.train( 67 | self.corpus, total_examples=self.model.corpus_count, 68 | epochs=self.model.epochs) 69 | # shuffle the corpus 70 | random.shuffle(self.corpus) 71 | # decrease the learning rate 72 | self.model.alpha -= 0.0002 73 | # fix the learning rate, no decay 74 | self.model.min_alpha = self.model.alpha 75 | 76 | def get_vectors(self, corpus_size, vectors_size, vectors_type): 77 | """ 78 | Get vectors from trained doc2vec model 79 | :param doc2vec_model: Trained Doc2Vec model 80 | :param corpus_size: Size of the data 81 | :param vectors_size: Size of the embedding vectors 82 | :param vectors_type: Training or Testing vectors 83 | :return: list of vectors 84 | """ 85 | vectors = np.zeros((corpus_size, vectors_size)) 86 | for i in range(0, corpus_size): 87 | prefix = vectors_type + '_' + str(i) 88 | vectors[i] = self.model.docvecs[prefix] 89 | return vectors 90 | 91 | def save_model(self, model_path): 92 | logging.info("Doc2Vec model saved at: " + model_path) 93 | self.model.save(model_path) 94 | 95 | def label_sentences(corpus, label_type): 96 | """ 97 | Gensim's Doc2Vec implementation requires each 98 | document/paragraph to have a label associated with it. 99 | We do this by using the LabeledSentence method. 100 | The format will be "TRAIN_i" or "TEST_i" where "i" is 101 | a dummy index of the review. 
102 | """ 103 | labeled = [] 104 | for i, v in enumerate(corpus): 105 | label = label_type + '_' + str(i) 106 | labeled.append(doc2vec.LabeledSentence(v.split(), [label])) 107 | return labeled 108 | 109 | 110 | def prepare_all_data(ds): 111 | x_train, x_test, y_train, y_test = train_test_split(ds.review, ds.sentiment, random_state=0, test_size=0.1) 112 | x_train = doc2VecModel.label_sentences(x_train, 'Train') 113 | x_test = doc2VecModel.label_sentences(x_test, 'Test') 114 | all_data = x_train + x_test 115 | return x_train, x_test, y_train, y_test, all_data 116 | 117 | 118 | if __name__ == "__main__": 119 | ds = DBReader.tcp_model_cached_read(os.path.join(cached_dir, 'doc2vec_train_corpus.pkl'), 120 | """select content from and_ds.doc2vec_train_corpus""", 121 | cached=False) 122 | print(ds.shape) 123 | print(ds.head()) 124 | ds_content = list(ds['content']) 125 | ds_content = [item.split() for item in ds_content] 126 | print('training samples size:', len(ds_content)) 127 | print('first 3 training samples:', ds_content[:3]) 128 | corpus = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(ds_content)] 129 | # print(corpus[0:4]) 130 | d2v = doc2VecModel() 131 | d2v.initialize_model(corpus) 132 | d2v.train_model() 133 | d2v.save_model(os.path.join(cached_dir, 'doc2vec_model')) 134 | -------------------------------------------------------------------------------- /src/feature/cluster/doc2vec_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | 4 | import joblib 5 | import torch 6 | from gensim.models import Doc2Vec 7 | from mytookit.data_reader import DBReader 8 | from nltk.corpus import stopwords 9 | from tqdm import tqdm 10 | 11 | from myconfig import cached_dir, device 12 | 13 | en_stopwords_set = set(stopwords.words('english')) 14 | 15 | sql_block = r''' 16 | select block_fullname as block_name, 17 | arrayMap(x->x[1], 18 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos, 19 | arrayMap(x->x[2], tmp) as ground_truths, 20 | arrayMap(x->x[3], tmp) as mag_preds 21 | from (select block_fullname, 22 | author_group_orcid, 23 | -- Note has verified all mag_author_id is successfully matched 24 | toString(aid) as mag_author_id, 25 | concat(toString(pid), '_', toString(author_position)) as pid_ao 26 | from and_ds.our_and_dataset_block any 27 | left join ( 28 | select pid, aid, author_position 29 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 30 | from mag.paper_author_affiliation) any 31 | inner join and_ds.our_and_dataset_block using pid, author_position 32 | ) using pid, author_position) 33 | group by block_name 34 | having xxHash32(block_name) %% 10=%d 35 | order by length(pid_aos) desc; 36 | ;''' 37 | 38 | sql_metadata = r''' 39 | select concat(toString(pid), '_', toString(author_position)) as pid_ao, 40 | block_fullname, 41 | author_group_orcid as orcid, 42 | -- -- Note has verified all mag_author_id is successfully matched 43 | -- lowerUTF8(author_name) as author_name, 44 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation, 45 | -- coauthors, 46 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue, 47 | -- pub_year, 48 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content 49 | from and_ds.our_and_dataset_block any 50 | left join ( 51 | select pid, aid, 
author_position 52 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 53 | from mag.paper_author_affiliation) any 54 | inner join and_ds.our_and_dataset_block using pid, author_position 55 | ) using pid, author_position 56 | where xxHash32(block_fullname) %% 10=%d 57 | ''' 58 | 59 | # load doc2vec model 60 | print('begin load models... ') 61 | doc2vec_model = Doc2Vec.load('../cached/doc2vec_model') 62 | print('end load models... ') 63 | 64 | 65 | def sim_matrix(a, b, eps=1e-8): 66 | """ 67 | added eps for numerical stability 68 | """ 69 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] 70 | a_norm = a / torch.clamp(a_n, min=eps) 71 | b_norm = b / torch.clamp(b_n, min=eps) 72 | b_norm = b_norm.transpose(0, 1) 73 | # print(a_norm.shape, b_norm.shape) 74 | sim_mt = torch.mm(a_norm, b_norm) 75 | return sim_mt 76 | 77 | 78 | for seg in range(0, 10, 1): 79 | sql = sql_metadata % seg 80 | print(sql) 81 | # Note prepare the paper metadata dict 82 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='XXX', sql=sql, cached=False) 83 | print(df_metadata.shape) 84 | print(df_metadata.head()) 85 | 86 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values)) 87 | 88 | # md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values)) 89 | # md_content_word_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply( 90 | # lambda x: set([w for w in x.split(' ') if not w in en_stopwords_set])).values)) 91 | 92 | # for index, (pid_ao, content) in tqdm(df_metadata[['pid_ao', 'content']].iterrows(), total=df_metadata.shape[0]): 93 | # doc2vec_model.infer_vector(content.split(' '), steps=12, alpha=0.025) 94 | # Note this step will be very slow 95 | # md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply( 96 | # lambda x: doc2vec_model.infer_vector(x.split(' '), steps=12, alpha=0.025)).values)) 97 | 98 | def infer_vector_worker(document): 99 | vector = doc2vec_model.infer_vector(document.split(' '), steps=12, alpha=0.025) 100 | return vector 101 | 102 | 103 | with Pool(processes=14) as pool: 104 | doc2vec_emds = pool.map(infer_vector_worker, df_metadata['content'].values) 105 | 106 | md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, doc2vec_emds)) 107 | del df_metadata 108 | 109 | all_block_feature = {} 110 | sql = sql_block % seg 111 | print(sql) 112 | df_block = DBReader.tcp_model_cached_read(cached_file_path='XXX', sql=sql, cached=False) 113 | print(df_block.shape) 114 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]): 115 | block_name, pid_aos, ground_truths, mag_preds = row 116 | 117 | # Note calculate the similarity between different metadata according to pid_ao 118 | num_instances = len(pid_aos) 119 | # if ij % 10 == 0: 120 | # print(num_instances) 121 | embeddings = torch.tensor([md_doc2vec_emd_dict[pid_ao] for pid_ao in pid_aos], device=device) 122 | pairwise_feature_matrix = sim_matrix(embeddings, embeddings) 123 | pairwise_feature_matrix = pairwise_feature_matrix.cpu().numpy() 124 | 125 | all_block_feature[block_name] = pairwise_feature_matrix 126 | 127 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir, 'cluster_feature/doc2vec-feature-%d.pkl' % seg)) 128 | -------------------------------------------------------------------------------- /src/feature/pairwise/our_dataset_to_feature.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | from multiprocessing import Pool 3 | 4 | import sys 5 | 6 | import joblib 7 | from scipy import stats 8 | 9 | sys.path.append('../../') 10 | 11 | import pandas as pd 12 | from gensim.models import Doc2Vec 13 | from mytookit.data_reader import DBReader 14 | from scipy.spatial.distance import cosine 15 | from sklearn.feature_extraction.text import TfidfVectorizer 16 | 17 | from eutilities.string_utils import jaccard_similarity, extract_word_list, ngram_sequence, \ 18 | convert_unicode_to_ascii 19 | from myconfig import cached_dir 20 | 21 | df = DBReader.tcp_model_cached_read("XXXX", 22 | sql="select * from and_ds.our_and_dataset_pairwise_gold_standard;", 23 | cached=False) 24 | print('df.shape', df.shape) 25 | 26 | # ['fullname' 'pid1' 'ao1' 'pid2' 'ao2' 'same_author' 'authors1' 27 | # 'paper_title1' 'venue1' 'pub_year1' 'authors2' 'paper_title2' 'venue2' 28 | # 'pub_year2', 'train1_test0_val2'] 29 | columns = df.columns.values 30 | print(len(columns), columns) 31 | h, w = df.shape 32 | 33 | 34 | def concat_title_abstract(row): 35 | return ' '.join([str(n) for n in row.values]).lower() 36 | 37 | 38 | documents = list( 39 | df[['paper_title1', 'abstract1']].apply(concat_title_abstract, axis=1).values) + list( 40 | df[['paper_title2', 'abstract2']].apply(concat_title_abstract, axis=1).values) 41 | vectorizer = TfidfVectorizer() # tokenizer=normalize, stop_words='english' 42 | print('fit tfidf model') 43 | vectorizer = vectorizer.fit(documents) 44 | 45 | 46 | def cosine_sim(text1, text2): 47 | tfidf = vectorizer.transform([text1, text2]) 48 | return ((tfidf * tfidf.T).A)[0, 1] 49 | 50 | 51 | # load doc2vec model 52 | model = Doc2Vec.load(os.path.join(cached_dir, 'doc2vec_model')) 53 | print('load doc2vec model') 54 | 55 | 56 | def extract_pairwise_feature(pairwise_citation): 57 | task_id, (fullname, pid1, ao1, pid2, ao2, 58 | coauthor1, aid1, author_names1, aff_arr1, aff_id_arr1, paper_title1, abstract1, venue1, pub_year1, 59 | coauthor2, aid2, author_names2, aff_arr2, aff_id_arr2, paper_title2, abstract2, venue2, pub_year2, 60 | same_author, train1_test0_val2) = pairwise_citation 61 | 62 | try: 63 | if task_id % 10000 == 0: 64 | print(task_id * 100.0 / h) 65 | 66 | author_names1, author_names2 = author_names1.lower(), author_names2.lower() 67 | 68 | # if author_names1 != convert_unicode_to_ascii(author_names1): 69 | # print(author_names1, convert_unicode_to_ascii(author_names1)) 70 | 71 | # name similarity 72 | name_similarity = jaccard_similarity(ngram_sequence(convert_unicode_to_ascii(author_names1)), 73 | ngram_sequence(convert_unicode_to_ascii(author_names2))) 74 | 75 | same_biblio_aid = 1 if aid1 == aid2 else 0 76 | pub_year_diff = abs(pub_year1 - pub_year2) if pub_year1 > 0 and pub_year2 > 0 else -1 77 | 78 | try: 79 | content1, content2 = (paper_title1 + ' ' + str(abstract1)).lower(), (paper_title2 + ' ' + str(abstract2)).lower() 80 | except Exception as e: 81 | print(e) 82 | content1, content2 = paper_title1, paper_title2 83 | 84 | word_list1 = extract_word_list(content1) 85 | word_list2 = extract_word_list(content2) 86 | paper_title_abstract_similarity = jaccard_similarity( 87 | word_list1, 88 | word_list2, 89 | remove_stop_word=True) 90 | 91 | # do2vec similarity 92 | content_cosin_sim = 0 93 | try: 94 | v1 = model.infer_vector(word_list1, steps=12, alpha=0.025) 95 | v2 = model.infer_vector(word_list2, steps=12, alpha=0.025) 96 | # Compute the Cosine distance between 1-D arrays. 
97 | # distance cosine([1, 2],[3,4]) = 1 - (1*3+2*4)/(sqrt(1*1+2*2) * sqrt(3*3+4*4)) 98 | content_cosin_sim = 1 - cosine(v1, v2) 99 | except Exception as e: 100 | print(e) 101 | 102 | # tfidf similarity 103 | tfidf_cosin_sim = 0 104 | try: 105 | tfidf_cosin_sim = cosine_sim(content1, content2) 106 | except Exception as e: 107 | print(e) 108 | 109 | venue_similarity = jaccard_similarity(extract_word_list(str(venue1).lower()), 110 | extract_word_list(str(venue2).lower())) 111 | 112 | aff_similarity = jaccard_similarity(extract_word_list(' '.join(str(aff_arr1).split('|')).lower()), 113 | extract_word_list(' '.join(str(aff_arr2).split('|')).lower())) 114 | 115 | feature_item = [fullname, pid1, ao1, pid2, ao2, same_author, train1_test0_val2, 116 | name_similarity, same_biblio_aid, pub_year_diff, 117 | paper_title_abstract_similarity, 118 | content_cosin_sim, tfidf_cosin_sim, 119 | venue_similarity, 120 | aff_similarity, 121 | content1, 122 | content2] 123 | 124 | return feature_item 125 | except Exception as e: 126 | print(e) 127 | return [fullname, pid1, ao1, pid2, ao2, same_author, train1_test0_val2, 128 | 0, 0, 0, 0, 0, 0, 0, 0, "", ""] 129 | 130 | 131 | task_pools = [(i, row) for i, row in df.iterrows()] 132 | 133 | with Pool(processes=14) as pool: 134 | features = pool.map(extract_pairwise_feature, task_pools) 135 | 136 | joblib.dump(features, 'tmp.pkl') 137 | 138 | pd.DataFrame(features, 139 | columns=['fullname', 'pid1', 'ao1', 'pid2', 'ao2', 140 | 'same_author', 'train1_test0_val2', 141 | 'name_similarity', 'same_biblio_aid', 'pub_year_diff', 142 | 'paper_title_abstract_similarity', 'content_cosin_similarity', 'tfidf_cosin_similarity', 143 | 'venue_similarity', 'aff_similarity', 'content1', 'content2']).to_csv( 144 | os.path.join(cached_dir, 'pairwise_and_dataset_feature_full.tsv'), sep='\t', index=False) 145 | -------------------------------------------------------------------------------- /dataset/PubMed-Kim/Kim_Authority_ORCID_linkage_dataset.sql: -------------------------------------------------------------------------------- 1 | create table if not exists and_ds.AUT_NIH 2 | ( 3 | Year String, 4 | PMID String, 5 | BylinePosition String, 6 | MEDLINE_Name String, 7 | NIH_ID String, 8 | NIH_Name String, 9 | Authority2009_ID String, 10 | Ethnea String, 11 | Genni String, 12 | AINI String, 13 | FINI String 14 | ) ENGINE = Log; 15 | 16 | create table if not exists and_ds.AUT_ORC 17 | ( 18 | Year String, 19 | PMID String, 20 | BylinePosition String, 21 | MEDLINE_Name String, 22 | ORCID String, 23 | ORCID_Name String, 24 | Authority2009_ID String, 25 | Ethnea String, 26 | Genni String, 27 | AINI String, 28 | FINI String 29 | ) ENGINE = Log; 30 | 31 | create table if not exists and_ds.AUT_SCT_info 32 | ( 33 | Year String, 34 | PMID String, 35 | BylinePosition String, 36 | MEDLINE_Name String, 37 | Authority2009_ID String, 38 | Ethnea String, 39 | Genni String, 40 | AINI String, 41 | FINI String 42 | ) ENGINE = Log; 43 | 44 | create table if not exists and_ds.AUT_SCT_pairs 45 | ( 46 | PMID_1 String, 47 | Byline_Position_1 String, 48 | PMID_2 String, 49 | Byline_Position_2 String 50 | ) ENGINE = Log; 51 | 52 | -- 312952 AUT_NIH.txt 53 | -- 3076502 AUT_ORC.txt 54 | -- 4732531 AUT_SCT_info.txt 55 | -- 6214200 AUT_SCT_pairs.txt 56 | 57 | -- 312951 58 | -- 6214199 59 | -- 3076501 60 | -- 4732530 61 | select count() 62 | from and_ds.AUT_NIH 63 | union all 64 | select count() 65 | from and_ds.AUT_ORC 66 | union all 67 | select count() 68 | from and_ds.AUT_SCT_info 69 | union all 70 | select count() 
71 | from and_ds.AUT_SCT_pairs; 72 | 73 | -- cat AUT_NIH.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_NIH FORMAT TSVWithNames' 74 | -- cat AUT_ORC.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_ORC FORMAT TSVWithNames' 75 | -- cat AUT_SCT_info.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_SCT_info FORMAT TSVWithNames' 76 | -- cat AUT_SCT_pairs.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_SCT_pairs FORMAT TSVWithNames' 77 | 78 | drop table if exists and_ds.AUT_NIH; 79 | drop table if exists and_ds.AUT_ORC; 80 | drop table if exists and_ds.AUT_SCT_info; 81 | drop table if exists and_ds.AUT_SCT_pairs; 82 | 83 | select * 84 | from ( 85 | select * 86 | from (select concat(PMID_1, '_', Byline_Position_1) as pm_ao1, concat(PMID_2, '_', Byline_Position_2) as pm_ao2 87 | from and_ds.AUT_SCT_pairs) any 88 | inner join (select concat(PMID, '_', BylinePosition) as pm_ao1, 89 | MEDLINE_Name as MEDLINE_Name1, 90 | AINI as AINI1, 91 | FINI as FINI1 92 | from and_ds.AUT_SCT_info) using pm_ao1 93 | ) any 94 | inner join (select concat(PMID, '_', BylinePosition) as pm_ao2, 95 | MEDLINE_Name as MEDLINE_Name2, 96 | AINI as AINI2, 97 | FINI as FINI2 98 | from and_ds.AUT_SCT_info) using pm_ao2; 99 | 100 | -- 6214199 paired_authors 101 | -- 1680310 num_citations 102 | select count() as cnt, 'paired_authors' as name 103 | from and_ds.AUT_SCT_pairs 104 | union all 105 | select arrayUniq(arrayConcat(groupArray(PMID_1), groupArray(PMID_2))) as cnt, 'num_citations' as name 106 | from and_ds.AUT_SCT_pairs; 107 | 108 | -- 3076501 num_citations 109 | -- 268631 number_full_initial_based_blocks 110 | -- 245754 num_author_groups 111 | -- 197379 number_first_initial_based_blocks 112 | select count() as cnt, 'num_citations' as name 113 | from and_ds.AUT_ORC 114 | union all 115 | select count(distinct lowerUTF8(AINI)) as cnt, 'number_full_initial_based_blocks' as name 116 | from and_ds.AUT_ORC 117 | union all 118 | select count(distinct lowerUTF8(FINI)) as cnt, 'number_first_initial_based_blocks' as name 119 | from and_ds.AUT_ORC 120 | union all 121 | select count(distinct ORCID) as cnt, 'num_author_groups' as name 122 | from and_ds.AUT_ORC 123 | ; 124 | 125 | -- 312951 num_citations 126 | -- 34206 num_author_groups 127 | -- 37185 number_full_initial_based_blocks 128 | -- 29243 number_first_initial_based_blocks 129 | select count() as cnt, 'num_citations' as name 130 | from and_ds.AUT_NIH 131 | union all 132 | select count(distinct lowerUTF8(AINI)) as cnt, 'number_full_initial_based_blocks' as name 133 | from and_ds.AUT_NIH 134 | union all 135 | select count(distinct lowerUTF8(FINI)) as cnt, 'number_first_initial_based_blocks' as name 136 | from and_ds.AUT_NIH 137 | union all 138 | select count(distinct NIH_ID) as cnt, 'num_author_groups' as name 139 | from and_ds.AUT_NIH 140 | ; 141 | 142 | -- name variations 143 | -- 226588 144 | select count() 145 | from ( 146 | select lowerUTF8(trimBoth(splitByString(',', MEDLINE_Name)[1])) as medline_lastname, 147 | lowerUTF8(trimBoth(splitByString('|', ORCID_Name)[1])) as 
orcid_lastname, 148 | MEDLINE_Name, 149 | ORCID_Name 150 | from and_ds.AUT_ORC 151 | where medline_lastname != orcid_lastname) 152 | union all 153 | select count() 154 | from and_ds.AUT_ORC 155 | ; 156 | 157 | 158 | select count() 159 | from ( 160 | select lowerUTF8(trimBoth(splitByString(',', MEDLINE_Name)[1])) as medline_lastname, 161 | lowerUTF8(splitByChar('_', AINI)[1]) as block_lastname 162 | from and_ds.AUT_ORC 163 | where medline_lastname != block_lastname) 164 | ; 165 | select count() 166 | from and_ds.AUT_ORC; 167 | 168 | select arrayUniq(arrayConcat(groupArray(PMID1), groupArray(PMID2))) as cnt, 'num_citations' as name 169 | from and.GS; -------------------------------------------------------------------------------- /src/eutilities/string_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import string 5 | import unicodedata 6 | 7 | import geograpy 8 | import jaro 9 | from nltk import ngrams 10 | 11 | os.environ['JAVAHOME'] = "/usr/local/jdk-11.0.1" 12 | from Levenshtein.StringMatcher import StringMatcher 13 | from nltk.tag import StanfordNERTagger 14 | from nltk.corpus import stopwords 15 | 16 | def extract_email(affi): 17 | match = re.search(r'[\w\.-]+@[\w\.-]+', affi) 18 | if match is not None: 19 | result = match.group(0) 20 | 21 | if result[-1] == '.': 22 | result = result[:len(result) - 1] 23 | return result 24 | return None 25 | 26 | 27 | def extract_inner_words(string): 28 | replaced = re.sub('[^a-z]', " ", string) 29 | splts = replaced.split(' ') 30 | return [s for s in splts if len(s) > 2] 31 | 32 | 33 | def extract_word_list(string): 34 | return re.findall(r'\w+', string) 35 | 36 | 37 | def extract_key_wods_list(key_words_str): 38 | key_words = [] 39 | key_words_dict = json.loads(key_words_str) 40 | if key_words_dict == None: 41 | return [] 42 | for item in key_words_dict: 43 | if 'keyword' in item: 44 | keyword_ = item['keyword'] 45 | keyword_ = extract_inner_words(keyword_) 46 | key_words += keyword_ 47 | return key_words 48 | 49 | 50 | # 有28895个不重复的 mesh heading 51 | def extract_mesh_headings(raw_str: str): 52 | s = json.loads(raw_str) 53 | if s == None: 54 | return [] 55 | desc_name_list = [] 56 | for item in s: 57 | if 'descriptorName' in item: 58 | # TODO 'qualifierNameList' 59 | descriptorname_ = item['descriptorName'] 60 | descriptorname_ = extract_inner_words(descriptorname_) 61 | desc_name_list += descriptorname_ 62 | return desc_name_list 63 | 64 | 65 | def edit_distinct_diff_chars(str1, str2): 66 | str_matcher = StringMatcher() 67 | if len(str1) < len(str2): 68 | str1, str2 = str2, str1 69 | str_matcher.set_seqs(str1, str2) 70 | editops = str_matcher.get_editops() 71 | # print(editops) 72 | diff_chars = [] 73 | for model, pos1, pos2 in editops: 74 | if model == 'delete': 75 | # print('delete: ', str1[pos1]) 76 | diff_chars.append(str1[pos1]) 77 | elif model == 'replace': 78 | # print('replace: ', str1[pos1]) 79 | diff_chars.append(str1[pos1]) 80 | elif model == 'insert': 81 | # print('insert: ', str2[pos2]) 82 | diff_chars.append(str2[pos2]) 83 | return diff_chars 84 | 85 | 86 | def jaro_winkler_similarity(s1, s2): 87 | if s1 is None or s2 is None: 88 | return 0.0 89 | return jaro.jaro_winkler_metric(s1, s2) 90 | 91 | 92 | # Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 93 | all_letters = string.ascii_letters + " -" 94 | all_letters = set([c for c in all_letters]) 95 | n_letters = len(all_letters) 96 | 97 | 98 | def 
convert_unicode_to_ascii(s): 99 | s = s.lower() 100 | return ''.join( 101 | c for c in unicodedata.normalize('NFD', s) 102 | if unicodedata.category(c) != 'Mn' 103 | ) 104 | 105 | 106 | def ngram_sequence(s, n=2): 107 | grams = ngrams(list(s), n) 108 | grams = [''.join(gram) for gram in grams] 109 | return grams 110 | 111 | 112 | en_stopwords_set = set(stopwords.words('english')) 113 | 114 | 115 | def intersection(a, b, remove_stop_word=False): 116 | if a is None or b is None: 117 | return 0 118 | if remove_stop_word: 119 | a = [n for n in a if n not in en_stopwords_set] 120 | b = [n for n in b if n not in en_stopwords_set] 121 | intersections = len(set(a).intersection(set(b))) 122 | return intersections 123 | 124 | 125 | def jaccard_similarity(a, b, remove_stop_word=False): 126 | if a is None or b is None: 127 | return 0.0 128 | if remove_stop_word: 129 | a = [n for n in a if n not in en_stopwords_set] 130 | b = [n for n in b if n not in en_stopwords_set] 131 | unions = len(set(a).union(set(b))) 132 | if unions == 0: 133 | return 0.0 134 | intersections = len(set(a).intersection(set(b))) 135 | return 1. * intersections / unions 136 | 137 | 138 | # 3 class: Location, Person, Organization 139 | # 4 class: Location, Person, Organization, Misc 140 | # 7 class: Location, Person, Organization, Money, Percent, Date, Time 141 | # english.all.3class.caseless.distsim.crf.ser.gz 142 | # english.conll.4class.caseless.distsim.crf.ser.gz 143 | # english.muc.7class.caseless.distsim.crf.ser.gz 144 | stanford_ner_base_path = '/home/zhangli/mydisk-2t/apps/stanford-ner-4.0.0/' 145 | st = StanfordNERTagger( 146 | model_filename=('%sclassifiers/english.all.3class.distsim.crf.ser.gz' % stanford_ner_base_path), 147 | path_to_jar=('%sstanford-ner.jar' % stanford_ner_base_path)) 148 | 149 | 150 | def ner(s): 151 | if s is None or len(s) == 0: 152 | return [], [] 153 | res = st.tag(s.split()) 154 | print(res) 155 | l = len(res) 156 | broken_point = [i + 1 for i in range(l - 1) if res[i][1] != res[i + 1][1]] 157 | start = [0] + broken_point 158 | end = broken_point + [l] 159 | locs, orgs = [], [] 160 | for s, e in zip(start, end): 161 | if e <= s: 162 | continue 163 | entities_with_class = res[s:e] 164 | cls = entities_with_class[0][1] 165 | entity = ' '.join([n[0] for n in entities_with_class]) 166 | if cls == 'ORGANIZATION': 167 | orgs.append(entity) 168 | elif cls == 'LOCATION': 169 | locs.append(entity) 170 | return locs, orgs 171 | 172 | 173 | cached_extracted_geo = dict() 174 | 175 | 176 | def extract_geo(s): 177 | if s is None or len(s) == 0: 178 | return [[], [], [], []] 179 | if s not in cached_extracted_geo: 180 | # places = geograpy.Extractor(text=s).find_geoEntities() 181 | places = geograpy.get_geoPlace_context(text=s) 182 | cached_extracted_geo[s] = [ 183 | [n.lower() for n in places.countries], 184 | [n.lower() for n in places.regions], 185 | [n.lower() for n in places.cities], 186 | [n.lower() for n in places.other] 187 | ] 188 | # print(places) 189 | return cached_extracted_geo[s] 190 | 191 | 192 | if __name__ == '__main__': 193 | import time 194 | 195 | t1 = time.time() 196 | # s = "University of Minnesota, Minneapolis, Minnesota 55455, USA." 197 | s = "University of California, San Diego, La Jolla, California 92093, USA." 
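    # Besides the NER/geo helpers exercised below, the name-similarity helpers above can be
    # sanity-checked the same way. A sketch (values are what the definitions above should produce,
    # shown for illustration only, not asserted by the original script):
    #   ngram_sequence('smith')                           # -> ['sm', 'mi', 'it', 'th']
    #   convert_unicode_to_ascii('Müller')                # -> 'muller'
    #   jaccard_similarity(ngram_sequence(convert_unicode_to_ascii('Müller')),
    #                      ngram_sequence('muller'))      # -> 1.0
    # This bigram Jaccard on transliterated names is the name_similarity feature computed in
    # src/feature/pairwise/our_dataset_to_feature.py and src/feature/cluster/fast_feature.py.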
198 | for _ in range(10): 199 | chars = ner(s) 200 | t2 = time.time() 201 | print(t2 - t1) 202 | print(chars) 203 | print() 204 | for _ in range(10): 205 | chars = extract_geo(s) 206 | print(chars) 207 | t3 = time.time() 208 | print(t3 - t2) 209 | -------------------------------------------------------------------------------- /src/feature/cluster/fast_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | import numpy as np 5 | from mytookit.data_reader import DBReader 6 | from nltk.corpus import stopwords 7 | from tqdm import tqdm 8 | 9 | from eutilities.string_utils import jaccard_similarity, ngram_sequence, convert_unicode_to_ascii 10 | from myconfig import cached_dir 11 | 12 | en_stopwords_set = set(stopwords.words('english')) 13 | 14 | sql_block = r''' 15 | select block_fullname as block_name, 16 | arrayMap(x->x[1], 17 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos, 18 | arrayMap(x->x[2], tmp) as ground_truths, 19 | arrayMap(x->x[3], tmp) as mag_preds 20 | from (select block_fullname, 21 | author_group_orcid, 22 | -- Note has verified all mag_author_id is successfully matched 23 | toString(aid) as mag_author_id, 24 | concat(toString(pid), '_', toString(author_position)) as pid_ao 25 | from and_ds.our_and_dataset_block any 26 | left join ( 27 | select pid, aid, author_position 28 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 29 | from mag.paper_author_affiliation) any 30 | inner join and_ds.our_and_dataset_block using pid, author_position 31 | ) using pid, author_position) 32 | group by block_name 33 | having xxHash32(block_name) %% 10=%d 34 | order by length(pid_aos) desc; 35 | ;''' 36 | 37 | sql_metadata = r''' 38 | select concat(toString(pid), '_', toString(author_position)) as pid_ao, 39 | block_fullname, 40 | author_group_orcid as orcid, 41 | -- -- Note has verified all mag_author_id is successfully matched 42 | lowerUTF8(author_name) as author_name, 43 | arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation, 44 | -- coauthors, 45 | arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue, 46 | pub_year, 47 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content 48 | from and_ds.our_and_dataset_block any 49 | left join ( 50 | select pid, aid, author_position 51 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 52 | from mag.paper_author_affiliation) any 53 | inner join and_ds.our_and_dataset_block using pid, author_position 54 | ) using pid, author_position 55 | where xxHash32(block_fullname) %% 10=%d 56 | ''' 57 | 58 | num_features = 5 59 | 60 | for seg in range(0, 10, 1): 61 | sql = sql_metadata % seg 62 | print(sql) 63 | # Note prepare the paper metadata dict 64 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False) 65 | print(df_metadata.shape) 66 | print(df_metadata.head()) 67 | 68 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values)) 69 | md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values)) 70 | md_author_name_dict = dict(zip(df_metadata['pid_ao'].values, 71 | df_metadata['author_name'].apply( 72 | lambda x: ngram_sequence(convert_unicode_to_ascii(x))).values)) 73 | md_author_affiliation_dict = dict( 74 | 
zip(df_metadata['pid_ao'].values, df_metadata['author_affiliation'].apply(lambda x: x.split(' ')).values)) 75 | # md_coauthors_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['coauthors'].values)) 76 | md_venue_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['venue'].apply(lambda x: x.split(' ')).values)) 77 | md_pub_year_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['pub_year'].values)) 78 | # md_content_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].values)) 79 | md_content_word_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply( 80 | lambda x: set([w for w in x.split(' ') if not w in en_stopwords_set])).values)) 81 | # md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply( 82 | # lambda x: doc2vec_model.infer_vector(x.split(' '), steps=12, alpha=0.025)).values)) 83 | 84 | del df_metadata 85 | 86 | all_block_feature = {} 87 | sql = sql_block % seg 88 | print(sql) 89 | df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False) 90 | print(df_block.shape) 91 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]): 92 | block_name, pid_aos, ground_truths, mag_preds = row 93 | 94 | # Note calculate the similarity between different metadata according to pid_ao 95 | num_instances = len(pid_aos) 96 | # if ij % 10 == 0: 97 | # print(num_instances) 98 | 99 | pairwise_feature_matrix = np.zeros(shape=(num_instances, num_instances, num_features), dtype=np.float16) 100 | for i, pid_ao_i in enumerate(pid_aos): 101 | for j, pid_ao_j in enumerate(pid_aos): 102 | author_names1, author_names2 = md_author_name_dict[pid_ao_i], md_author_name_dict[pid_ao_j] 103 | aff_arr1, aff_arr2 = md_author_affiliation_dict[pid_ao_i], md_author_affiliation_dict[pid_ao_j] 104 | 105 | orcid1, orcid2 = md_orcid_dict[pid_ao_i], md_orcid_dict[pid_ao_j] 106 | content_word1, content_word2 = md_content_word_dict[pid_ao_i], md_content_word_dict[pid_ao_j] 107 | 108 | venue1, venue2 = md_venue_dict[pid_ao_i], md_venue_dict[pid_ao_j] 109 | pub_year1, pub_year2 = md_pub_year_dict[pid_ao_i], md_pub_year_dict[pid_ao_j] 110 | 111 | # if author_names1 != convert_unicode_to_ascii(author_names1): 112 | # print(author_names1, convert_unicode_to_ascii(author_names1)) 113 | 114 | name_similarity = jaccard_similarity(author_names1, author_names2) 115 | 116 | pub_year_diff = 1.0 * (abs(pub_year1 - pub_year2) if pub_year1 > 0 and pub_year2 > 0 else -1) 117 | 118 | paper_title_abstract_similarity = jaccard_similarity(content_word1, content_word2, remove_stop_word=False) 119 | 120 | venue_similarity = jaccard_similarity(venue1, venue2) 121 | 122 | aff_similarity = jaccard_similarity(aff_arr1, aff_arr2) 123 | 124 | pairwise_feature_matrix[i][j] = [name_similarity, 125 | pub_year_diff, 126 | paper_title_abstract_similarity, 127 | venue_similarity, 128 | aff_similarity] 129 | all_block_feature[block_name] = pairwise_feature_matrix 130 | 131 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir, 'cluster_feature/five-fast-features-%d.pkl' % seg)) 132 | -------------------------------------------------------------------------------- /src/statistics/last_name_variation_considering_transliterating.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from unidecode import unidecode 3 | 4 | from mytookit.data_reader import DBReader 5 | 6 | which_dataset = ['pairwise', 'block'][1] 7 | applying_transliterating = False 8 | 9 | sources, 
methods = ['S2', 'MAG'], ['derek73', 'joshfraser-NameParser'] 10 | # sources, methods = ['S2'], ['derek73'] 11 | # sources, methods = ['S2'], ['joshfraser-NameParser'] 12 | # sources, methods = ['MAG'], ['derek73'] 13 | # sources, methods = ['MAG'], ['joshfraser-NameParser'] 14 | 15 | field_map = {'derek73': 'derek73_top_lastname', 'joshfraser-NameParser': 'joshfraser_top_lastname'} 16 | 17 | 18 | def clean_name(s): 19 | return unidecode(s).lower() 20 | 21 | 22 | sql_template = ''' 23 | select matched_biblio_author_name, biblio_author_split_lastname, orcid_lastname, top_lastname 24 | from ( 25 | select pid, 26 | orcid, 27 | matched_biblio_author_name, 28 | lowerUTF8(matched_biblio_author_split_names[3]) as biblio_author_split_lastname 29 | from and_ds.orcid_mag_s2_author_name_split_by_various_algorithms_with_author_position 30 | where source = 'SOURCE' and method = 'METHOD' 31 | ) any 32 | join (select orcid, 33 | orcid_lastname, 34 | FIELD as top_lastname 35 | from and_ds.orcid_mag_s2_actual_author_name 36 | where source = 'SOURCE' 37 | ) using orcid; 38 | ''' 39 | 40 | 41 | for source in sources: 42 | for method in methods: 43 | sql = sql_template.replace('SOURCE', source).replace('METHOD', method).replace('FIELD', field_map[method]) 44 | print(sql) 45 | df = DBReader.tcp_model_cached_read("cached/XXXXX", sql, cached=False) 46 | print(df.shape) 47 | num_instances = df.shape[0] 48 | df['matched_biblio_author_name'] = df['matched_biblio_author_name'].apply(clean_name) 49 | df['biblio_author_split_lastname'] = df['biblio_author_split_lastname'].apply(clean_name) 50 | df['orcid_lastname'] = df['orcid_lastname'].apply(clean_name) 51 | df['top_lastname'] = df['top_lastname'].apply(clean_name) 52 | not_endwith_orcid_lastname, not_endwith_top_lastname, not_identicalwith_orcid_lastname, not_identicalwith_top_lastname = 0, 0, 0, 0 53 | for i, (biblio_author_name, biblio_author_lastname, orcid_lastname, top_lastname) in df.iterrows(): 54 | if i % 100000 == 0: 55 | print(i * 1.0 / num_instances) 56 | # end with orcid lastname 57 | if not biblio_author_name.endswith(orcid_lastname): 58 | not_endwith_orcid_lastname += 1 59 | # end with top lastname 60 | if not biblio_author_name.endswith(top_lastname): 61 | not_endwith_top_lastname += 1 62 | # identical with orcid lastname 63 | if biblio_author_lastname != orcid_lastname: 64 | not_identicalwith_orcid_lastname += 1 65 | # identical with top lastname 66 | if biblio_author_lastname != top_lastname: 67 | not_identicalwith_top_lastname += 1 68 | print(source, method, not_endwith_orcid_lastname, not_endwith_top_lastname, not_identicalwith_orcid_lastname, 69 | not_identicalwith_top_lastname) 70 | print(source, method, not_endwith_orcid_lastname * 1.0 / num_instances, 71 | not_endwith_top_lastname * 1.0 / num_instances, not_identicalwith_orcid_lastname * 1.0 / num_instances, 72 | not_identicalwith_top_lastname * 1.0 / num_instances) 73 | 74 | # PubMed 75 | df = DBReader.tcp_model_cached_read("cached/XXXXX", """ 76 | select matched_biblio_author_lastname, orcid_lastname, top_lastname 77 | from and_ds.orcid_pubmed_author_linkage_with_author_position_with_topname; 78 | """, cached=False) 79 | print(df.shape) 80 | num_instances = df.shape[0] 81 | df['matched_biblio_author_lastname'] = df['matched_biblio_author_lastname'].apply(clean_name) 82 | df['orcid_lastname'] = df['orcid_lastname'].apply(clean_name) 83 | df['top_lastname'] = df['top_lastname'].apply(clean_name) 84 | not_identicalwith_orcid_lastname, not_identicalwith_top_lastname = 0, 0 85 | 86 | for i, 
(biblio_author_lastname, orcid_lastname, top_lastname) in df.iterrows(): 87 | if i % 100000 == 0: 88 | print(i * 1.0 / num_instances) 89 | if biblio_author_lastname != orcid_lastname: 90 | not_identicalwith_orcid_lastname += 1 91 | # identical with top lastname 92 | if biblio_author_lastname != top_lastname: 93 | not_identicalwith_top_lastname += 1 94 | print('PubMed', '-', not_identicalwith_orcid_lastname, not_identicalwith_top_lastname) 95 | print(not_identicalwith_orcid_lastname * 1.0 / num_instances, not_identicalwith_top_lastname * 1.0 / num_instances) 96 | 97 | # Our dataset 98 | df = DBReader.tcp_model_cached_read("cached/XXXXX", """ 99 | select tupleElement(item, 2) as author_biblio_name, 100 | tupleElement(item, 3) as orcid_last_name, 101 | toInt64(tupleElement(arrayJoin(paper_orcid_lastname_bib_name) as item, 1) as pid) in 102 | (select arrayJoin(flatten(groupArray([pid1, pid2]))) 103 | from and_ds.our_and_dataset_pairwise_gold_standard) as for_pairwise_dataset 104 | from ( 105 | select arrayJoin( 106 | -- full_name_blocks: (num_work, orcid, same_orcidauthor_paper_positions, lastname_variations, same_orcidauthor_paper_repres) 107 | -- same_orcidauthor_paper_repres: [(pid, author_position, orcid, orcid_names, matched_biblio_author, ethnic_seer, ethnea, genni, sex_mac, ssn_gender, pub_year, fos_arr), ..., ] 108 | arrayZip( 109 | arrayMap(x->arrayMap(y->y[1], x.3), full_name_blocks) as tmp_pids, 110 | arrayMap(x->arrayMap(y-> 111 | y.4, x.5), full_name_blocks) as tmp_orcid_names, 112 | arrayMap(x->arrayMap(y-> 113 | y.5, x.5), full_name_blocks) as tmp_bib_names 114 | )) as paper_orcid_names, 115 | 116 | tupleElement(paper_orcid_names, 1) as pids, 117 | arrayMap(x->lowerUTF8(x[2]), tupleElement(paper_orcid_names, 2)) as orcid_last_names, 118 | tupleElement(paper_orcid_names, 3) as author_biblio_names, 119 | length(pids) = length(orcid_last_names) as is_valid, 120 | arrayZip(pids, author_biblio_names, orcid_last_names) as paper_orcid_lastname_bib_name 121 | from and_ds.orcid_mag_matched_fullname_block) 122 | ; 123 | """, cached=False) 124 | 125 | if which_dataset == 'pairwise': 126 | df = df[df['for_pairwise_dataset'] == 1] 127 | 128 | del df['for_pairwise_dataset'] 129 | print(df.shape) 130 | 131 | if applying_transliterating: 132 | df['author_biblio_name'] = df['author_biblio_name'].apply(clean_name) 133 | df['orcid_last_name'] = df['orcid_last_name'].apply(clean_name) 134 | 135 | not_endwith_orcid_lastname = 0 136 | for i, (author_biblio_name, orcid_last_name) in tqdm(df.iterrows(), total=df.shape[0]): 137 | if not author_biblio_name.endswith(orcid_last_name): 138 | not_endwith_orcid_lastname += 1 139 | 140 | print(which_dataset, '%s transliterating' % ('with' if applying_transliterating else 'without'), 'Our dataset', 'Endwith', 141 | not_endwith_orcid_lastname) 142 | num_instances = df.shape[0] 143 | print(not_endwith_orcid_lastname * 1.0 / num_instances) 144 | # Note result 145 | # pairwise without transliterating Our dataset Endwith 181348; 0.09719875824336029 146 | # pairwise with transliterating Our dataset Endwith 122208; 0.06550094761124785 147 | 148 | # block without transliterating Our dataset Endwith 722965; 0.09626437527720622 149 | # block with transliterating Our dataset Endwith 485079; 0.06458933267183324 150 | 151 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 
| 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /src/feature/simple_matching_network_trainer_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from matplotlib import pyplot as plt 9 | from mytookit.data_reader import DBReader 10 | from torch.utils.data.dataset import Dataset 11 | from torchtext import vocab 12 | 13 | from eutilities import train_utils 14 | from eutilities.preprocessor import down_sample 15 | from model.nn import MatchGRU 16 | from myconfig import cached_dir, glove840b300d_path 17 | 18 | underlying_dataset = 'pairwise-gold-standard' 19 | print(underlying_dataset) 20 | glove_vocab_size = ['6B', '840B'][1] 21 | 22 | need_balance_dataset = True 23 | 24 | # # Note we use the 840B model as the word embedding 25 | glove = vocab.GloVe(name=glove_vocab_size, dim=300, cache=glove840b300d_path) 26 | pad_idx = 0 27 | batch_size = 128 28 | epochs = 30 29 | lr = 5e-5 30 | max_sql_len = 300 31 | print(max_sql_len) 32 | 33 | # evice = torch.device('cpu') 34 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 35 | print('use device: ', device) 36 | 37 | 38 | def word_token(txt): 39 | words = txt.lower().split() 40 | tokens = [glove.stoi[word] for word in words if word in glove.stoi] 41 | tokens = tokens[:max_sql_len] if len(tokens) >= max_sql_len else tokens + [pad_idx] * ( 42 | max_sql_len - len(tokens)) 43 | return tokens 44 | 45 | 46 | class ANDDataset(Dataset): 47 | def __init__(self, df): 48 | # self.num_hand_craft_feature_set = ['name_similarity', 'same_biblio_aid', 'pub_year_diff', 'venue_similarity', 49 | # 'aff_similarity'] 50 | # df[self.num_hand_craft_feature_set] = MinMaxScaler().fit_transform(df[self.num_hand_craft_feature_set]) 51 | # df[self.num_hand_craft_feature_set] = StandardScaler().fit_transform(df[self.num_hand_craft_feature_set]) 52 | self.df = df 53 | 54 | def __getitem__(self, index): 55 | data_item = self.df.iloc[index] 56 | pid1 = data_item.pid1 57 | ao1 = data_item.ao1 58 | pid2 = data_item.pid2 59 | ao2 = data_item.ao2 60 | same_author = data_item.same_author 61 | # train1_test0_val2 = data_item.train1_test0_val2 62 | content1 = data_item.content1 63 | content2 = data_item.content2 64 | # hand-craft features 65 | # HF = torch.tensor([data_item[n] for n in self.num_hand_craft_feature_set], 66 | # dtype=torch.float) 67 | XL, XR = torch.tensor(word_token(content1)), torch.tensor(word_token(content2)) 68 | Y = torch.tensor([0], dtype=torch.float) if same_author == 0 else torch.tensor([1], dtype=torch.float) 69 | MT = [int(pid1), int(ao1), int(pid2), int(ao2), int(same_author)] 70 | return MT, XL, XR, Y 71 | 72 | def __len__(self): 73 | return len(self.df) 74 | 75 | 76 | df = DBReader.tcp_model_cached_read("XXXX", 77 | sql="""select * from 
and_ds.matching_network_train_corpus""", 78 | cached=False) 79 | print(df.shape) 80 | # df = df.dropna(0) 81 | 82 | if underlying_dataset == 'pairwise-gold-standard': 83 | data_split_field = 'train1_test0_val2' 84 | print(set(df[data_split_field].values)) 85 | df_train_set = df[df[data_split_field].astype(int) == 1] 86 | # df_train_set = df_train_set.sample(frac=0.8, random_state=42) 87 | df_val_set = df[df[data_split_field].astype(int) == 2] 88 | # Note because we need to give all the instances a similar score, so the infer_set used here is all the instances 89 | df_infer_set = df 90 | elif underlying_dataset == 'block-gold-standard': 91 | pass 92 | 93 | # Note for the training dataset, try to balance the dataset 94 | if need_balance_dataset: 95 | print('pos_samples_num: ', df_train_set[df_train_set['same_author'] == 1].shape[0]) 96 | print('neg_samples_num: ', df_train_set[df_train_set['same_author'] == 0].shape[0]) 97 | df_train_set = down_sample(df_train_set, percent=4) 98 | print('after balancing dataset shape: ', df_train_set.shape) 99 | print('pos_samples_num: ', df_train_set[df_train_set['same_author'] == 1].shape[0]) 100 | print('neg_samples_num: ', df_train_set[df_train_set['same_author'] == 0].shape[0]) 101 | 102 | df_train_set.reset_index(inplace=True, drop=True) 103 | df_val_set.reset_index(inplace=True, drop=True) 104 | df_infer_set.reset_index(inplace=True, drop=True) 105 | 106 | print('df_train shape:', df_train_set.shape, 'df_val shape:', df_val_set.shape, 'df_test shape:', df_infer_set.shape) 107 | train_set = ANDDataset(df_train_set) 108 | val_set = ANDDataset(df_val_set) 109 | infer_set = ANDDataset(df_infer_set) 110 | 111 | # Instantiate the dataset and get data loaders. The training dataset is split into train_set and test_set. 112 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, 113 | num_workers=8) # collate_fn=pad_collate 114 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False, 115 | num_workers=8) # collate_fn=pad_collate 116 | infer_loader = torch.utils.data.DataLoader(dataset=infer_set, batch_size=batch_size, shuffle=False, 117 | num_workers=8) # collate_fn=pad_collate 118 | 119 | model = MatchGRU(glove, hidden_dim=64, num_layers=2, 120 | # num_hand_craft_feature=len(train_set.num_hand_craft_feature_set), 121 | bidirectional=True, output_dim=2).to(device) 122 | print(model) 123 | 124 | # pos_weight (Tensor, optional): a weight of positive examples. Must be a vector with length equal to the number of classes. 
125 | pos_weight = len(df_train_set[df_train_set['same_author'] == 0]) * 1.0 / len(df_train_set[df_train_set['same_author'] == 1]) 126 | 127 | # criterion = nn.MSELoss() 128 | criterion = nn.BCEWithLogitsLoss() 129 | # criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device)) 130 | 131 | parameters = model.parameters() 132 | optimizer = optim.Adam(parameters, lr=lr) 133 | 134 | losst, lossv = [], [] 135 | for epoch in range(1, epochs + 1): 136 | train_utils.train(model, train_loader, criterion, optimizer, epoch, epochs, losst) 137 | train_utils.validate(model, val_loader, criterion, lossv) 138 | if lossv[-1] == min(lossv): # Current best model, push to disk 139 | torch.save({ 140 | 'epoch': epoch, 141 | 'model_state_dict': model.state_dict(), 142 | 'optimizer_state_dict': optimizer.state_dict(), 143 | 'losst': losst[-1], 144 | 'lossv': lossv[-1] 145 | }, os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset))) 146 | 147 | plt.figure(figsize=(5, 3)) 148 | plt.plot(np.arange(1, len(losst) + 1), losst, label="training") 149 | plt.plot(np.arange(1, len(lossv) + 1), lossv, label="validation") 150 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 151 | plt.grid() 152 | plt.title('loss vs epoch') 153 | plt.savefig(os.path.join(cached_dir, 'match-network-training-loss.png'))  # save before plt.show(), otherwise the saved figure may be empty 154 | plt.show() 155 | 156 | checkpoint = torch.load(os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset))) 157 | model.load_state_dict(checkpoint['model_state_dict']) 158 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 159 | print('Epoch:', checkpoint['epoch']) 160 | print('losst:', checkpoint['losst']) 161 | print('lossv:', checkpoint['lossv']) 162 | model.eval() 163 | 164 | # The following code infers similarity scores for the pairwise dataset; the simple text-matching neural network acts as a content-based feature generator. 165 | # In doing so, we find that feeding the paired input to the network in different orders (LEFT-RIGHT or RIGHT-LEFT) may yield different results, 166 | # so we simply score the same paired input twice, once in each order, and use the averaged score as the final score.
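# Note a minimal illustrative sketch of the order-averaging described in the comment above:
# the network is scored on (content1, content2) and on (content2, content1), and the two
# probabilities are averaged downstream (see get_score in classification_metrics.py). The
# helper name average_bidirectional_scores and the dict layout (keys 'pid1-ao1-pid2-ao2',
# probability values as in d1/d2 below) are assumptions for illustration only.
def average_bidirectional_scores(score_lr, score_rl):
    """Average LEFT-RIGHT and RIGHT-LEFT matching probabilities per author pair."""
    shared_keys = score_lr.keys() & score_rl.keys()
    return {k: (float(score_lr[k]) + float(score_rl[k])) / 2.0 for k in shared_keys}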
167 | 168 | # Note inferring LEFT_RIGHT input 169 | d1 = dict() 170 | metadata, true_label_numpy, pred_label_numpy, pred_prob = train_utils.validate(model, infer_loader, criterion, [], 171 | switch_input=False) 172 | print(metadata.shape) 173 | assert metadata.shape[1] == len(true_label_numpy) == len(pred_label_numpy) 174 | 175 | same_author_metadata = metadata[4] 176 | for i, n in enumerate(true_label_numpy): 177 | k = '-'.join(list(map(lambda x: str(x), [metadata[0][i], metadata[1][i], metadata[2][i], metadata[3][i]]))) 178 | m = same_author_metadata[i] 179 | assert n == m 180 | prob = pred_prob[i] 181 | print(n, pred_label_numpy[i], prob) 182 | d1[k] = str(prob) 183 | 184 | # Note inferring RIGHT_LEFT input 185 | d2 = dict() 186 | metadata, true_label_numpy, pred_label_numpy, pred_prob = train_utils.validate(model, infer_loader, criterion, [], 187 | switch_input=True) 188 | print(metadata.shape) 189 | assert metadata.shape[1] == len(true_label_numpy) == len(pred_label_numpy) 190 | 191 | same_author_metadata = metadata[4] 192 | for i, n in enumerate(true_label_numpy): 193 | k = '-'.join(list(map(lambda x: str(x), [metadata[0][i], metadata[1][i], metadata[2][i], metadata[3][i]]))) 194 | m = same_author_metadata[i] 195 | assert n == m 196 | prob = pred_prob[i] 197 | print(n, pred_label_numpy[i], prob) 198 | d2[k] = str(prob) 199 | 200 | d1_keys, d2_keys = set(d1.keys()), set(d2.keys()) 201 | print('number exclusive elements: %d; %d' % ( 202 | len(d1_keys.difference(d1_keys.intersection(d2_keys))), len(d2_keys.difference(d1_keys.intersection(d2_keys))))) 203 | 204 | d = {} 205 | for k in d1_keys: 206 | d[k] = [d1[k], d2[k]] 207 | 208 | with open(os.path.join(cached_dir, 'matching-score-glove%s-%s.json' % (glove_vocab_size, underlying_dataset)), 'w') as fw: 209 | fw.write(json.dumps(d) + '\n') 210 | -------------------------------------------------------------------------------- /src/comparison/pairwise/classification_metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import warnings 4 | from random import random 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sb 9 | 10 | custom_params = {"axes.spines.right": False, "axes.spines.top": False} 11 | sb.set_theme(style="ticks", rc=custom_params) 12 | 13 | from matplotlib import pyplot as plt 14 | from sklearn.utils import shuffle 15 | from eutilities.customized_print import pprint 16 | from eutilities.preprocessor import drop_missing_items 17 | from eutilities.metric import calc_metrics, metric_names 18 | from model.available_model import ModelName 19 | from model.classification import use_classifier 20 | from myconfig import cached_dir, latex_doc_base_dir 21 | from mytookit.data_reader import DBReader 22 | 23 | warnings.filterwarnings('ignore') 24 | 25 | glove_vocab_size = '840B' 26 | underlying_dataset = ['pairwise-gold-standard', 'block-gold-standard'][0] 27 | cluster_uniq_author_gt_1 = False 28 | # cluster_uniq_author_gt_1 = True 29 | 30 | print(underlying_dataset) 31 | 32 | feature_file = os.path.join(cached_dir, 'pairwise_and_dataset_feature_full.tsv') 33 | df = pd.read_csv(feature_file, sep='\t') 34 | column_names = df.columns.values.tolist() 35 | print('column_names: ', column_names) 36 | del df['content1'], df['content2'] 37 | 38 | # if we use the pairwise AND model to disambiguate trimmed-blocks-based dataset, the training dataset should contain "large" fullname 39 | if cluster_uniq_author_gt_1: 40 | num_instances = len(df) 41 | 
block_uniq_author_gt_1 = set(DBReader.tcp_model_cached_read(cached_file_path='xxxxx', 42 | sql='''select block_fullname from and_ds.our_and_dataset_block where num_unique_author_inblock = 1;''', 43 | cached=False)['block_fullname'].values) 44 | df = df[df['fullname'].apply(lambda x: x not in block_uniq_author_gt_1) == 1] 45 | num_instances1 = len(df) 46 | print('removed %d instances which are in small blocks' % (num_instances - num_instances1)) 47 | 48 | print('dataset size before deduplication', df.shape) 49 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0]) 50 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0]) 51 | df.drop_duplicates(keep='first', inplace=True) 52 | print('dataset size after deduplication', df.shape) 53 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0]) 54 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0]) 55 | 56 | mode_names = ModelName.available_modes() 57 | print('available_modes: ', mode_names) 58 | 59 | matching_score_dict = json.loads(open(os.path.join(cached_dir, 60 | 'matching-score-glove%s-%s.json' % (glove_vocab_size, underlying_dataset)) 61 | ).readline()) 62 | print(len(matching_score_dict)) 63 | 64 | 65 | def get_score(row): 66 | try: 67 | k = '-'.join(map(lambda x: str(x), row.values.astype(int))) 68 | if k in matching_score_dict: 69 | # print('hit') 70 | v_left_right, v_right_left = matching_score_dict[k] 71 | v_left_right, v_right_left = float(v_left_right), float(v_right_left) 72 | # v = v if v > 0.5 else 0 73 | v = (v_left_right + v_right_left) / 2 74 | return v 75 | else: 76 | # print('nan') 77 | return np.nan 78 | except Exception as e: 79 | print('error: ', e) 80 | return np.nan 81 | 82 | 83 | df['random'] = df['pid1'].apply(lambda x: random()) 84 | 85 | df['match_score'] = df[['pid1', 'ao1', 'pid2', 'ao2']].apply( 86 | lambda row: get_score(row), axis=1) 87 | 88 | print(df.shape, df[['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity', 89 | 'tfidf_cosin_similarity', 'content_cosin_similarity', 'match_score']].mean()) 90 | 91 | feature_names_groups = [ 92 | ['rand', ['random']], 93 | ['magaid', ['same_biblio_aid']], 94 | ['match_score', ['match_score']], 95 | ['name', ['name_similarity']], 96 | ['bf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity']], 97 | ['bf-cfjaccard', 98 | ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity']], 99 | ['bf-cftfidf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'tfidf_cosin_similarity']], 100 | ['bf-cfdoc2vec', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'content_cosin_similarity']], 101 | ['bf-cfnn', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'match_score']] 102 | ] 103 | 104 | formal_feature_name_dict = {'same_biblio_aid': 'MAG Author ID', 'name_similarity': 'Name Similarity', 105 | 'pub_year_diff': 'Publication Year Gap', 106 | 'venue_similarity': 'Venue Similarity', 'aff_similarity': 'Affiliation Similarity', 107 | 'paper_title_abstract_similarity': r'Content Similarity $cf_{jaccard}$', 108 | 'tfidf_cosin_similarity': r'Content Similarity $cf_{tfidf}$', 109 | 'content_cosin_similarity': r'Content Similarity $cf_{doc2vec}$', 110 | 'match_score': r'Content Similarity $cf_{nn}$'} 111 | 112 | if __name__ == '__main__': 113 | # df.to_csv('tmp.tsv', sep='\t', index=False) 114 | df = shuffle(df) 115 | print(df.head()) 116 | 117 | 
print('original shape: ', df.shape) 118 | df = drop_missing_items(df) 119 | print('after dropping none shape: ', df.shape) 120 | 121 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0]) 122 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0]) 123 | # df = down_sample(df) 124 | # print('after balancing dataset shape: ', df.shape) 125 | # print('pos_samples_num: ', df[df['same_author'] == 1].shape[0]) 126 | # print('neg_samples_num: ', df[df['same_author'] == 0].shape[0]) 127 | 128 | for feature_group_name, feature_names in feature_names_groups: 129 | for idx, model_switch in enumerate(mode_names): 130 | df_copy = df.copy(deep=True) 131 | print('-' * 160) 132 | print(str(model_switch) + '\tused features:\n', '\t'.join(feature_names)) 133 | Y = np.array(df_copy['same_author'].astype('int')) 134 | X = df_copy[feature_names] 135 | # X = scale(X) # TODO scale will improve the performance sightly 136 | X = np.array(X) 137 | 138 | avg_metrics = [] 139 | 140 | # Note we do not using cross validation because the test set is very large 141 | train_test_index = df_copy['train1_test0_val2'].astype('int') 142 | indx_split = [ 143 | ([i for i, n in enumerate(train_test_index) if n == 1], 144 | [i for i, n in enumerate(train_test_index) if n == 0]) 145 | ] 146 | 147 | for round_idx, (train_index, test_index) in enumerate(indx_split): 148 | train_X, train_y = X[train_index], Y[train_index] 149 | test_X, test_y = X[test_index], Y[test_index] 150 | 151 | if len(feature_names) == 1: 152 | # Note if only one feature, then no need to use any classifier. 0.5 is the cut-off value 153 | pred_y, model = test_X, None 154 | else: 155 | # Note if only multiple features, then using a classifier 156 | pred_y, model = use_classifier(train_X, train_y, test_X, model_switch=model_switch) 157 | 158 | # pred_y, model = use_regression(train_X, train_y, test_X, model_switch=model_switch) 159 | # save the model 160 | # file_name = 'cached/lagos-and-rf-model.pkl' 161 | # pickle.dump(model, open(file_name, 'wb')) 162 | importances = model.feature_importances_ 163 | pprint(list(zip(feature_names, importances)), sep='\t') 164 | 165 | if round_idx == 0 and model is not None: 166 | # Note save the model 167 | # joblib.dump(model, os.path.join(cached_dir, 168 | # 'pairwise_and_models/rf-model-with-feature-%s-trained-on-lagos-and-%s-%s-based-dataset.pkl' % ( 169 | # feature_group_name, underlying_dataset, 170 | # 'trimmed' if cluster_uniq_author_gt_1 else 'original'))) 171 | 172 | std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0) 173 | plt.figure(figsize=(6, 4), dpi=300) 174 | plt.grid(linestyle='dashed', linewidth=1, axis='y') 175 | 176 | plt.errorbar([formal_feature_name_dict[n] for n in feature_names], importances, yerr=std, 177 | fmt='D', 178 | # mfc='#C9A66B', 179 | # mec='#662E1C', 180 | ms=3, 181 | mew=3, 182 | ecolor='#AF4425', 183 | lw=3, 184 | ls=':', 185 | color='#AF4425', 186 | capsize=6) 187 | plt.ylabel('Feature Contribution', loc='center') # 'top' 188 | plt.xticks(fontsize=8, rotation=10, ha='center') 189 | plt.tight_layout() 190 | if not cluster_uniq_author_gt_1 and feature_group_name == 'bf-cfnn': 191 | plt.savefig(os.path.join(cached_dir, 'feature-contributions.png'), dpi=600) 192 | plt.savefig(os.path.join(latex_doc_base_dir, 'figs/feature-contributions.png'), dpi=600) 193 | plt.show() 194 | 195 | df_test = pd.DataFrame(df_copy.values[test_index], columns=df_copy.columns.values.tolist()) 196 | df_test[feature_names] = test_X 197 | df_test['test_y'] = test_y 198 | 
df_test['pred_y'] = pred_y 199 | df_test.to_csv(feature_group_name + '_test_instance_predictions.tsv', sep='\t', index=False) 200 | metric_dict = calc_metrics(test_y, pred_y) 201 | metric_tuple = [(m, metric_dict[m]) for m in metric_names] 202 | # pprint(metric_tuple, pctg=True, sep='\t') 203 | avg_metrics.append(metric_dict) 204 | 205 | avg_metric_vals = [np.average([item[m] for item in avg_metrics]) for m in metric_names] 206 | print(metric_names) 207 | pprint(list(zip(metric_names, avg_metric_vals)), pctg=True, sep='\t') 208 | -------------------------------------------------------------------------------- /src/feature/cluster/network_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append('../../') 5 | 6 | from myconfig import glove840b300d_path, cached_dir, device 7 | import joblib 8 | import numpy as np 9 | import torch 10 | from torchtext import vocab 11 | from tqdm import tqdm 12 | from model.nn import MatchGRU 13 | from mytookit.data_reader import DBReader 14 | from nltk.corpus import stopwords 15 | from torch.utils.data.dataset import Dataset 16 | 17 | en_stopwords_set = set(stopwords.words('english')) 18 | 19 | glove_vocab_size = ['6B', '840B'][1] 20 | underlying_dataset = ['pairwise-gold-standard', 'block-gold-standard'][0] 21 | 22 | sql_block = r''' 23 | select block_fullname as block_name, 24 | arrayMap(x->x[1], 25 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos, 26 | arrayMap(x->x[2], tmp) as ground_truths, 27 | arrayMap(x->x[3], tmp) as mag_preds 28 | from (select block_fullname, 29 | author_group_orcid, 30 | -- Note has verified all mag_author_id is successfully matched 31 | toString(aid) as mag_author_id, 32 | concat(toString(pid), '_', toString(author_position)) as pid_ao 33 | from and_ds.our_and_dataset_block any 34 | left join ( 35 | select pid, aid, author_position 36 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 37 | from mag.paper_author_affiliation) any 38 | inner join and_ds.our_and_dataset_block using pid, author_position 39 | ) using pid, author_position) 40 | group by block_name 41 | having xxHash32(block_name) %% 10=%d 42 | order by length(pid_aos) desc 43 | ;''' 44 | 45 | sql_metadata = r''' 46 | select concat(toString(pid), '_', toString(author_position)) as pid_ao, 47 | block_fullname, 48 | author_group_orcid as orcid, 49 | -- -- Note has verified all mag_author_id is successfully matched 50 | -- lowerUTF8(author_name) as author_name, 51 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation, 52 | -- coauthors, 53 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue, 54 | -- pub_year, 55 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content 56 | from and_ds.our_and_dataset_block any 57 | left join ( 58 | select pid, aid, author_position 59 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position 60 | from mag.paper_author_affiliation) any 61 | inner join and_ds.our_and_dataset_block using pid, author_position 62 | ) using pid, author_position 63 | where xxHash32(block_fullname) %% 10=%d 64 | ''' 65 | 66 | # Note ############################################################################################# 67 | # Note test the performance of learnable method. 68 | print('begin load models... 
') 69 | glove = vocab.GloVe(name=glove_vocab_size, dim=300, cache=glove840b300d_path) 70 | pad_idx = 0 71 | batch_size = 640 72 | max_sql_len = 300 73 | print(max_sql_len) 74 | 75 | print('use device: ', device) 76 | 77 | 78 | def word_token(txt): 79 | words = txt.lower().split() 80 | tokens = [glove.stoi[word] for word in words if word in glove.stoi] 81 | tokens = tokens[:max_sql_len] if len(tokens) >= max_sql_len else tokens + [pad_idx] * ( 82 | max_sql_len - len(tokens)) 83 | tokens = np.array(tokens) 84 | return tokens 85 | 86 | 87 | # the model accept the GloVe pretrained word embedding 88 | model = MatchGRU(glove, hidden_dim=64, num_layers=2, 89 | # num_hand_craft_feature=len(train_set.num_hand_craft_feature_set), 90 | bidirectional=True, output_dim=2).to(device) 91 | 92 | model_path = os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset)) 93 | print('model_path: %s' % model_path) 94 | checkpoint = torch.load(model_path) 95 | model.load_state_dict(checkpoint['model_state_dict']) 96 | model.eval() 97 | print('end load models... ') 98 | 99 | 100 | class MyDataset(Dataset): 101 | def __init__(self, all_XL, all_XR): 102 | self.all_XL = all_XL 103 | self.all_XR = all_XR 104 | 105 | def __getitem__(self, index): 106 | XL = self.all_XL[index] 107 | XR = self.all_XR[index] 108 | return XL, XR 109 | 110 | def __len__(self): 111 | return self.all_XL.size(0) 112 | 113 | 114 | def compute_batch_pairwise_similarity(pairwise_dataset): 115 | all_input_loader = torch.utils.data.DataLoader(dataset=pairwise_dataset, 116 | batch_size=batch_size, 117 | # Note shuffle should not be True, 118 | # Note we do not perform shuffle because the output should be in order with the inout samples 119 | shuffle=False) 120 | 121 | prediction1 = torch.tensor([], device=device) 122 | prediction2 = torch.tensor([], device=device) 123 | 124 | for batch_idx, (XL, XR) in enumerate(all_input_loader): 125 | # Note matching similarity 126 | # XL, XR = torch.tensor(word_token(content1), device=device), torch.tensor(word_token(content2), device=device) 127 | with torch.no_grad(): 128 | output = model([XL, XR]) 129 | 130 | # Note if using BCELogistiLoss, the model does not contain the activation layer 131 | pred = output.sigmoid() 132 | 133 | prediction1 = torch.cat((prediction1, pred)) 134 | # prediction2 = torch.cat((prediction2, pred2)) 135 | 136 | # pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()] 137 | 138 | # for i, pid_ao_i in enumerate(pid_aos): 139 | # for j, pid_ao_j in enumerate(pid_aos): 140 | # content_word1, content_word2 = md_content_word_dict[pid_ao_i], md_content_word_dict[pid_ao_j] 141 | # # content1, content2 = md_content_dict[pid_ao_i], md_content_dict[pid_ao_j] 142 | # # doc2vec_v1, doc2vec_v2 = md_doc2vec_emd_dict[pid_ao_i], md_doc2vec_emd_dict[pid_ao_j] 143 | # 144 | # # Note matching similarity 145 | # XL, XR = torch.tensor(word_token(content1), device=device), torch.tensor(word_token(content2), device=device) 146 | # prediction = torch.tensor([], device=device) 147 | # with torch.no_grad(): 148 | # output = model([XL, XR]) 149 | # pred = output.sigmoid() 150 | # prediction = torch.cat((prediction, pred)) 151 | # 152 | # pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()] 153 | # pred_prob = [n[1] for n in prediction.cpu().numpy()] 154 | 155 | return prediction1, prediction2 156 | 157 | 158 | for seg in list(range(0, 10, 1))[::-1]: 159 | sql = sql_metadata % seg 160 | print(sql) 161 | # Note prepare the 
paper metadata dict 162 | # df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False) 163 | df_metadata = DBReader.tcp_model_cached_read( 164 | cached_file_path=os.path.join(cached_dir, 'block_data/block_metadata_%d.pkl' % seg), sql=sql, cached=True) 165 | 166 | print(df_metadata.shape) 167 | # print(df_metadata[['pid_ao', 'content']].values[:100]) 168 | 169 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values)) 170 | md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values)) 171 | 172 | md_content_word_embedding = dict( 173 | zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(lambda x: word_token(x)).values)) 174 | 175 | del df_metadata 176 | 177 | all_block_feature = {} 178 | sql = sql_block % seg 179 | print(sql) 180 | # df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False) 181 | df_block = DBReader.tcp_model_cached_read(cached_file_path=os.path.join(cached_dir, 'block_data/block_data_%d.pkl' % seg), 182 | sql=sql, cached=True) 183 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]): 184 | block_name, pid_aos, ground_truths, mag_preds = row 185 | 186 | # Note calculate the similarity between different metadata according to pid_ao 187 | num_instances = len(pid_aos) 188 | 189 | if num_instances > 700: 190 | block_content_term_ids = torch.tensor(np.array([md_content_word_embedding[pid_ao] for pid_ao in pid_aos]), 191 | device=device) 192 | # Note this block is very large that can not fit into the GPU RAM, thus, we should process the each XL individually. 193 | all_XR = block_content_term_ids 194 | 195 | prediction1, prediction2 = torch.tensor([], device=device), torch.tensor([], device=device) 196 | for i in range(num_instances): 197 | one_XL = block_content_term_ids[i] 198 | one_XL = one_XL.unsqueeze(0).repeat(num_instances, 1) 199 | assert one_XL.shape == all_XR.shape 200 | 201 | pairwised_dataset = MyDataset(one_XL, all_XR) 202 | tmp_prediction1, tmp_prediction2 = compute_batch_pairwise_similarity(pairwised_dataset) 203 | prediction1 = torch.cat((prediction1, tmp_prediction1)) 204 | prediction2 = torch.cat((prediction2, tmp_prediction2)) 205 | else: 206 | block_content_term_ids = torch.tensor(np.array([md_content_word_embedding[pid_ao] for pid_ao in pid_aos]), 207 | device=device) 208 | all_XL = block_content_term_ids.repeat(1, block_content_term_ids.size(0)).view(-1, block_content_term_ids.size(-1)) 209 | all_XR = block_content_term_ids.repeat(block_content_term_ids.size(0), 1) 210 | pairwised_dataset = MyDataset(all_XL, all_XR) 211 | prediction1, prediction2 = compute_batch_pairwise_similarity(pairwised_dataset) 212 | 213 | pred_prob = [prediction1.reshape(num_instances, -1).cpu().numpy().astype(np.float16), 214 | prediction2.reshape(num_instances, -1).cpu().numpy().astype(np.float16)] 215 | 216 | all_block_feature[block_name] = pred_prob 217 | 218 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir, 219 | 'cluster_feature/matching-features-glove840B-%d-with-model-trained-on-%s.pkl' % ( 220 | seg, underlying_dataset))) 221 | -------------------------------------------------------------------------------- /src/comparison/block/clustering_metrics_other_baselines.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append('../../') 5 | import joblib 6 | import numpy as np 7 | from beard import metrics 8 | from 
mytookit.data_reader import DBReader 9 | from sklearn.cluster import AgglomerativeClustering 10 | from tqdm import tqdm 11 | 12 | from myconfig import cached_dir, cli_args, tuned_best_cluster_setting 13 | 14 | ''' Note 15 | This script aims to evaluate some semi-supervised learning baseline methods. To do this, We 16 | 1) trained pairwise disambiguation models on the TRAINING set; 17 | 2) use these model to search the most effect parameters on the DEV set; 18 | 3) last, using the pairwise AND models and the optimal clustering parameters to perform clustering on the TEST set. 19 | 20 | Note that we also conduct experiments on the slimmer LAGOS-AND dataset created by removing those blocks containing only one author 21 | ''' 22 | 23 | # Note hyper-parameters ################################################################################ 24 | underlying_dataset = 'pairwise-gold-standard' 25 | # cluster_uniq_author_gt_1 = False 26 | cluster_uniq_author_gt_1 = True 27 | which_model = cli_args.which_model 28 | print(which_model) 29 | HAC_distance_threshold_trials = range(1, 11, 1) 30 | 31 | # Note step 1 ########################################################################################## 32 | # Note load the trained model, which is trained on the block-based LAGOS-AND training set 33 | cached_file_base_dir = os.path.join(cached_dir, 'cluster_feature') 34 | available_model_names = ['name', 'bf', 'bf-cfjaccard', 'bf-cftfidf', 'bf-cfdoc2vec', 'bf-cfnn'] 35 | available_feature_masks = [[0], [0, 1, 2, 3], [0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6], [0, 1, 2, 3, 7]] 36 | 37 | # Note load all the pairwise AND models 38 | available_models = [] 39 | for n in available_model_names: 40 | model_path = os.path.join(cached_dir, 41 | 'pairwise_and_models/rf-model-with-feature-%s-trained-on-lagos-and-%s-%s-based-dataset.pkl' % ( 42 | n, underlying_dataset, 'trimmed' if cluster_uniq_author_gt_1 else 'original')) 43 | print(model_path) 44 | available_models.append(joblib.load(model_path)) 45 | 46 | current_model = available_model_names[which_model] 47 | ml_model = available_models[which_model] 48 | feature_mask = available_feature_masks[which_model] 49 | print('current_model: ', current_model) 50 | 51 | # Note step 2 ########################################################################################## 52 | # Note load the DEV and TEST set 53 | df_blocks = DBReader.tcp_model_cached_read(cached_file_path=os.path.join(cached_dir, 'lagos-and-block-info.pkl'), 54 | sql=r'''select block_name, pid_aos, ground_truths, mag_preds, seg, train1_test0_val2, num_unique_author_inblock, num_citaion_in_block from and_ds.our_and_dataset_block_with_block_info;''', 55 | cached=True) 56 | print(df_blocks.shape) 57 | 58 | # Note this is very important here as it will greatly reduce the size of the dataset 59 | if cluster_uniq_author_gt_1: 60 | num_instances = len(df_blocks) 61 | df_blocks = df_blocks[df_blocks['num_unique_author_inblock'] > 1] 62 | num_instances1 = len(df_blocks) 63 | print('removed %d instances, enable each block containing more than one unique authors' % (num_instances - num_instances1)) 64 | # del df_blocks['num_unique_author_inblock'], df_blocks['num_citaion_in_block'] 65 | 66 | df_train_blocks = df_blocks[df_blocks['train1_test0_val2'] == 1] 67 | df_test_blocks = df_blocks[df_blocks['train1_test0_val2'] == 0] 68 | df_val_blocks = df_blocks[df_blocks['train1_test0_val2'] == 2] 69 | print('train/val/test block sizes', df_train_blocks.shape, df_val_blocks.shape, df_test_blocks.shape) 70 | del 
df_blocks, df_train_blocks['train1_test0_val2'], df_test_blocks['train1_test0_val2'], df_val_blocks['train1_test0_val2'] 71 | 72 | 73 | # Note step 3 ########################################################################################## 74 | # Note eval on the DEV set, trying to find the best clustering parameters 75 | def merge_feature(five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature): 76 | # set1==set2 compares for equality of each element in both the sets, 77 | # and evaluates to true if and only if both the sets are exactly same. 78 | assert five_fast_feature.keys() == tfidf_feature.keys() == dov2vec_feature.keys() == matching_feature.keys() 79 | avg_feature_values = [] 80 | merged_feature_map = {} 81 | for k in matching_feature.keys(): 82 | fv1, fv2, fv3, (fv41, fv42) = five_fast_feature[k], tfidf_feature[k], dov2vec_feature[k], matching_feature[k] 83 | assert fv1.shape[:2] == fv2.shape[:2] == fv3.shape[:2] == fv41.shape[:2] 84 | num_authors = fv1.shape[0] 85 | # Note all these are numpy array while permuted the feature order, 86 | # Note making it aligned with the order of the original feature training the AND models 87 | 88 | # feature_names_groups = [ 89 | # # ['rand', ['random']], 90 | # # ['magaid', ['same_biblio_aid']], 91 | # ['name', ['name_similarity']], 92 | # ['bf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity']], 93 | # ['bf-cfjaccard', 94 | # ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity']], 95 | # ['bf-cftfidf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'tfidf_cosin_similarity']], 96 | # ['bf-cfdoc2vec', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'content_cosin_similarity']], 97 | # ['bf-cfnn', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'match_score']] 98 | # ] 99 | 100 | # fv41[fv41 <= 0.5] = 0 101 | 102 | # Note convert the 2D numpy to a symmetry 2D matrix 103 | fv41 = (fv41 + fv41.T) / 2 104 | 105 | tmp_concat_feature = np.concatenate( 106 | (np.expand_dims(fv1[:, :, 0], axis=2), # name_similarity, 0 107 | np.expand_dims(fv1[:, :, 1], axis=2), # pub_year_diff, 1 108 | np.expand_dims(fv1[:, :, 3], axis=2), # venue_similarity, 2 109 | np.expand_dims(fv1[:, :, 4], axis=2), # aff_similarity, 3 110 | np.expand_dims(fv1[:, :, 2], axis=2), # paper_title_abstract_similarity, 4 111 | np.expand_dims(fv2, axis=2), # tfidf, 5 112 | np.expand_dims(fv3, axis=2), # dov2vec, 6 113 | np.expand_dims(fv41, axis=2), # nn1 sigmoid, 7 114 | ), 115 | axis=2) 116 | 117 | tmp_avg_feature_value = [[num_authors * num_authors, np.sum(tmp_concat_feature[:, :, i].view().reshape(-1))] for i in 118 | range(0, 8, 1)] 119 | avg_feature_values.append(tmp_avg_feature_value) 120 | 121 | # print(tmp_concat_feature.shape) 122 | merged_feature_map[k] = tmp_concat_feature 123 | 124 | # avg_feature_values = np.array(avg_feature_values) 125 | # avg_feature_values = [np.sum(avg_feature_values[:, i, 1]) / np.sum(avg_feature_values[:, i, 0]) for i in range(0, 8, 1)] 126 | # print('feature average values: ', avg_feature_values) 127 | # feature average values: [0.8576142289367379, 5.90870462549071, 0.17193027950163528, 0.32441071932990906, 0.08016590954082886, 0.13715612273098102, 0.2845889716223027, 0.7829814895889327, 0.8295835544720577] 128 | return merged_feature_map 129 | 130 | 131 | def data_precision_round(arr, precision=2, pctg=True): 132 | return [round(x * 100 if pctg else x, precision) for x in 
arr] 133 | 134 | 135 | def clustering_over_input_blocks(cluster_algo, input_df_blocks): 136 | all_clustering_metrics = [] 137 | all_clustering_predictions = {} 138 | segments = range(0, 10, 1) 139 | for seg in segments: 140 | # Note loading DEV block information 141 | df_seg = input_df_blocks[input_df_blocks['seg'] == seg] 142 | del df_seg['seg'] 143 | 144 | # Note loading the cached feature data 145 | merged_feature_path = os.path.join(cached_file_base_dir, 'merged_features-gold-standard-%d.pkl' % seg) 146 | # merged_feature_path = os.path.join(cached_dir, 'temp/merged_features-%d.pkl' % seg) 147 | if os.path.exists(merged_feature_path): 148 | merged_feature_map = joblib.load(merged_feature_path) 149 | else: 150 | # Note consolidating the features into one feature file 151 | five_fast_feature = joblib.load(os.path.join(cached_file_base_dir, 'five-fast-features-%d.pkl' % seg)) 152 | tfidf_feature = joblib.load(os.path.join(cached_file_base_dir, 'tfidf-feature-%d.pkl' % seg)) 153 | dov2vec_feature = joblib.load(os.path.join(cached_file_base_dir, 'doc2vec-feature-%d.pkl' % seg)) 154 | # matching_feature = joblib.load(os.path.join(cached_file_base_dir, 'matching-features-%d.pkl' % seg)) 155 | # matching_feature = joblib.load(os.path.join(cached_file_base_dir, 'matching-features-glove840B-%d.pkl' % seg)) 156 | matching_feature = joblib.load(os.path.join(cached_file_base_dir, 157 | 'matching-features-glove840B-%d-with-model-trained-on-%s.pkl' % ( 158 | seg, underlying_dataset))) 159 | 160 | print(len(five_fast_feature), len(dov2vec_feature), len(matching_feature)) 161 | merged_feature_map = merge_feature(five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature) 162 | del five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature 163 | joblib.dump(merged_feature_map, merged_feature_path) 164 | 165 | for ij, row in tqdm(df_seg.iterrows(), total=df_seg.shape[0]): 166 | block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock, num_citaiton_in_block = row 167 | num_authors = len(pid_aos) 168 | 169 | block_feature_matrix = merged_feature_map[block_name] 170 | assert block_feature_matrix.shape[:2] == (num_authors, num_authors) 171 | 172 | # Note squared predictions based on the giving features 173 | block_feature_matrix = block_feature_matrix[:, :, feature_mask] 174 | block_flatten_feature_vector = block_feature_matrix.view().reshape(-1, block_feature_matrix.shape[-1]) 175 | block_flatten_predictions = ml_model.predict_proba(block_flatten_feature_vector)[:, 1] 176 | 177 | # ground_truths_1D = np.array([[1 if aa == bb else 0 for aa in ground_truths] for bb in ground_truths]).reshape(-1) 178 | # for k, _ in enumerate(feature_mask): 179 | # print(k, stats.spearmanr(block_flatten_feature_vector[:, k], ground_truths_1D)[0]) 180 | # print(stats.spearmanr(block_flatten_predictions, ground_truths_1D)[0]) 181 | 182 | block_flatten_predictions = 1 - block_flatten_predictions # convert to distance matrix 183 | block_squared_predictions = block_flatten_predictions.reshape(num_authors, num_authors) 184 | 185 | # block_squared_predictions = 1 - block_feature_matrix[:, :, 8] 186 | 187 | # Note clustering on the block_squared_predictions using DBSCAN 188 | # cluster = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed') # , n_jobs=-1 ``-1`` means using all processors 189 | # cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, affinity='precomputed', 190 | # linkage='single') 191 | 192 | # Note the input of clustering algorithm is the 
distance matrix 193 | cluster_labels = cluster_algo.fit_predict(X=block_squared_predictions) 194 | all_clustering_predictions[block_name] = [cluster_labels, ground_truths] 195 | 196 | # print(block_name, len(ground_truths), len(set(ground_truths)), cluster_labels) 197 | 198 | # Note compare the cluster_labels with the ground truth and calculate the metrics 199 | block_metrics_b3 = metrics.b3_precision_recall_fscore(labels_true=ground_truths, labels_pred=cluster_labels) 200 | block_metrics_pairwisef = metrics.paired_precision_recall_fscore(labels_true=ground_truths, 201 | labels_pred=cluster_labels) 202 | all_clustering_metrics.append( 203 | [block_name] + data_precision_round(list(block_metrics_b3 + block_metrics_pairwisef), pctg=False)) 204 | 205 | # if np.random.random() < 0.001: 206 | # print('intermediate results: ', np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0)) 207 | 208 | return all_clustering_metrics, all_clustering_predictions 209 | 210 | 211 | if tuned_best_cluster_setting is None: 212 | print('evaluating ...') 213 | best_cluster_setting = None 214 | best_metric = -1 215 | metric_tendencies = [] 216 | for cluster_setting in HAC_distance_threshold_trials: 217 | distance_threshold = 0.2 + cluster_setting * 0.01 218 | 219 | cluster_algo = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, 220 | affinity='precomputed', linkage='single') 221 | all_clustering_metrics, all_clustering_predictions = clustering_over_input_blocks(cluster_algo, df_val_blocks) 222 | 223 | # Note computer average metrics 224 | avg_metrics = np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0) 225 | print(avg_metrics) 226 | bp, br, bf, pp, pr, pf = avg_metrics 227 | metric_tendencies.append([metric_tendencies, bf]) 228 | if best_metric < bf: 229 | print('updated the best clustering B3-F1 metric from %f to %f, and the the corresponding clustering setting is %f' % ( 230 | best_metric, bf, distance_threshold)) 231 | best_metric = bf 232 | best_cluster_setting = distance_threshold 233 | 234 | # plt.plot([n[0] for n in metric_tendencies], [n[1] for n in metric_tendencies]) 235 | # plt.title(current_model) 236 | # plt.savefig(os.path.join(cached_dir, 'cluster_parameter_tuning/%s.png' % current_model), dpi=600) 237 | # plt.show() 238 | 239 | print('the best_cluster_setting for current_model: %s is %f' % (current_model, best_cluster_setting)) 240 | tuned_best_cluster_setting = best_cluster_setting 241 | 242 | # Note step 3 ########################################################################################## 243 | # Note clustering on the the block-based LAGOS-AND test set and calculating the metrics 244 | print('evaluating on the test set using the parameter %f ...' 
% tuned_best_cluster_setting) 245 | tuned_cluster_algo = AgglomerativeClustering(n_clusters=None, distance_threshold=tuned_best_cluster_setting, 246 | affinity='precomputed', linkage='single') 247 | all_clustering_metrics, all_clustering_predictions = clustering_over_input_blocks(tuned_cluster_algo, 248 | # Note must use the TEST set 249 | df_test_blocks) 250 | # Note computer average metrics 251 | avg_metrics = np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0) 252 | print('avg_metrics: ', avg_metrics) 253 | bp, br, bf, pp, pr, pf = avg_metrics 254 | 255 | joblib.dump([avg_metrics, all_clustering_metrics, all_clustering_predictions], 256 | os.path.join(cached_dir, 'cluster_metrics/all-metrics-predictions-%s-%f.pkl' % 257 | (current_model, tuned_best_cluster_setting)) 258 | ) 259 | -------------------------------------------------------------------------------- /src/datacheck/checking_multi_facets.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | 4 | import numpy as np 5 | from matplotlib import pyplot as plot 6 | 7 | from myconfig import cached_dir, latex_doc_base_dir 8 | 9 | plot.rcParams['font.family'] = 'serif' 10 | plot.rcParams['font.serif'] = ['Times New Roman'] + plot.rcParams['font.serif'] 11 | 12 | from mytookit.data_reader import DBReader 13 | 14 | colors = ['green', 'gold', 'red', 'black', 'cyan', 'blue', 'magenta', 'purple', 'gray', 'fuchsia', 'orange', 'yellow'] 15 | linestyles = ['--', '-.', ':', '--'] 16 | line_markers = ['<', '>', '^', 'v'] 17 | linewidth = 4 18 | tick_font_size = 14 19 | df_whole = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "whole_mag_representativeness_distribution.pkl"), 20 | "select * from and_ds.whole_mag_representativeness_distribution;", 21 | cached=True) 22 | print('df_whole.shape', df_whole.shape) 23 | # ['check_item' 'distribution'] 24 | print(df_whole['check_item'].values) 25 | # for i, (check_item, distribution) in df_whole.iterrows(): 26 | # print(check_item, len(distribution), distribution[:5]) 27 | # pub_year, author_position, lastname_popularity, ssn_gender-sex, sex_mac-sex, ethnic-seer, genni-sex, ethnea 28 | whole_pub_year_dist = df_whole[df_whole['check_item'] == 'pub_year']['distribution'].values[0] 29 | whole_author_position_dist = df_whole[df_whole['check_item'] == 'author_position']['distribution'].values[0] 30 | whole_mac_gender_dist = df_whole[df_whole['check_item'] == 'sex_mac-sex']['distribution'].values[0] 31 | whole_genni_gender_dist = df_whole[df_whole['check_item'] == 'genni-sex']['distribution'].values[0] 32 | whole_ssn_gender_dist = df_whole[df_whole['check_item'] == 'ssn_gender-sex']['distribution'].values[0] 33 | whole_lastname_popularity_dist = df_whole[df_whole['check_item'] == 'lastname_popularity']['distribution'].values[0] 34 | whole_lastname_first_initial_popularity_dist = \ 35 | df_whole[df_whole['check_item'] == 'lastname_first_initial_popularity']['distribution'].values[0] 36 | # whole_lastname_popularity_dist = df_whole[df_whole['check_item'] == 'lastname']['distribution'].values[0] 37 | # whole_lastname_first_initial_popularity_dist = df_whole[df_whole['check_item'] == 'lastname_first_initial']['distribution'].values[0] 38 | whole_ethnic_seer_dist = df_whole[df_whole['check_item'] == 'ethnic-seer']['distribution'].values[0] 39 | whole_ethnea_dist = df_whole[df_whole['check_item'] == 'ethnea']['distribution'].values[0] 40 | whole_fos_dist = df_whole[df_whole['check_item'] == 'fos']['distribution'].values[0] 41 | 
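# Note a minimal sketch, not used by this script, of the normalization pattern repeated in
# every plot_* function below: sort the (value, count) pairs, sum the counts, and divide each
# count by the total so that the MAG and LAGOS-AND distributions are compared as proportions
# rather than raw counts. The helper name to_proportions is introduced here for illustration.
def to_proportions(counter_items, key=lambda x: x[0]):
    """Turn a list of (value, count) pairs into sorted values and their proportions."""
    items = sorted(counter_items, key=key)
    total = float(sum(cnt for _, cnt in items)) or 1.0
    return [val for val, _ in items], [cnt / total for _, cnt in items]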
42 | print(whole_pub_year_dist[:5]) 43 | print(whole_author_position_dist[:5]) 44 | print(whole_mac_gender_dist[:5]) 45 | print(whole_genni_gender_dist[:5]) 46 | print(whole_ssn_gender_dist[:5]) 47 | print(whole_lastname_popularity_dist[:5]) 48 | print(whole_ethnic_seer_dist[:5]) 49 | print(whole_ethnea_dist[:5]) 50 | 51 | df_block = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "orcid_mag_matched_representativeness.pkl"), 52 | sql="""select * from and_ds.our_dataset_representativeness;""", 53 | cached=True) 54 | # ['pid' 'orcid' 'author_position' 'lastname' 'ethnic_seer' 'ethnea' 'genni', 'sex_mac' 'ssn_gender' 'pub_year'] 55 | print(len(df_block.columns.values), df_block.columns.values) 56 | # df_sample = df_block[:10] 57 | 58 | # Note distribution of various aspects of the block-based dataset 59 | pub_year_counter_block = sorted(collections.Counter(df_block['pub_year'].values).items(), key=lambda x: x[0], reverse=False) 60 | author_position_counter_block = sorted(collections.Counter(df_block['author_position'].values).items(), key=lambda x: x[0], 61 | reverse=False) 62 | author_genni_gender_counter_block = sorted(collections.Counter(df_block['genni'].values).items(), key=lambda x: x[0], 63 | reverse=False) 64 | 65 | author_sex_mac_counter_block = sorted(collections.Counter(df_block['sex_mac'].values).items(), key=lambda x: x[0], 66 | reverse=False) 67 | author_ssn_gender_counter_block = sorted(collections.Counter(df_block['ssn_gender'].values).items(), key=lambda x: x[0], 68 | reverse=False) 69 | 70 | author_ethnic_seer_counter_block = sorted(collections.Counter(df_block['ethnic_seer'].values).items(), key=lambda x: x[0], 71 | reverse=False) 72 | author_ethnea_counter_block = sorted(collections.Counter(df_block['ethnea'].values).items(), key=lambda x: x[0], 73 | reverse=False) 74 | author_lastname_counter_block = sorted(collections.Counter(df_block['lastname'].values).items(), key=lambda x: x[1], 75 | reverse=True) 76 | author_lastname_counter_block = sorted(collections.Counter([n[1] for n in author_lastname_counter_block]).items(), 77 | key=lambda x: x[0], 78 | reverse=True) 79 | lastname_first_initial_counter_block = sorted(collections.Counter(df_block['lastname_first_initial'].values).items(), 80 | key=lambda x: x[1], reverse=True) 81 | lastname_first_initial_counter_block = sorted(collections.Counter([n[1] for n in lastname_first_initial_counter_block]).items(), 82 | key=lambda x: x[0], 83 | reverse=True) 84 | fos_counter_block = sorted( 85 | collections.Counter([n for n in np.hstack(df_block['fos_arr'].values) if len(n) > 0]).items(), key=lambda x: x[1], 86 | reverse=False) 87 | 88 | # Note distribution of various aspects of the pairwise-based dataset 89 | df_pairwise = DBReader.tcp_model_cached_read("xxx", 90 | sql="""select * from and_ds.our_dataset_pairwise_representativeness;""", 91 | cached=False) 92 | print('df_pairwise.shape before adjustment', df_pairwise.shape) 93 | 94 | pub_year_counter_pairwise = sorted(collections.Counter(df_pairwise['pub_year'].values).items(), key=lambda x: x[0], reverse=False) 95 | author_position_counter_pairwise = sorted(collections.Counter(df_pairwise['author_position'].values).items(), key=lambda x: x[0], 96 | reverse=False) 97 | author_genni_gender_counter_pairwise = sorted(collections.Counter(df_pairwise['genni'].values).items(), key=lambda x: x[0], 98 | reverse=False) 99 | author_sex_mac_counter_pairwise = sorted(collections.Counter(df_pairwise['sex_mac'].values).items(), key=lambda x: x[0], 100 | reverse=False) 101 | 
author_ssn_gender_counter_pairwise = sorted(collections.Counter(df_pairwise['ssn_gender'].values).items(), key=lambda x: x[0], 102 | reverse=False) 103 | author_ethnic_seer_counter_pairwise = sorted(collections.Counter(df_pairwise['ethnic_seer'].values).items(), key=lambda x: x[0], 104 | reverse=False) 105 | author_ethnea_counter_pairwise = sorted(collections.Counter(df_pairwise['ethnea'].values).items(), key=lambda x: x[0], 106 | reverse=False) 107 | author_lastname_counter_pairwise = sorted(collections.Counter(df_pairwise['lastname'].values).items(), key=lambda x: x[1], 108 | reverse=True) 109 | author_lastname_counter_pairwise = sorted(collections.Counter([n[1] for n in author_lastname_counter_pairwise]).items(), 110 | key=lambda x: x[0], 111 | reverse=True) 112 | lastname_first_initial_counter_pairwise = sorted(collections.Counter(df_pairwise['lastname_first_initial'].values).items(), 113 | key=lambda x: x[1], reverse=True) 114 | lastname_first_initial_counter_pairwise = sorted( 115 | collections.Counter([n[1] for n in lastname_first_initial_counter_pairwise]).items(), 116 | key=lambda x: x[0], 117 | reverse=True) 118 | fos_counter_pairwise = sorted( 119 | collections.Counter([n for n in np.hstack(df_pairwise['fos_arr'].values) if len(n) > 0]).items(), key=lambda x: x[1], 120 | reverse=False) 121 | 122 | 123 | def plot_pub_year(whole_pub_year_dist, counter_block, counter_pairwise, check_item): 124 | whole_pub_year_dist = [n for n in whole_pub_year_dist if 1970 <= int(n[0]) <= 2018] 125 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist]) 126 | whole_pub_year = [int(n[0]) for n in whole_pub_year_dist] 127 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist] 128 | 129 | pub_year_counter1 = [n for n in counter_block if 1970 <= n[0] <= 2018] 130 | pub_cnt1 = sum([n[1] for n in pub_year_counter1]) 131 | pub_year1 = [n[0] for n in pub_year_counter1] 132 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1] 133 | 134 | pub_year_counter2 = [n for n in counter_pairwise if 1970 <= n[0] <= 2018] 135 | pub_cnt2 = sum([n[1] for n in pub_year_counter2]) 136 | pub_year2 = [n[0] for n in pub_year_counter2] 137 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2] 138 | 139 | # plot.figure() 140 | idx = 0 141 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx], 142 | # marker=line_markers[idx], markersize=8, markevery=0.2, 143 | color=colors[idx], label='MAG', linewidth=linewidth) 144 | idx = 1 145 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx], 146 | # marker=line_markers[idx], markersize=8, markevery=0.2, 147 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth) 148 | idx = 2 149 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx], 150 | # marker=line_markers[idx], markersize=8, markevery=0.2, 151 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth) 152 | 153 | # plot.yscale('log') 154 | plot.title(check_item, fontsize=18) 155 | plot.xlabel('Year', loc='right', fontsize=18) 156 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top' 157 | plot.xticks(fontsize=tick_font_size) 158 | plot.yticks(fontsize=tick_font_size) 159 | plot.legend(loc='best') # 'lower right' 160 | 161 | 162 | def plot_author_position(whole_author_position_dist, counter_block, counter_pairwise, check_item): 163 | whole_pub_year_dist = [n for n in whole_author_position_dist if int(n[0]) <= 15] 164 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist]) 165 | whole_pub_year = [int(n[0]) for n in 
whole_pub_year_dist] 166 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist] 167 | 168 | pub_year_counter1 = [n for n in counter_block if n[0] <= 15] 169 | pub_cnt1 = sum([n[1] for n in pub_year_counter1]) 170 | pub_year1 = [n[0] for n in pub_year_counter1] 171 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1] 172 | 173 | pub_year_counter2 = [n for n in counter_pairwise if n[0] <= 15] 174 | pub_cnt2 = sum([n[1] for n in pub_year_counter2]) 175 | pub_year2 = [n[0] for n in pub_year_counter2] 176 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2] 177 | 178 | # plot.figure() 179 | idx = 0 180 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx], 181 | # marker=line_markers[idx], markersize=8, markevery=0.2, 182 | color=colors[idx], label='MAG', linewidth=linewidth) 183 | idx = 1 184 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx], 185 | # marker=line_markers[idx], markersize=8, markevery=0.2, 186 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth) 187 | idx = 2 188 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx], 189 | # marker=line_markers[idx], markersize=8, markevery=0.2, 190 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth) 191 | 192 | plot.yscale('log') 193 | plot.title(check_item, fontsize=18) 194 | plot.xlabel('Author Position', loc='right', fontsize=18) 195 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top' 196 | plot.xticks(fontsize=tick_font_size) 197 | plot.yticks(fontsize=tick_font_size) 198 | plot.legend(loc='best') # 'lower right' 199 | 200 | 201 | def plot_author_gender(whole_genni_gender_dist, counter_block, counter_pairwise, check_item): 202 | x_label_map = {'-': 'Unsure', 'F': 'Female', 'M': 'Male', '': ''} 203 | whole_pub_year_dist = sorted(whole_genni_gender_dist, key=lambda x: x[0], reverse=False) 204 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist]) 205 | whole_pub_year = [x_label_map[n[0]] for n in whole_pub_year_dist] 206 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist] 207 | 208 | pub_year_counter1 = sorted([n for n in counter_block if x_label_map[n[0]] in set(whole_pub_year)], 209 | key=lambda x: x[0], reverse=False) 210 | pub_cnt1 = sum([n[1] for n in pub_year_counter1]) 211 | pub_year1 = [x_label_map[n[0]] for n in pub_year_counter1] 212 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1] 213 | 214 | pub_year_counter2 = sorted([n for n in counter_pairwise if x_label_map[n[0]] in set(whole_pub_year)], 215 | key=lambda x: x[0], reverse=False) 216 | pub_cnt2 = sum([n[1] for n in pub_year_counter2]) 217 | pub_year2 = [x_label_map[n[0]] for n in pub_year_counter2] 218 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2] 219 | 220 | # plot.figure() 221 | idx = 0 222 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx], 223 | # marker=line_markers[idx], markersize=8, markevery=0.2, 224 | color=colors[idx], label='MAG', linewidth=linewidth) 225 | idx = 1 226 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx], 227 | # marker=line_markers[idx], markersize=8, markevery=0.2, 228 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth) 229 | idx = 2 230 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx], 231 | # marker=line_markers[idx], markersize=8, markevery=0.2, 232 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth) 233 | 234 | plot.title(check_item, fontsize=18) 235 | plot.xlabel('Gender', loc='right', 
fontsize=18) 236 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top' 237 | plot.xticks(fontsize=tick_font_size) 238 | plot.yticks(fontsize=tick_font_size) 239 | 240 | plot.legend(loc='best') # 'lower right' 241 | 242 | 243 | def plot_ethnic_seer(whole_ethnic_seer_dist, counter_block, counter_pairwise, check_item): 244 | whole_pub_year_dist = sorted(whole_ethnic_seer_dist, key=lambda x: x[1], reverse=False) 245 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist]) 246 | whole_pub_year = [n[0] for n in whole_pub_year_dist] 247 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist] 248 | 249 | keys1 = [n[0] for n in counter_block] 250 | pub_year_counter1 = [counter_block[keys1.index(n)] if n in keys1 else (n, 0) for n in whole_pub_year] 251 | pub_cnt1 = sum([n[1] for n in pub_year_counter1]) 252 | pub_year1 = [n[0] for n in pub_year_counter1] 253 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1] 254 | 255 | keys2 = [n[0] for n in counter_pairwise] 256 | pub_year_counter2 = [counter_block[keys2.index(n)] if n in keys2 else (n, 0) for n in whole_pub_year] 257 | pub_cnt2 = sum([n[1] for n in pub_year_counter2]) 258 | pub_year2 = [n[0] for n in pub_year_counter2] 259 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2] 260 | 261 | # plot.figure() 262 | idx = 0 263 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx], 264 | # marker=line_markers[idx], markersize=8, markevery=0.2, 265 | color=colors[idx], label='MAG', linewidth=linewidth) 266 | idx = 1 267 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx], 268 | # marker=line_markers[idx], markersize=8, markevery=0.2, 269 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth) 270 | idx = 2 271 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx], 272 | # marker=line_markers[idx], markersize=8, markevery=0.2, 273 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth) 274 | 275 | # plot.xscale('log') 276 | plot.yscale('log') 277 | plot.title(check_item, fontsize=18) 278 | plot.xlabel('Ethnicity', loc='right', fontsize=18) 279 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top' 280 | plot.xticks(fontsize=tick_font_size - 4) 281 | plot.yticks(fontsize=tick_font_size) 282 | 283 | plot.legend(loc='best') # 'lower right' 284 | 285 | 286 | # def plot_ethnea(whole_ethnic_seer_dist, author_ethnic_seer_counter, check_item): 287 | # whole_pub_year_dist = sorted(whole_ethnic_seer_dist, key=lambda x: x[1], reverse=False) 288 | # all_pub_cnt = sum([n[1] for n in whole_pub_year_dist]) 289 | # whole_pub_year = [n[0] for n in whole_pub_year_dist] 290 | # whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist] 291 | # 292 | # keys = [n[0] for n in author_ethnic_seer_counter] 293 | # pub_year_counter = [author_ethnic_seer_counter[keys.index(n)] if n in keys else (n, 0) for n in whole_pub_year] 294 | # pub_cnt = sum([n[1] for n in pub_year_counter]) 295 | # pub_year = [n[0] for n in pub_year_counter] 296 | # pub_count = [n[1] * 1.0 / pub_cnt for n in pub_year_counter] 297 | # 298 | # # plot.figure() 299 | # idx = 0 300 | # plot.loglog(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx], 301 | # # marker=line_markers[idx], markersize=8, markevery=0.2, 302 | # color=colors[idx], label='MAG', linewidth=linewidth) 303 | # idx = 1 304 | # plot.loglog(pub_year, pub_count, linestyle=linestyles[idx], 305 | # # marker=line_markers[idx], markersize=8, markevery=0.2, 306 | # color=colors[idx], label='LAGOS-AND', 
307 | #     plot.title(check_item, fontsize=18)
308 | #     plot.xlabel('ethnicity', loc='right', fontsize=18)
309 | #     plot.ylabel('ethnicity proportion', loc='center', fontsize=18) # 'top'
310 | #     plot.legend(loc='best') # 'lower right'
311 | 
312 | 
313 | def plot_lastname_popularity(whole_lastname_popularity_dist, counter_block, counter_pairwise, check_item):
314 |     # ratio = len(author_lastname_counter) * 1.0 / len(whole_lastname_popularity_dist)
315 |     # used_for_plot_ratio = 1
316 |     whole_pub_year_dist = sorted([n for n in whole_lastname_popularity_dist],
317 |                                  # if random() <= used_for_plot_ratio * ratio
318 |                                  key=lambda x: int(x[0]), reverse=False)
319 |     all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
320 |     whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
321 |     whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
322 | 
323 |     pub_year_counter1 = sorted([n for n in counter_block], # if random() <= used_for_plot_ratio
324 |                                key=lambda x: int(x[0]), reverse=False)
325 |     pub_cnt1 = sum([n[1] for n in pub_year_counter1])
326 |     pub_year1 = [int(n[0]) for n in pub_year_counter1]
327 |     pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
328 | 
329 |     pub_year_counter2 = sorted([n for n in counter_pairwise], # if random() <= used_for_plot_ratio
330 |                                key=lambda x: int(x[0]), reverse=False)
331 |     pub_cnt2 = sum([n[1] for n in pub_year_counter2])
332 |     pub_year2 = [int(n[0]) for n in pub_year_counter2]
333 |     pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
334 | 
335 |     print(whole_pub_year_dist[0], whole_pub_year_dist[-1])
336 |     print(pub_year_counter1[0], pub_year_counter1[-1])
337 |     print(pub_year_counter2[0], pub_year_counter2[-1])
338 | 
339 |     print(list(zip(whole_pub_year, whole_pub_dist)))
340 |     print(list(zip(pub_year1, pub_count1)))
341 |     print(list(zip(pub_year2, pub_count2)))
342 | 
343 |     # plot.figure()
344 |     idx = 0
345 |     plot.scatter(whole_pub_year, # [n * 100.0 / len(whole_pub_dist) for n in range(len(whole_pub_dist))],
346 |                  whole_pub_dist,
347 |                  marker='.',
348 |                  color=colors[idx], label='MAG', s=4)
349 |     idx = 1
350 |     plot.scatter(pub_year1, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
351 |                  pub_count1,
352 |                  marker='o',
353 |                  color=colors[idx], label='LAGOS-AND-BLOCK', s=4)
354 |     idx = 2
355 |     plot.scatter(pub_year2, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
356 |                  pub_count2,
357 |                  marker='s',
358 |                  color=colors[idx], label='LAGOS-AND-PAIRWISE', s=4)
359 | 
360 |     plot.xscale('log')
361 |     plot.yscale('log')
362 |     plot.title(check_item, fontsize=18)
363 |     plot.xlabel('LN Popularity', loc='right', fontsize=18)
364 |     plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
365 |     plot.xticks(fontsize=tick_font_size)
366 |     plot.yticks(fontsize=tick_font_size)
367 | 
368 |     plot.legend(loc='best') # 'lower right'
369 | 
370 | 
371 | def plot_namespace_popularity(whole_lastname_popularity_dist, counter_block, counter_pairwise, check_item):
372 |     # ratio = len(author_lastname_counter) * 1.0 / len(whole_lastname_popularity_dist)
373 |     # used_for_plot_ratio = 1
374 |     whole_pub_year_dist = sorted([n for n in whole_lastname_popularity_dist],
375 |                                  # if random() <= used_for_plot_ratio * ratio
376 |                                  key=lambda x: int(x[0]), reverse=False)
377 |     all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
378 |     whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
379 |     whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
380 | 
381 |     pub_year_counter1 = sorted([n for n in counter_block], # if random() <= used_for_plot_ratio
382 |                                key=lambda x: int(x[0]), reverse=False)
383 |     pub_cnt1 = sum([n[1] for n in pub_year_counter1])
384 |     pub_year1 = [int(n[0]) for n in pub_year_counter1]
385 |     pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
386 | 
387 |     pub_year_counter2 = sorted([n for n in counter_pairwise], # if random() <= used_for_plot_ratio
388 |                                key=lambda x: int(x[0]), reverse=False)
389 |     pub_cnt2 = sum([n[1] for n in pub_year_counter2])
390 |     pub_year2 = [int(n[0]) for n in pub_year_counter2]
391 |     pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
392 | 
393 |     print(whole_pub_year_dist[0], whole_pub_year_dist[-1])
394 |     print(pub_year_counter1[0], pub_year_counter1[-1])
395 |     print(pub_year_counter2[0], pub_year_counter2[-1])
396 | 
397 |     print(list(zip(whole_pub_year, whole_pub_dist)))
398 |     print(list(zip(pub_year1, pub_count1)))
399 |     print(list(zip(pub_year2, pub_count2)))
400 | 
401 |     # plot.figure()
402 |     idx = 0
403 |     plot.scatter(whole_pub_year, # [n * 100.0 / len(whole_pub_dist) for n in range(len(whole_pub_dist))],
404 |                  whole_pub_dist,
405 |                  marker='.',
406 |                  color=colors[idx], label='MAG', s=4)
407 |     idx = 1
408 |     plot.scatter(pub_year1, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
409 |                  pub_count1,
410 |                  marker='o',
411 |                  color=colors[idx], label='LAGOS-AND-BLOCK', s=4)
412 |     idx = 2
413 |     plot.scatter(pub_year2, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
414 |                  pub_count2,
415 |                  marker='s',
416 |                  color=colors[idx], label='LAGOS-AND-PAIRWISE', s=4)
417 | 
418 |     plot.xscale('log')
419 |     plot.yscale('log')
420 |     plot.title(check_item, fontsize=18)
421 |     plot.xlabel('LNFI Popularity', loc='right', fontsize=18)
422 |     plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
423 |     plot.xticks(fontsize=tick_font_size)
424 |     plot.yticks(fontsize=tick_font_size)
425 | 
426 |     plot.legend(loc='best') # 'lower right'
427 | 
428 | 
429 | def plot_fos(whole_fos_dist, counter_block, counter_pairwise, check_item):
430 |     whole_pub_year_dist = sorted(whole_fos_dist, key=lambda x: x[1], reverse=False)
431 |     all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
432 |     whole_pub_year = [n[0] for n in whole_pub_year_dist]
433 |     whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
434 | 
435 |     keys1 = [n[0] for n in counter_block]
436 |     print(keys1)
437 |     pub_year_counter1 = [counter_block[keys1.index(n)] if n in keys1 else (n, 0) for n in whole_pub_year]  # align the block counter to the MAG domain order
438 |     pub_cnt1 = sum([n[1] for n in pub_year_counter1])
439 |     pub_year1 = [n[0] for n in pub_year_counter1]
440 |     pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
441 | 
442 |     keys2 = [n[0] for n in counter_pairwise]
443 |     print(keys2)
444 |     pub_year_counter2 = [counter_pairwise[keys2.index(n)] if n in keys2 else (n, 0) for n in whole_pub_year]  # align the pairwise counter to the MAG domain order
445 |     pub_cnt2 = sum([n[1] for n in pub_year_counter2])
446 |     pub_year2 = [n[0] for n in pub_year_counter2]
447 |     pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
448 | 
449 |     # plot.figure()
450 |     idx = 0
451 |     plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
452 |               # marker=line_markers[idx], markersize=8, markevery=0.2,
453 |               color=colors[idx], label='MAG', linewidth=linewidth)
454 |     idx = 1
455 |     plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
456 |               # marker=line_markers[idx], markersize=8, markevery=0.2,
457 |               color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
458 |     idx = 2
459 |     plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
460 |               # marker=line_markers[idx], markersize=8, markevery=0.2,
461 |               color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
462 |     # plot.yscale('log')
463 |     plot.xticks(fontsize=10, rotation=45, ha='right')
464 |     # plot.autofmt_xdate(bottom=0.2, rotation=30, ha='center')
465 |     plot.title(check_item, fontsize=18)
466 |     # plot.xlabel('domain', loc='right')
467 |     plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
468 |     # plot.xticks(fontsize=tick_font_size)
469 |     plot.yticks(fontsize=tick_font_size)
470 | 
471 |     plot.legend(loc='best') # 'lower right'
472 | 
473 | 
474 | plot.figure(42, figsize=(12, 18), dpi=300)
475 | plot.subplot(421)
476 | # plot.grid(True)
477 | plot_pub_year(whole_pub_year_dist, pub_year_counter_block, pub_year_counter_pairwise,
478 |               check_item='(a) Publication Distribution')
479 | plot.subplot(422)
480 | plot_author_position(whole_author_position_dist, author_position_counter_block, author_position_counter_pairwise,
481 |                      check_item='(b) Author Position Distribution')
482 | plot.subplot(423)
483 | 
484 | plot_author_gender(whole_genni_gender_dist, author_genni_gender_counter_block, author_genni_gender_counter_pairwise,
485 |                    check_item='(c) Gender Distribution')
486 | plot.subplot(424)
487 | # plot_author_gender(whole_mac_gender_dist, author_sex_mac_counter, check_item='mac_gender')
488 | # plot_author_gender(whole_ssn_gender_dist, author_ssn_gender_counter, check_item='ssn_gender')
489 | 
490 | plot_ethnic_seer(whole_ethnic_seer_dist, author_ethnic_seer_counter_block, author_ethnic_seer_counter_pairwise,
491 |                  check_item='(d) Ethnicity Distribution')
492 | 
493 | # plot_ethnea(whole_ethnea_dist, author_ethnea_counter, check_item='(d) ethnicity distribution')
494 | # plot.subplot(425)
495 | 
496 | plot.subplot(425)
497 | plot_lastname_popularity(whole_lastname_popularity_dist, author_lastname_counter_block, author_lastname_counter_pairwise,
498 |                          check_item='(e) LN Popularity Distribution')
499 | plot.subplot(426)
500 | plot_namespace_popularity(whole_lastname_first_initial_popularity_dist, lastname_first_initial_counter_block,
501 |                           lastname_first_initial_counter_pairwise,
502 |                           check_item='(f) LNFI Popularity Distribution')
503 | 
504 | plot.subplot(427)
505 | plot_fos(whole_fos_dist, fos_counter_block, fos_counter_pairwise, check_item='(g) Domain Distribution')
506 | 
507 | plot.tight_layout()
508 | plot.savefig(os.path.join(cached_dir, 'data-distribution.png'), dpi=600)
509 | plot.savefig(os.path.join(latex_doc_base_dir, 'figs/data-distribution.png'), dpi=600)
510 | # plot.savefig(os.path.join(cached_dir, 'gold-standard-check.pdf'), dpi=500)
511 | plot.show()
512 | 
--------------------------------------------------------------------------------
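plot_ethnic_seer() and plot_fos() above repeat the same comprehension that re-orders a (key, count) counter onto the MAG category list and then normalizes it. The sketch below is illustrative only and is not part of the repository; the helper name align_counter is hypothetical, and it assumes the counters are lists of (key, count) tuples as used throughout the script.

def align_counter(reference_keys, counter):
    """Re-order `counter` to follow `reference_keys` and normalize the counts.

    `counter` is assumed to be a list of (key, count) tuples; keys missing from
    `counter` get a count of 0. Returns (keys, proportions).
    """
    keys = [k for k, _ in counter]
    aligned = [counter[keys.index(k)] if k in keys else (k, 0) for k in reference_keys]
    total = sum(c for _, c in aligned) or 1  # guard against an all-zero counter
    return [k for k, _ in aligned], [c * 1.0 / total for _, c in aligned]

# Example with hypothetical data:
#   align_counter(['ENG', 'CHN', 'GER'], [('CHN', 30), ('ENG', 70)])
#   -> (['ENG', 'CHN', 'GER'], [0.7, 0.3, 0.0])

With such a helper, each plotting function could call align_counter(whole_pub_year, counter_block) and align_counter(whole_pub_year, counter_pairwise) instead of duplicating the alignment logic.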