├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── sqldialects.xml
│   ├── other.xml
│   ├── modules.xml
│   ├── misc.xml
│   ├── libraries
│   │   ├── Maven__com_google_code_gson_gson_2_8_0.xml
│   │   ├── Maven__org_projectlombok_lombok_1_18_8.xml
│   │   └── Maven__org_apache_commons_commons_lang3_3_4.xml
│   ├── compiler.xml
│   ├── dataSources.xml
│   ├── jarRepositories.xml
│   └── uiDesigner.xml
├── src
│   ├── model
│   │   ├── available_model.py
│   │   ├── nn.py
│   │   ├── regression.py
│   │   └── classification.py
│   ├── eutilities
│   │   ├── customized_print.py
│   │   ├── metric.py
│   │   ├── MAGdata
│   │   │   ├── parse_absract_from_mag_kg.py
│   │   │   └── parse_fos_from_mag_kg.py
│   │   ├── preprocessor.py
│   │   ├── name
│   │   │   ├── name_parser.py
│   │   │   ├── name_parser_by_socket.py
│   │   │   └── name_parser_by_localscript.py
│   │   ├── train_utils.py
│   │   └── string_utils.py
│   ├── myconfig.py
│   ├── comparison
│   │   ├── block
│   │   │   ├── batch_runner.sh
│   │   │   ├── clustering_metrics_MAG_AID.py
│   │   │   └── clustering_metrics_other_baselines.py
│   │   └── pairwise
│   │       └── classification_metrics.py
│   ├── statistics
│   │   ├── orcid_doi_number_each_year.py
│   │   └── last_name_variation_considering_transliterating.py
│   ├── feature
│   │   ├── cluster
│   │   │   ├── sparse_tfidf_feature.py
│   │   │   ├── doc2vec_feature.py
│   │   │   ├── fast_feature.py
│   │   │   └── network_feature.py
│   │   ├── doc2vec_trainer.py
│   │   ├── pairwise
│   │   │   └── our_dataset_to_feature.py
│   │   └── simple_matching_network_trainer_evaluator.py
│   └── datacheck
│       └── checking_multi_facets.py
├── pubmed-paper-author-link.iml
├── dataset
│   ├── DBLP-CiteSeerX
│   │   └── check.py
│   ├── PubMed-GS
│   │   └── check.py
│   ├── dataset-urls.txt
│   ├── DBLP-GESIS
│   │   └── check.py
│   ├── REXA-Culotta
│   │   └── check.py
│   ├── Aminer-Simple
│   │   └── check.py
│   ├── Aminer-Rich
│   │   └── check.py
│   ├── SCAD-zbMATH-Muller
│   │   └── check.py
│   ├── DBLP-Kim
│   │   └── check.py
│   ├── PubMed-Kim
│   │   ├── check.py
│   │   └── Kim_Authority_ORCID_linkage_dataset.sql
│   ├── BDBComp-Cota
│   │   └── check.py
│   ├── DBLP-Qian
│   │   └── check.py
│   ├── Aminer-Zhang
│   │   └── check.py
│   └── Aminer-WhoisWho
│       └── to_table_and_check.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /and/target/
2 | ./.idea
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
3 |
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
--------------------------------------------------------------------------------
/src/model/available_model.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class ModelName(Enum):
5 |     linear = 'Linear'
6 |     logistic = 'Logistic'
7 |     dt = 'DecisionTree'
8 |     randomforest = 'RandomForest'
9 |
10 |     @classmethod
11 |     def available_modes(cls):
12 |         return [cls.randomforest]
13 |
14 |     @classmethod
15 |     def get_short_name(cls, model_name):
16 |         return \
17 |             dict(zip(
18 |                 [cls.linear, cls.logistic, cls.dt, cls.randomforest],
19 |                 ['Linear', 'LR', 'DecisionTree', 'RF']))[model_name]
20 |
--------------------------------------------------------------------------------
/dataset/DBLP-CiteSeerX/check.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | ds_dir = './nameset_author-disamb'
4 | ds = [n for n in os.listdir(ds_dir)]
5 | print('num_block_size: %d' % len(ds))
6 |
7 | num_citation = 0
8 | num_author_group = 0
9 | for n in ds:
10 |     fn = os.path.join(ds_dir, n)
11 |     author_idx_arr = []
12 |     for line in open(fn, encoding='iso8859-1'):
13 |         author_idx_citation_idx = line[:line.index(' ')]
14 |         num_citation += 1
15 |         author_idx, citation_idx = author_idx_citation_idx.split('_')
16 |         author_idx_arr.append(author_idx)
17 |     num_author_group += len(set(author_idx_arr))
18 |
19 | print('num_author_group_size: %d' % num_author_group)
20 | print('num_citation: %d' % num_citation)
21 |
--------------------------------------------------------------------------------
/dataset/PubMed-GS/check.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | df = pd.read_csv('./1900_pairs_train_test.csv', sep=';')
4 | last_name_variation_cases = 0
5 | for i, (ln1, ln2) in df[['Last_name1', 'Last_name2']].iterrows():
6 |     if str(ln1).lower() != str(ln2).lower():
7 |         last_name_variation_cases += 1
8 |         print(ln1, ln2)
9 |
10 | print('last_name_variation_cases: %d' % last_name_variation_cases)
11 |
12 | num_paired_records = df.shape[0]
13 | print('num_paired_records: %d' % num_paired_records)
14 |
15 | pmid_arr = list(df['PMID1'].values) + list(df['PMID2'].values)
16 | num_citation = len(set(pmid_arr))
17 | # print('num_author_group: %d' % num_author_group)
18 | print('num_citation: %d' % num_citation)
19 |
20 |
--------------------------------------------------------------------------------
/dataset/dataset-urls.txt:
--------------------------------------------------------------------------------
1 | DBLP-Muller https://github.com/yaya213/DBLP-Name-Disambiguation-Dataset
2 | DBLP-CiteSeerX http://clgiles.ist.psu.edu/data/nameset_author-disamb.tar.zip
3 | DBLP-KIM(PENN) https://doi.org/10.6084/m9.figshare.6840281.v2
4 | KISTI http://www.lbd.dcc.ufmg.br/lbd/collections/disambiguation/DBLP.tar.gz/at_download/file
5 |
6 | Aminer:
7 | rich: http://arnetminer.org/lab-datasets/disambiguation/rich-author-disambiguation-data.zip or
8 | rich: https://lfs.aminer.cn/lab-datasets/disambiguation/rich-author-disambiguation-data.zip
9 | and
10 | simple: https://lfs.aminer.cn/lab-datasets/disambiguation/author-disambiguation-data.zip
11 |
12 | Aminer-ZHANG: https://static.aminer.cn/misc/na-data-kdd18.zip
13 |
14 |
--------------------------------------------------------------------------------
/dataset/DBLP-GESIS/check.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | import pandas as pd
4 |
5 | names = pd.read_csv('Dataset/sigir/Gold Dataset/disambiguatedNames.csv', sep=';', encoding='iso8859-1')
6 | print(names.head())
7 | pubs = pd.read_csv('Dataset/sigir/Gold Dataset/goldstandardPublications.csv', sep=';', encoding='iso8859-1')
8 | print(pubs.head())
9 |
10 | author_names = names['name'].apply(lambda x: ' '.join(x.split(' ')[:-1])).values
11 |
12 | counter = Counter(author_names)
13 | print(counter)
14 |
15 | print('num_block: %d' % len(set(author_names)))
16 | num_citation = pubs.shape[0]
17 | num_author_group = len(set(names['fk_authorid'].values))
18 | print('num_author_group: %d' % num_author_group)
19 | print('num_citation: %d' % num_citation)
20 |
--------------------------------------------------------------------------------
/src/eutilities/customized_print.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | pd.set_option('display.unicode.ambiguous_as_wide', True)  # align column names for wide characters
4 | pd.set_option('display.unicode.east_asian_width', True)  # align column names for East Asian characters
5 | pd.set_option('display.max_rows', None)  # show all rows
6 | pd.set_option('display.max_columns', None)  # show all columns
7 | pd.set_option('expand_frame_repr', False)  # do not wrap wide frames
8 |
9 |
10 | def pprint(kv: list, decimal=2, pctg=False, sep=None):
11 |     k = [item[0] for item in kv]
12 |     if pctg:
13 |         v = [round(item[1] * 100.0, decimal) for item in kv]
14 |     else:
15 |         v = [round(item[1], decimal) for item in kv]
16 |     if not sep:
17 |         df = pd.DataFrame(data=[v], columns=k)
18 |         print(df.head())
19 |     else:
20 |         print(sep.join([str(s) for s in v]))
21 |
--------------------------------------------------------------------------------
/dataset/REXA-Culotta/check.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | base_path = './rexa_author_coref/rexa'
4 | blocks = [n for n in os.listdir(base_path)]
5 |
6 | num_blocks = 0
7 | num_author_group = 0
8 | num_citation = 0
9 | for n in blocks:
10 |     path = os.path.join(base_path, n)
11 |     if os.path.isfile(path):
12 |         continue
13 |     num_blocks += 1
14 |     block_authors = [n for n in os.listdir(path)]
15 |     num_author_group += len(block_authors)
16 |     for m in block_authors:
17 |         path1 = os.path.join(path, m)
18 |         if os.path.isfile(path1):
19 |             continue
20 |         citations = [n for n in os.listdir(path1)]
21 |         num_citation += len(citations)
22 |
23 | print('num_block: %d' % num_blocks)
24 | print('num_author_group: %d' % num_author_group)
25 | print('num_citation: %d' % num_citation)
26 |
--------------------------------------------------------------------------------
/dataset/Aminer-Simple/check.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | base_path = './author-disambiguation-data/data/Answer'
4 | blocks = [n for n in os.listdir(base_path)]
5 |
6 | num_blocks = 0
7 | num_author_group = 0
8 | citations = []
9 | for n in blocks:
10 |     path = os.path.join(base_path, n)
11 |     if not os.path.isfile(path):
12 |         continue
13 |     num_blocks += 1
14 |     for line in open(path):
15 |         if ':' not in line:
16 |             continue
17 |         id = line[:line.index(':')]
18 |         papers = [m.strip() for m in line[line.index(':') + 1:].split(' ') if len(m.strip()) > 0]
19 |         print(id, papers)
20 |         num_author_group += 1
21 |         citations.extend(papers)
22 |
23 | num_citation = len(set(citations))
24 | print('num_block: %d' % num_blocks)
25 | print('num_author_group: %d' % num_author_group)
26 | print('num_citation: %d' % num_citation)
27 |
--------------------------------------------------------------------------------
/dataset/Aminer-Rich/check.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | base_path = './rich-author-disambiguation-data/experimental-results'
4 | blocks = [n for n in os.listdir(base_path)]
5 |
6 | num_blocks = 0
7 | num_author_group = 0
8 | citations = []
9 | for n in blocks:
10 |     path = os.path.join(base_path, n)
11 |     if not os.path.isfile(path) or 'classify' not in n:
12 |         continue
13 |
14 |     num_blocks += 1
15 |     for line in open(path):
16 |         if ':' not in line:
17 |             continue
18 |         id = line[:line.index(':')]
19 |         papers = [m.strip() for m in line[line.index(':') + 1:].split(' ') if len(m.strip()) > 0]
20 |         print(id, papers)
21 |         num_author_group += 1
22 |         citations.extend(papers)
23 |
24 | num_citation = len(set(citations))
25 | print('num_block: %d' % num_blocks)
26 | print('num_author_group: %d' % num_author_group)
27 | print('num_citation: %d' % num_citation)
28 |
--------------------------------------------------------------------------------
/dataset/SCAD-zbMATH-Muller/check.py:
--------------------------------------------------------------------------------
1 | import xmltodict
2 |
3 | data = open('SCAD-zbMATH/scad-zbmath-01-open-access.xml').read()
4 | data = xmltodict.parse(data)
5 | data_instance = []
6 | for n in data['publications']['publication']:
7 |     title = n['title']
8 |     authors = n['authors']['author']
9 |     if type(authors) != list:
10 |         authors = [authors]
11 |     # print(authors)
12 |     for au in authors:
13 |         name, shortname, id = au['@name'], au['@shortname'], au['@id'],
14 |         print(name, shortname, id, title)
15 |         last_name_first_initial = shortname
16 |         data_instance.append([name, last_name_first_initial, id, title])
17 |
18 | num_blocks = len(set([n[1] for n in data_instance]))
19 | num_author_group = len(set([n[2] for n in data_instance]))
20 | num_citation = len(data_instance)
21 |
22 | print('num_block: %d' % num_blocks)
23 | print('num_author_group: %d' % num_author_group)
24 | print('num_citation: %d' % num_citation)
25 |
--------------------------------------------------------------------------------
/.idea/dataSources.xml:
--------------------------------------------------------------------------------
5 | clickhouse
6 | true
7 | ru.yandex.clickhouse.ClickHouseDriver
8 | jdbc:clickhouse://localhost:8124
14 | clickhouse
15 | true
16 | ru.yandex.clickhouse.ClickHouseDriver
17 | jdbc:clickhouse://202.114.70.54:8123
--------------------------------------------------------------------------------
/dataset/DBLP-Kim/check.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | import pandas as pd
4 |
5 | # author name: full name string extracted from DBLP
6 | # unique author id: labels assigned manually by Dr. C. Lee Giles's team
7 | # paper id: assigned by Dr. Jinseok Kim
8 | # author list: names of authors in the byline of the paper
9 | # year: publication year
10 | # venue: conference or journal names
11 | # title: stopwords removed and stemmed by the Porter's stemmer
12 |
13 | df = pd.read_csv('./DBLP_labeled_data.txt', sep='\t',
14 | names=['author name', 'unique author id', 'paper id', 'author list', 'year', 'venue', 'title', 'null'], index_col=None)
15 | print(df.head())
16 |
17 |
18 | author_names = df['unique author id'].apply(lambda x: ' '.join(x.split('-')[:-1])).values
19 |
20 | counter = Counter(author_names)
21 | print(counter)
22 |
23 | print('num_block: %d' % len(set(author_names)))
24 |
25 | num_citation = len(set(df['paper id'].values))
26 | num_author_group = len(set(df['unique author id'].values))
27 | print('num_author_group: %d' % num_author_group)
28 | print('num_citation: %d' % num_citation)
29 |
--------------------------------------------------------------------------------
/src/myconfig.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | gpu_id = 0
5 | device = "cuda:%d" % gpu_id
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--which_model", type=int, default=5)
9 | cli_args = parser.parse_args()
10 | best_hac_clustering_parameters = [0.45, 0.25, 0.2, 0.2, 0.25, 0.2]
11 | # best_hac_clustering_parameters = [None, None, None, None, None, None]
12 | tuned_best_cluster_setting = best_hac_clustering_parameters[cli_args.which_model]
13 |
14 | # resource config
15 | latex_doc_base_dir = '/home/zhangli/ssd-1t/repo/manuscripts/ongoning-works/and-dataset/src/'
16 | src_base_path = os.path.dirname(os.path.abspath(__file__))
17 | cached_dir = os.path.join(src_base_path, 'cached')
18 |
19 | pretrained_model_path = proj_base_path = os.path.abspath('/home/zhangli/pre-trained-models/')
20 | glove6b_path = os.path.join(pretrained_model_path, 'glove.6B/')
21 | glove840b300d_path = os.path.join(pretrained_model_path, 'glove.840B/')
22 | fasttextcrawl300d2m_path = os.path.join(pretrained_model_path, 'fastText/crawl-300d-2M.vec')
23 | infersent_based_path = os.path.join(pretrained_model_path, 'infersent')
24 |
--------------------------------------------------------------------------------
/src/comparison/block/batch_runner.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #conda activate rapids-21.06
3 | #which python
4 | #python clustering_metrics_other_baselines.py
5 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=0 >r1_trimmed_model_trimmed_dataset0.log 2>&1 &
6 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=1 >r1_trimmed_model_trimmed_dataset1.log 2>&1 &
7 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=2 >r1_trimmed_model_trimmed_dataset2.log 2>&1 &
8 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=3 >r1_trimmed_model_trimmed_dataset3.log 2>&1 &
9 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=4 >r1_trimmed_model_trimmed_dataset4.log 2>&1 &
10 | nohup /home/zhangli/mydisk-2t/miniconda3/envs/rapids-21.06/bin/python -u clustering_metrics_other_baselines.py --which_model=5 >r1_trimmed_model_trimmed_dataset5.log 2>&1 &
--------------------------------------------------------------------------------
/src/eutilities/metric.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
2 |
3 | metric_names = ['acc', 'p', 'r', 'f1', 'macro_f1', 'macro_weighted_f1', 'micro_f1', 'auc']
4 |
5 |
6 | def calc_metrics(test_y, pred_y, average='macro_f1', search_cut_off=True):
7 |     prob = 0.5
8 |     pred_y_label = [1 if i > prob else 0 for i in pred_y]
9 |
10 |     acc = accuracy_score(test_y, pred_y_label)
11 |     p = precision_score(test_y, pred_y_label)
12 |     r = recall_score(test_y, pred_y_label)
13 |
14 |     macro_f1 = f1_score(test_y, pred_y_label, average='macro')
15 |     macro_weighted_f1 = f1_score(test_y, pred_y_label, average='weighted')
16 |     micro_f1 = f1_score(test_y, pred_y_label, average='micro')
17 |
18 |     pos_label_f1 = f1_score(test_y, pred_y_label, average='binary')
19 |     rocauc = roc_auc_score(y_true=test_y, y_score=pred_y)
20 |     # neg_label_f1 = f1_score(test_y, pred_y_label, pos_label=0, average='binary')
21 |     # print(confusion_matrix(test_y, pred_y_label))
22 |     return dict(
23 |         zip(metric_names,
24 |             [acc, p, r, pos_label_f1, macro_f1, macro_weighted_f1, micro_f1, rocauc]))
25 |
--------------------------------------------------------------------------------
/dataset/PubMed-Kim/check.py:
--------------------------------------------------------------------------------
1 | from unidecode import unidecode
2 |
3 | from myio.data_reader import DBReader
4 |
5 | df = DBReader.tcp_model_cached_read("cached/YYYY",
6 | """select PMID,
7 | trimBoth(splitByString(',', MEDLINE_Name)[1]) as medline_lastname,
8 | splitByChar('_', AINI)[1] as block_lastname
9 | from and_ds.AUT_ORC
10 | where medline_lastname != block_lastname""",
11 | cached=False)
12 | df['medline_lastname_parsed'] = df['medline_lastname'].apply(unidecode)
13 | df['block_lastname_parsed'] = df['block_lastname'].apply(unidecode)
14 |
15 | all = df.shape[0]
16 | cnt = 0
17 | for i, (pmid, medline_lastname, medline_lastname_parsed, block_lastname, block_lastname_parsed) in df[
18 |     ['PMID', 'medline_lastname', 'medline_lastname_parsed', 'block_lastname', 'block_lastname_parsed']].iterrows():
19 |     # medline_lastname_parsed = medline_lastname_parsed.lower().replace('-', '').replace(' ', '').replace('\'', '').replace('?', '')
20 |     medline_lastname_parsed = ''.join([n for n in medline_lastname_parsed.lower() if n not in ('-', ' ', '\'', '?')])
21 |     block_lastname_parsed = block_lastname_parsed.lower()
22 |     if medline_lastname_parsed != block_lastname_parsed:
23 |         print(pmid, medline_lastname_parsed, block_lastname_parsed)
24 |         cnt += 1
25 |
26 | print(cnt, all, cnt / all)
27 |
--------------------------------------------------------------------------------
/dataset/BDBComp-Cota/check.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | import pandas as pd
4 |
5 | # font_bdbcomp.txt and title_bdbcomp.txt.
6 | #
7 | # The records of the file font_bdbcomp.txt are in the format of:
8 | # citationId<>clusterId_sequential<>coauthor:coauthor:...:coauthor<>publicationVenueTitle<>author
9 | #
10 | # The records of the file title_bdbcomp.txt are in the format of:
11 | # citationId<>workTitle
12 |
13 | name = pd.read_csv('./bdbcomp/font_bdbcomp.txt', sep='<>',
14 | names=['citationId', 'clusterId_sequential', 'authorList', 'publicationVenueTitle', 'author',
15 | 'null'])
16 | print(name.head())
17 | pub = pd.read_csv('./bdbcomp/title_bdbcomp.txt', sep='<>', names=['citationId', 'workTitle'], error_bad_lines=None)
18 | print(pub.head())
19 | print(pub.shape)
20 |
21 | author_names = name['author'].apply(lambda x: '_'.join([
22 | x.split(' ')[-1], # last name
23 | x.split(' ')[0][0] # first initial
24 | ])).values
25 |
26 | counter = Counter(author_names)
27 | print(counter)
28 |
29 |
30 | author_group_idx = [int(n) for n in set(name['clusterId_sequential'].apply(lambda x: str(x)[:str(x).index('_')]).values)]
31 | for n in range(214):
32 |     if n not in author_group_idx:
33 |         print(n)
34 | print(sorted(author_group_idx))
35 | num_author_group = len(author_group_idx)
36 | print('num_block: %d' % len(set(author_names)))
37 | num_citation = len(set(name['citationId'].values))
38 | print('num_author_group: %d' % num_author_group)
39 | print('num_citation: %d' % num_citation)
40 |
--------------------------------------------------------------------------------
/dataset/DBLP-Qian/check.py:
--------------------------------------------------------------------------------
1 | ds_fn = './DBLP name disambiguation dataset'
2 | lines = [line for line in open(ds_fn)]
3 | header = lines[0]
4 | print('headers: %s' % header)
5 | lines = ''.join(lines[1:])
6 |
7 | blocks = lines.split('\n\n')
8 |
9 | print('num_block: %d' % len(blocks))
10 | num_citation = 0
11 | num_author_group = 0
12 | for n in blocks:
13 |     if '\n' not in n:
14 |         # print(n)
15 |         continue
16 |     block_name = n[:n.index('\n')]
17 |     fields = n[n.index('\n') + 1:].split('\n')
18 |     # assert len(fields) % 9 ==0
19 |     # num_names = int(len(fields) / 9)
20 |     author_idx_arr = []
21 |
22 |     for m in fields:
23 |         try:
24 |             if '\t' in m:
25 |                 author_idx = int(m[:m.index('\t')])
26 |                 author_idx_arr.append(author_idx)
27 |         except Exception as e:
28 |             # print(e)
29 |             pass
30 |     num_author_group += len(set(author_idx_arr))
31 |     num_citation += len(author_idx_arr)
32 |
33 | #
34 | # num_citation = 0
35 | # num_author_group = 0
36 | # for n in ds:
37 | # fn = os.path.join(ds_dir, n)
38 | # author_idx_arr = []
39 | # for line in open(fn, encoding='iso8859-1'):
40 | # author_idx_citation_idx = line[:line.index(' ')]
41 | # num_citation += 1
42 | # author_idx, citation_idx = author_idx_citation_idx.split('_')
43 | # author_idx_arr.append(author_idx)
44 | # num_author_group += len(set(author_idx_arr))
45 |
46 | print('num_author_group: %d' % num_author_group)
47 | print('num_citation: %d' % num_citation)
48 |
--------------------------------------------------------------------------------
/src/eutilities/MAGdata/parse_absract_from_mag_kg.py:
--------------------------------------------------------------------------------
1 | # download MAG KG from https://zenodo.org/record/3930398#.X9YvjnYzY5ll
2 | import traceback
3 |
4 | temp_line = ''
5 | closed = True
6 |
7 | file_name = 'PaperAbstractsInvertedIndex.nt'
8 | orcid_mag_matched_paper_id = set([line.strip() for line in open('orcid_mag_matched_paper_id.txt')])
9 | all_need_matched = len(orcid_mag_matched_paper_id)
10 | print(all_need_matched)
11 | matched_cnt = 0
12 | fw = open('orcid_mag_matched_paper_abstract.txt', 'w')
13 | for line in open(file_name):
14 |     line = line.strip()
15 |     temp_line = temp_line + line + ' '
16 |     if line.endswith('string> .'):
17 |         # if 'string> .' in line:
18 |         closed = True
19 |     else:
20 |         closed = False
21 |
22 |     if closed:
23 |         # print(temp_line)
24 |         try:
25 |             front = temp_line[:temp_line.index('terms/abstract>') + 16]
26 |             back = temp_line[temp_line.index('terms/abstract>') + 16: temp_line.index('^^')]
27 |             pid = front[front.index('entity/') + 7:front.index('> <')].strip()
28 |             abstract = back
29 |             # print(pid, abstract)
30 |             if pid in orcid_mag_matched_paper_id:
31 |                 matched_cnt += 1
32 |                 if matched_cnt % 10000 == 0:
33 |                     print('matched_cnt: ', matched_cnt, matched_cnt * 100.0 / all_need_matched)
34 |                 fw.write(pid + '\t' + abstract + '\n')
35 |                 # print(temp_line)
36 |         except:
37 |             traceback.print_exc()
38 |             # print('-' * 100)
39 |         temp_line = ''
40 |
--------------------------------------------------------------------------------
/src/eutilities/preprocessor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.preprocessing import MinMaxScaler, StandardScaler
4 |
5 |
6 | def drop_missing_items(df):
7 |     df = df.dropna(how='any')
8 |     return df
9 |
10 |
11 | def down_sample(df, percent=1):
12 |     '''
13 |     percent: ratio of the number of down-sampled majority-class samples to the number of minority-class samples
14 |     '''
15 |     neg_samples_num = df[df['same_author'] == 0].shape[0]
16 |     pos_samples_num = df[df['same_author'] == 1].shape[0]
17 |     if neg_samples_num < pos_samples_num:
18 |         data1 = df[df['same_author'] == 0]  # the minority class goes into data1
19 |         data0 = df[df['same_author'] == 1]  # the majority class goes into data0
20 |     else:
21 |         data0 = df[df['same_author'] == 0]  # the majority class goes into data0
22 |         data1 = df[df['same_author'] == 1]  # the minority class goes into data1
23 |     index = np.random.randint(
24 |         len(data0), size=percent * (len(df) - len(data0)))  # randomly draw the indices of the majority samples kept by down-sampling
25 |     lower_data1 = data0.iloc[list(index)]  # down-sample the majority class
26 |     # print(lower_data1.shape)
27 |     # print(data1.shape)
28 |     return (pd.concat([lower_data1, data1]))
29 |
30 |
31 | def scale(df):
32 |     mm_scaler = MinMaxScaler()
33 |     df = mm_scaler.fit_transform(df)
34 |     std_scaler = StandardScaler()
35 |     df = std_scaler.fit_transform(df)
36 |     return df
37 |
38 |
39 | def select_features():
40 |     # # SelectKBest (chi-square)
41 |     # ch2 = SelectKBest(chi2, k=3)  # in this example, SelectKBest picks 3 of the 4 original features
42 |     # x_train = ch2.fit_transform(x_train, y_train)  # fit and transform
43 |     # select_name_index = ch2.get_support(indices=True)
44 |     # print("the three features with the greatest influence on the class label are:", ch2.get_support(indices=False))
45 |     # print(select_name_index)
46 |     pass
47 |
48 |
49 | def preprocess(df):
50 |     print('original shape: ', df.shape)
51 |     df = drop_missing_items(df)
52 |     print('after dropping shape: ', df.shape)
53 |     df = scale(df)
54 |     print('after scaling shape: ', df.shape)
55 |     df = down_sample(df)
56 |     print('after sampling shape: ', df.shape)
57 |     return df
58 |
--------------------------------------------------------------------------------
/src/model/nn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class MatchGRU(nn.Module):
6 |     def __init__(self, glove, hidden_dim=64, num_layers=2, num_hand_craft_feature=5, bidirectional=True, output_dim=1):
7 |         super(MatchGRU, self).__init__()
8 |         embedding_dim = len(glove.vectors[0])
9 |         self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)
10 |         self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers,
11 |                           batch_first=True, dropout=0.5, bidirectional=bidirectional)
12 |         # self.match_fc = nn.Linear(2 * num_layers * hidden_dim * (2 if bidirectional else 1), 5)
13 |         self.match_fc = nn.Sequential(
14 |             nn.Linear(2 * num_layers * hidden_dim * (2 if bidirectional else 1), 128),
15 |             nn.ReLU(),
16 |             nn.BatchNorm1d(128),
17 |             nn.Dropout(p=0.5),
18 |
19 |             nn.Linear(128, 64),
20 |             nn.ReLU(),
21 |             nn.BatchNorm1d(64),
22 |             nn.Dropout(p=0.5),
23 |
24 |             nn.Linear(64, 64),
25 |             nn.ReLU(),
26 |             nn.Linear(64, 16),
27 |             nn.ReLU(),
28 |             nn.Linear(16, 1),
29 |         )
30 |
31 |     def forward(self, input):
32 |         XL, XR = input
33 |
34 |         # output: [batch-size, Sequence-len, embedding-dim]
35 |         XL = self.embedding(XL)
36 |         XL, hl = self.gru(XL)
37 |         # print(XL.shape, hl.shape)
38 |         hl = torch.cat([hl[i] for i in range(len(hl))], dim=1)
39 |         # print(hl.shape)
40 |
41 |         # output: [batch-size, Sequence-len, embedding-dim]
42 |         XR = self.embedding(XR)
43 |         XR, hr = self.gru(XR)
44 |         hr = torch.cat([hr[i] for i in range(len(hr))], dim=1)
45 |
46 |         res = torch.cat([hl, hr], dim=1)
47 |         res = self.match_fc(res)
48 |
49 |         # convert to 0-1 possibility distribution
50 |         # res = torch.softmax(res, dim=1)
51 |
52 |         # add hand-craft features
53 |         # res = torch.cat([HF, res], dim=1)
54 |         # res = self.ml_hidden_fc(res)
55 |         # print(res.shape)
56 |         return res
57 |
--------------------------------------------------------------------------------
/src/statistics/orcid_doi_number_each_year.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import seaborn as sb
4 |
5 | from myconfig import cached_dir, latex_doc_base_dir
6 |
7 | # sb.set_style("darkgrid")
8 | custom_params = {"axes.spines.right": False, "axes.spines.top": False}
9 | sb.set_theme(style="ticks", rc=custom_params)
10 |
11 | from matplotlib import pyplot as plot
12 |
13 | plot.rcParams['font.family'] = 'serif'
14 | plot.rcParams['font.serif'] = ['Times New Roman'] + plot.rcParams['font.serif']
15 |
16 | from mytookit.data_reader import DBReader
17 |
18 | colors = ['green', 'red', 'gold', 'black', 'cyan', 'blue', 'magenta', 'purple', 'gray', 'fuchsia', 'orange', 'yellow']
19 | linestyles = ['--', '-.', '--', '--']
20 | line_markers = ['<', '>', '^', 'v']
21 | linewidth = 5
22 |
23 | df_orcid = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "num_orcid_each_year.pkl"),
24 | "", cached=True)
25 | print('df_orcid: ', df_orcid.values)
26 |
27 | df_doi = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "num_doi_each_year.pkl"), "", cached=True)
28 |
29 | print('df_doi: ', df_doi.values)
30 |
31 | plot.figure()
32 | # plot.grid(which='major', axis='y')
33 | idx = 0
34 | plot.plot(df_doi.values[:, 0].astype('int'), df_doi.values[:, 1], linestyle=linestyles[idx],
35 | # marker=line_markers[idx], markersize=8, markevery=0.2,
36 | color=colors[idx], label='DOI', linewidth=linewidth)
37 |
38 | idx = 1
39 | plot.plot(df_orcid.values[:, 0].astype('int'), df_orcid.values[:, 1], linestyle=linestyles[idx],
40 | # marker=line_markers[idx], markersize=8, markevery=0.2,
41 | color=colors[idx], label='ORCID', linewidth=linewidth)
42 |
43 | # plot.yscale('log')
44 | # plot.title('num of instance each year')
45 | # plot.xlabel('year', loc='right')
46 |
47 | plot.ylabel('# Records', loc='center', fontsize=18) # 'top'
48 | plot.legend(loc='best') # 'lower right'
49 | plot.xticks(fontsize=18)
50 | plot.yticks(fontsize=18)
51 |
52 | plot.tight_layout()
53 | plot.savefig(os.path.join(cached_dir, 'doi_orcid_each_year.png'), dpi=600)
54 | plot.savefig(os.path.join(latex_doc_base_dir, 'figs/doi_orcid_each_year.png'), dpi=600)
55 |
56 | plot.show()
57 |
--------------------------------------------------------------------------------
/src/comparison/block/clustering_metrics_MAG_AID.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pandas as pd
4 | from beard import metrics
5 | from mytookit.data_reader import DBReader
6 | from tqdm import tqdm
7 |
8 | from myconfig import cached_dir
9 |
10 | minimal_uniq_author_in_block = 0
11 |
12 | df_blocks = DBReader.tcp_model_cached_read(cached_file_path='xxx',
13 | sql=r'''select block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock from and_ds.our_and_dataset_block_test_set_mag_prediction;''',
14 | cached=False)
15 |
16 | num_instances = len(df_blocks)
17 | df_blocks = df_blocks[df_blocks['num_unique_author_inblock'] > minimal_uniq_author_in_block]
18 | num_instances1 = len(df_blocks)
19 | print('removed %d instances so that each block contains more than one unique author' % (num_instances - num_instances1))
20 |
21 | print(df_blocks.shape)
22 |
23 |
24 | # Note #############################################################################################
25 | # Note test the performance of MAG author identifier
26 | # Note the clustering evaluation can not provide the Random baseline because it can not generate the ``labels_pred``
27 |
28 | def data_precision_round(arr, precision=2, pctg=True):
29 |     return [round(x * 100 if pctg else x, precision) for x in arr]
30 |
31 |
32 | mag_metrics = []
33 | for index, row in tqdm(df_blocks.iterrows(), total=df_blocks.shape[0]):
34 |     block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock = row
35 |     # print('block-size: %d' % len(pm_aos))
36 |
37 |     # note calculate the paired-F1 and the B3-F1 score
38 |     mag_metrics_b3 = metrics.b3_precision_recall_fscore(labels_true=ground_truths, labels_pred=mag_preds)
39 |     mag_metrics_pairedf = metrics.paired_precision_recall_fscore(labels_true=ground_truths, labels_pred=mag_preds)
40 |
41 |     mag_metrics.append([block_name] + data_precision_round(list(mag_metrics_pairedf + mag_metrics_b3)))
42 |
43 | # Note using block_name as the index row
44 | result_file = os.path.join(cached_dir, 'clustering-results-lagos-and-MAG-AID.tsv')
45 | columns = ['Block', 'pP', 'pR', 'pF', 'bP', 'bR', 'bF']
46 | df = pd.DataFrame(mag_metrics, columns=columns)
47 | df.to_csv(result_file, sep='\t')
48 | mean_metrics = df._get_numeric_data().mean()
49 | print(mean_metrics)
50 | print(columns)
51 | print('minimal_uniq_author_in_block: ', minimal_uniq_author_in_block,
52 | data_precision_round(mean_metrics.values.tolist(), pctg=False))
53 |
--------------------------------------------------------------------------------
/src/eutilities/name/name_parser.py:
--------------------------------------------------------------------------------
1 | import threading
2 |
3 | import personnamenorm as pnn
4 | from nameparser import HumanName
5 |
6 |
7 | def derek73_nameparser(name_str):
8 |     name = HumanName(name_str)
9 |     # name.as_dict()
10 |     return [name.first, name.middle, name.last]
11 |
12 |
13 | threadLocal = threading.local()
14 |
15 |
16 | def thread_local_init():
17 |     initialized = getattr(threadLocal, 'initialized', None)
18 |     if initialized is None:
19 |         print('init thread local and loaded pickle data')
20 |         threadLocal.personnamenorm = pnn.namenorm('cached/p_firstname.p')
21 |         threadLocal.initialized = True
22 |     else:
23 |         # print('has inited thread local')
24 |         pass
25 |
26 |
27 | class NameProcessor():
28 |     def __init__(self):
29 |         pass
30 |
31 |     def __call__(self, au):
32 |         thread_local_init()
33 |         if au is None or len(au) == 0:
34 |             return []
35 |         else:
36 |             splited_au = []
37 |             for pos, au_name in au:
38 |                 # print(id(threadLocal.personnamenorm))
39 |                 personnamenorm = threadLocal.personnamenorm
40 |                 personnamenorm.unify(au_name)
41 |                 splited_au.append([pos, ' '.join(personnamenorm.name['firstname']).lower(), 'merged_to_fn',
42 |                                    ' '.join(personnamenorm.name['lastname']).lower()])
43 |             # print(current_thread().name, splited_au)
44 |             return splited_au
45 |
46 |
47 | personnamenorm = pnn.namenorm('cached/p_firstname.p')
48 |
49 |
50 | def klauslippert_personnamenorm(name_str):
51 |     personnamenorm.unify(name_str)
52 |     # print(name)
53 |     return [' '.join(personnamenorm.name['firstname']).lower(), 'merged_to_fn',
54 |             ' '.join(personnamenorm.name['lastname']).lower()]
55 |
56 |
57 | if __name__ == '__main__':
58 |     names = ['Douglas H. Keefe', 'Carolina Abdala', 'Ram C. Naidu', 'David C. Mountain', 'Christopher A. Shera',
59 |              'John J. Guinan', 'Bernhard Ross', 'Kelly L. Tremblay', 'Terence W. Picton', 'Manfred Mauermann',
60 |              'Volker Hohmann', 'Richard L. Freyman', 'Karen S. Helfer', 'Uma Balakrishnan', 'Soha N. Garadat',
61 |              'Ruth Y. Litovsky', 'Michael A. Akeroyd', 'John Chambers', 'David Bullock', 'Alan R. Palmer',
62 |              'A. Quentin Summerfield']
63 |     for n in names:
64 |         print(n.lower(), derek73_nameparser(n), derek73_nameparser(n.lower()))
65 |         print(n.lower(), klauslippert_personnamenorm(n), klauslippert_personnamenorm(n.lower()))
66 |
--------------------------------------------------------------------------------
/src/model/regression.py:
--------------------------------------------------------------------------------
1 | import time
2 | import warnings
3 |
4 | import numpy as np
5 | from sklearn.ensemble import RandomForestRegressor
6 | from sklearn.linear_model import LogisticRegression, LinearRegression
7 | from sklearn.tree import DecisionTreeRegressor
8 |
9 | from model.available_model import ModelName
10 |
11 | warnings.filterwarnings('ignore')
12 |
13 |
14 | def use_regression(X_train, Y_train, X_test, model_switch: str):
15 |     if model_switch == ModelName.linear:
16 |         pred_y, feature_importance = linear_regressor(X_train, Y_train, X_test)
17 |     elif model_switch == ModelName.logistic:
18 |         pred_y, coefs, _ = logistic_regressor(X_train, Y_train, X_test)
19 |         feature_importance = np.array(coefs[0])
20 |     elif model_switch == ModelName.dt:
21 |         pred_y, feature_importance = dt_regressor(X_train, Y_train, X_test)
22 |     elif model_switch == ModelName.randomforest:
23 |         pred_y, feature_importance = randomforest_regressor(X_train, Y_train, X_test)
24 |     else:
25 |         pass
26 |     return pred_y, feature_importance
27 |
28 |
29 | def linear_regressor(X_train, Y_train, X_test):
30 |     model = LinearRegression()
31 |     model.fit(X_train, Y_train)
32 |     s = time.time()
33 |     y_pred = model.predict(X_test)
34 |     print('used time: ', time.time() - s)
35 |     return y_pred, model.coef_
36 |
37 |
38 | def logistic_regressor(X_train, Y_train, X_test):
39 |     # model = LogisticRegression(max_iter=1000, solver='newton-cg', tol=1e-5)
40 |     model = LogisticRegression(max_iter=1000, tol=1e-4, class_weight='balanced', C=2)
41 |     model.fit(X_train, Y_train)
42 |     s = time.time()
43 |     y_pred = model.predict_proba(X_test)
44 |     print('used time: ', time.time() - s)
45 |     # metrics = calc_metrics(Y_train, [p1 for (p0, p1) in y_pred])
46 |     # pprint(metrics, pctg=True)
47 |     y_pred = [p1 for (p0, p1) in y_pred]
48 |     print(model.coef_, model.intercept_)
49 |     return y_pred, model.coef_, model.intercept_
50 |
51 |
52 | def dt_regressor(X_train, Y_train, X_test):
53 |     model = DecisionTreeRegressor(max_depth=6)  # max_depth=,
54 |     model.fit(X_train, Y_train)
55 |     # depth = model.get_depth()
56 |     # for i in range(depth):
57 |     #     print(model.get_params(i+1))
58 |     s = time.time()
59 |     y_pred = model.predict(X_test)
60 |     print('used time: ', time.time() - s)
61 |     return y_pred, model.feature_importances_
62 |
63 |
64 | def randomforest_regressor(X_train, Y_train, X_test):
65 |     model = RandomForestRegressor(n_estimators=100)
66 |     model.fit(X_train, Y_train)
67 |     # s = time.time()
68 |     y_pred = model.predict(X_test)
69 |     # print('used time: ', time.time() - s)
70 |     return y_pred, model.feature_importances_
71 |
--------------------------------------------------------------------------------
/src/model/classification.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import numpy as np
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.linear_model import LinearRegression, LogisticRegression
6 | from sklearn.tree import DecisionTreeClassifier
7 |
8 | from model.available_model import ModelName
9 |
10 | warnings.filterwarnings('ignore')
11 |
12 |
13 | def use_classifier(X_train, Y_train, X_test, model_switch: str):
14 |     if model_switch == ModelName.linear:
15 |         pred_y, feature_importance = linear_classifier(X_train, Y_train, X_test)
16 |     elif model_switch == ModelName.logistic:
17 |         pred_y, coefs, _ = logistic_classifier(X_train, Y_train, X_test)
18 |         feature_importance = np.array(coefs[0])
19 |     elif model_switch == ModelName.dt:
20 |         pred_y, feature_importance = dt_classifier(X_train, Y_train, X_test)
21 |     elif model_switch == ModelName.randomforest:
22 |         pred_y, feature_importance = randomforest_classifier(X_train, Y_train, X_test)
23 |     else:
24 |         pass
25 |     return pred_y, feature_importance
26 |
27 |
28 | def linear_classifier(X_train, Y_train, X_test):
29 |     model = LinearRegression()
30 |     model.fit(X_train, Y_train)
31 |     y_pred = model.predict(X_test)
32 |     y_pred = [1 if y >= 0.5 else 0 for y in y_pred]
33 |     return y_pred, model.coef_
34 |
35 |
36 | def logistic_classifier(X_train, Y_train, X_test):
37 |     model = LogisticRegression(max_iter=1000, tol=1e-4, class_weight='balanced', C=2)
38 |     model.fit(X_train, Y_train)
39 |     y_pred = model.predict_proba(X_test)
40 |     y_pred = [p1 for (p0, p1) in y_pred]
41 |     y_pred = [1 if y >= 0.5 else 0 for y in y_pred]
42 |     return y_pred, model.coef_, model.intercept_
43 |
44 |
45 | def dt_classifier(X_train, Y_train, X_test):
46 |     model = DecisionTreeClassifier(ccp_alpha=0,
47 |                                    criterion='gini',
48 |                                    max_depth=5,
49 |                                    max_features=None)
50 |
51 |     model.fit(X_train, Y_train)
52 |     y_pred = model.predict(X_test)
53 |     return y_pred, model.feature_importances_
54 |
55 |
56 | def randomforest_classifier(X_train, Y_train, X_test):
57 |     model = RandomForestClassifier(n_estimators=100,
58 |                                    criterion="gini",
59 |                                    max_depth=None,
60 |                                    min_samples_split=2,
61 |                                    min_samples_leaf=1,
62 |                                    max_features='auto',  # "auto" class_weight='balanced'
63 |                                    )
64 |     model.fit(X_train, Y_train)
65 |     y_pred = model.predict_proba(X_test)
66 |     y_pred = y_pred[:, 1]
67 |     return y_pred, model
68 |
--------------------------------------------------------------------------------
/dataset/Aminer-Zhang/check.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | d1 = json.load(open('na-data-kdd18/data/global/name_to_pubs_train_500.json'))
4 | print(len(d1), d1.keys())
5 |
6 | d2 = json.load(open('na-data-kdd18/data/global/name_to_pubs_test_100.json'))
7 | print(len(d2), d2.keys())
8 |
9 | d = {}
10 | d.update(d1)
11 | d.update(d2)
12 | print(len(d), d.keys())
13 | assert len(d) == len(d1) + len(d2)
14 |
15 | num_blocks = len(d)
16 | # for k,v in d.items():
17 | # print(k, v)
18 | num_author_group = sum([len(list(v.keys())) for k, v in d.items()])
19 | citation_with_author_order = sum([sum([b for a, b in v.items()], []) for k, v in d.items()], [])
20 | citations = [n[:n.index('-')] for n in citation_with_author_order]
21 | num_citation = len(set(citations))
22 |
23 | print('num_block: %d' % num_blocks)
24 | print('num_author_group: %d' % num_author_group)
25 | print('num_citation: %d' % num_citation)
26 |
27 | block_author_papers = sum([[(k, n) for n in sum([b for _, b in v.items()], [])] for k, v in d.items()], [])
28 | print(block_author_papers[:10])
29 |
30 | # read bibliographic author name
31 | pubs = json.load(open('na-data-kdd18/data/global/pubs_raw.json'))
32 | pubs_author_name = []
33 | for k, v in pubs.items():
34 |     if 'authors' in v:
35 |         for i, a in enumerate(v['authors']):
36 |             pubs_author_name.append((k + '-' + str(i), a['name'].lower()))
37 | print(len(pubs_author_name), pubs_author_name[:10])
38 | pubs_author_name = dict(pubs_author_name)
39 | print(len(pubs_author_name))
40 |
41 | matched_names = []
42 | from collections import Counter
43 |
44 | lastnames = []
45 | for a, b in block_author_papers:
46 |     biblio_name = pubs_author_name.get(b)
47 |     if biblio_name is None:
48 |         continue
49 |     lastname = a.split('_')[-1]
50 |     lastnames.append(lastname)
51 |     a = a.replace('_', ' ').replace(' ', '')
52 |     biblio_name = biblio_name.replace('.', '').replace('-', ' ').replace(' ', '')
53 |     # have verified there is no case of full name variation in this dataset
54 |     if a != biblio_name:
55 |         print(a, biblio_name)
56 |
57 |     matched_names.append([a, biblio_name])
58 |
59 | print(Counter(lastnames))
60 | # convert json to csv
61 | res = []
62 | pubs_author_name = []
63 | for k, v in pubs.items():
64 |     authors, title, venue, year, abstract = v.get('authors'), v.get('title'), v.get('venue'), v.get('year'), v.get(
65 |         'abstract')
66 |     if abstract is not None:
67 |         abstract = abstract.replace('\t', ' ').replace('\n', '')
68 |     res.append([authors, title, venue, year, abstract])
69 |
70 | import pandas as pd
71 |
72 | pd.DataFrame(res, columns=['authors', 'title', 'venue', 'year', 'abstract']).to_csv('aminer-zhang-csv.csv', sep='\t',
73 | index=None)
74 |
--------------------------------------------------------------------------------
/dataset/Aminer-WhoisWho/to_table_and_check.py:
--------------------------------------------------------------------------------
1 | import json
2 | from itertools import groupby
3 |
4 | import pandas as pd
5 |
6 | train_author = json.load(open('./train_author.json'))
7 | # print(train_author)
8 | author_name_aid_pid = []
9 | for author_name in train_author.keys():
10 |     for aid, pids in train_author[author_name].items():
11 |         # print(author_name, aid, pids)
12 |         # author_name_aid_pids.append([author_name, aid, '|'.join(pids)])
13 |         for pid in pids:
14 |             assert len(pid) > 0
15 |             author_name_aid_pid.append([author_name, aid, pid])
16 |
17 | print(len(author_name_aid_pid))
18 | train_pub = json.load(open('./train_pub.json'))
19 | pubs = {}
20 | for pid in train_pub.keys():
21 |     paper_info = train_pub[pid]
22 |     id = paper_info['id'] if paper_info.get('id') is not None else ''
23 |     authors = paper_info['authors'] if paper_info.get('authors') is not None else ''
24 |     title = paper_info['title'] if paper_info.get('title') is not None else ''
25 |     abstract = paper_info['abstract'].replace('\t', ' ').replace('\n', ' ') if paper_info.get(
26 |         'abstract') is not None else ''
27 |     keywords = paper_info['keywords'] if paper_info.get('keywords') is not None else ''
28 |     venue = paper_info['venue'] if paper_info.get('venue') is not None else ''
29 |     year = paper_info['year'] if paper_info.get('year') is not None else ''
30 |     assert pid == id
31 |     # print([pid, id, authors, title, abstract, keywords, venue, year])
32 |     assert pubs.get(pid) == None
33 |     pubs[pid] = [pid, authors, title, abstract, keywords, venue, year]
34 |
35 | author_name_aid_pid_pub = [item + pubs.get(item[-1])[1:] if pubs.get(item[-1]) is not None else [''] * 6 for item in
36 | author_name_aid_pid]
37 |
38 |
39 | def convert_to_author_list(n):
40 |     res = [m['name'].lower().replace('-', '').replace('.', '').replace(' ', '') for m in n[3]]
41 |     return res
42 |
43 |
44 | num_variation = 0
45 | num_all = 0
46 | for k, g in groupby(author_name_aid_pid_pub, lambda s: s[1]):
47 |     for n in g:
48 |         num_all += 1
49 |         author_list = convert_to_author_list(n)
50 |         split = n[0].split('_')
51 |         au_name0 = split[-1] + split[0]
52 |         au_name = n[0].replace('_', '')
53 |         # last_name = n[0].split('_')[-1].strip()
54 |         # TODO
55 |         # if au_name not in author_list:
56 |         if au_name not in author_list and au_name0 not in author_list:
57 |             print(n[0], author_list)
58 |             num_variation += 1
59 |
60 | print(num_variation, num_all, num_variation / num_all)
61 | # last name frequency
62 | from collections import Counter
63 |
64 | print(Counter([n[0].split('_')[-1] for n in author_name_aid_pid_pub]))
65 |
66 | pd.DataFrame(author_name_aid_pid_pub,
67 | columns=['author_name', 'aid', 'pid', 'authors', 'title', 'abstract', 'keywords', 'venue', 'year']).to_csv(
68 | 'train_author_pub.tsv', sep='\t',
69 | index=None)
70 |
--------------------------------------------------------------------------------
/src/eutilities/name/name_parser_by_socket.py:
--------------------------------------------------------------------------------
1 | import json
2 | import socket
3 | import traceback
4 |
5 | from mytookit.data_reader import DBReader
6 |
7 |
8 | def recvall(sock_cli):
9 |     BUFF_SIZE = 4096  # 4 KiB
10 |     data = b''
11 |     while True:
12 |         part = sock_cli.recv(BUFF_SIZE)
13 |         data += part
14 |         if len(part) < BUFF_SIZE:
15 |             # either 0 or end of data
16 |             break
17 |     return data
18 |
19 |
20 | def insert_batch_data(batch_insert_data):
21 |     paper_names = [[m[1] for m in n[1]] for n in batch_insert_data]
22 |     json_str = json.dumps({'names': paper_names}, ensure_ascii=False)
23 |
24 |     client = socket.socket()
25 |     client.connect(('localhost', 38081))
26 |     # print('connect successfully')
27 |
28 |     client.send(json_str.encode("utf-8"))
29 |     res = recvall(client)
30 |     process_names = json.loads(res.decode())
31 |     client.close()
32 |     print(len(process_names), len(batch_insert_data))
33 |     assert len(process_names) == len(batch_insert_data)
34 |     batch_insert_data = [n + [process_names[i]] for i, n in enumerate(batch_insert_data)]
35 |
36 |     v = DBReader.tcp_client.execute(
37 |         query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES",
38 |         params=batch_insert_data)
39 |     print('has inserted %d instances' % v)
40 |
41 |
42 | df_s2 = DBReader.tcp_model_cached_read("XXXX",
43 |                                        """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_s2_paper_linkage""",
44 |                                        cached=False)
45 | print(df_s2.shape)
46 | batch_insert_data = []
47 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_s2.iterrows():
48 |     if i > 0 and i % 10000 == 0:
49 |         # trigger inserting data
50 |         insert_batch_data(batch_insert_data)
51 |         batch_insert_data = []
52 |     batch_insert_data.append([pid, biblio_authors, doi, orcid, orcid_names, 'S2', 'joshfraser-NameParser'])
53 |
54 | if len(batch_insert_data) != 0:
55 |     insert_batch_data(batch_insert_data)
56 | print('inserted completed!')
57 |
58 | # delete this obj for saving RAM
59 | if df_s2 is not None:
60 |     del df_s2
61 |
62 | df_mag = DBReader.tcp_model_cached_read("XXXX",
63 |                                         """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_mag_paper_linkage""",
64 |                                         cached=False)
65 | print(df_mag.shape)
66 | batch_insert_data = []
67 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_mag.iterrows():
68 |     if i > 0 and i % 10000 == 0:
69 |         # trigger insert here
70 |         try:
71 |             insert_batch_data(batch_insert_data)
72 |         except:
73 |             traceback.print_exc()
74 |
75 |         batch_insert_data = []
76 |     batch_insert_data.append([str(pid), biblio_authors, doi, orcid, orcid_names, 'MAG', 'joshfraser-NameParser'])
77 |
78 | if len(batch_insert_data) != 0:
79 |     insert_batch_data(batch_insert_data)
80 | print('inserted completed!')
81 |
--------------------------------------------------------------------------------
/src/eutilities/name/name_parser_by_localscript.py:
--------------------------------------------------------------------------------
1 | from eutilities.name.name_parser import derek73_nameparser, klauslippert_personnamenorm
2 | from mytookit.data_reader import DBReader
3 |
4 | name_parser_method = 'derek73'
5 | # name_parser_method = 'klauslippert'
6 |
7 | def split_ltiple_biblio_authors(au):
8 |     if au is None or len(au) == 0:
9 |         return []
10 |     else:
11 |         splited_au = []
12 |         for pos, au_name in au:
13 |             if name_parser_method == 'derek73':
14 |                 name_parts = derek73_nameparser(au_name)
15 |             else:
16 |                 name_parts = klauslippert_personnamenorm(au_name)
17 |             splited_au.append([pos, name_parts[0], name_parts[1], name_parts[2]])
18 |         return splited_au
19 |
20 |
21 | df_s2 = DBReader.tcp_model_cached_read("XXXX",
22 |                                        """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_s2_paper_linkage""",
23 |                                        cached=False)
24 | print(df_s2.shape)
25 | batch_insert_data = []
26 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_s2.iterrows():
27 |     if i > 0 and i % 100000 == 0:
28 |         # trigger insert here
29 |         v = DBReader.tcp_client.execute(
30 |             query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES",
31 |             params=batch_insert_data)
32 |         print('has inserted %d instances' % v)
33 |         batch_insert_data = []
34 |     batch_insert_data.append(
35 |         [pid, biblio_authors, doi, orcid, orcid_names, 'S2', name_parser_method,
36 |          split_ltiple_biblio_authors(biblio_authors)])
37 |
38 | if len(batch_insert_data) != 0:
39 |     v = DBReader.tcp_client.execute(query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES",
40 |                                     params=batch_insert_data, types_check=True)
41 |     print('has inserted the last %d instances' % v)
42 | print('inserted completed!')
43 |
44 | # delete this obj for saving RAM
45 | if df_s2 is not None:
46 |     del df_s2
47 |
48 | df_mag = DBReader.tcp_model_cached_read("XXXX",
49 |                                         """select pid, biblio_authors, doi, orcid, orcid_names from and_ds.orcid_mag_paper_linkage""",
50 |                                         cached=False)
51 | print(df_mag.shape)
52 | batch_insert_data = []
53 | for i, (pid, biblio_authors, doi, orcid, orcid_names) in df_mag.iterrows():
54 |     if i > 0 and i % 100000 == 0:
55 |         # trigger insert here
56 |         v = DBReader.tcp_client.execute(
57 |             query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES",
58 |             params=batch_insert_data)
59 |         print('has inserted %d instances' % v)
60 |         batch_insert_data = []
61 |     batch_insert_data.append(
62 |         [str(pid), biblio_authors, doi, orcid, orcid_names, 'MAG', name_parser_method,
63 |          split_ltiple_biblio_authors(biblio_authors)])
64 |
65 | if len(batch_insert_data) != 0:
66 |     v = DBReader.tcp_client.execute(query="insert into and_ds.orcid_mag_s2_author_name_split_by_various_algorithms VALUES",
67 |                                     params=batch_insert_data, types_check=True)
68 |     print('has inserted the last %d instances' % v)
69 | print('inserted completed!')
70 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Dataset
2 |
3 | The dataset is available from [here](https://zenodo.org/record/7313380) and is provided in two forms. The first is the full-name-block form, which arranges the dataset by ORCID iD and shared full name. The second is the pairwise form, in which ambiguous authors are arranged in pairs so that classifier-based models can use it to capture the similarity between two authors.
4 |
5 | #### Our dataset v.s. Existing datasets
6 | Creating a new dataset is painful. In AND research, all existing datasets were created by human annotators, and most are either limited in scale or biased. Our dataset overcomes these problems: no human intervention is needed to build it. Moreover, by using the two comprehensive resources, the publishing history of a specific author (query DOIs by ORCID iD) and the authors of a specific paper (query ORCID iDs by DOI) can be easily and credibly identified. Thus, given the large number of records in these credible resources, a large-scale dataset can be built. More importantly, the dataset reflects more realistic aspects than existing datasets. It passed a series of rigorous gold-standard validations, the two most important of which concern synonym patterns and domains: the dataset shows a degree of last-name variation and a coverage of research areas similar to those of the entire MAG.
7 |
8 | #### Dataset Structure
9 | The block-based dataset contains the following fields:
10 |
11 | | Field | Data Type |
12 | |------------------------------|---------------|
13 | | block_fullname | String |
14 | | author_group_orcid | String |
15 | | author_group_idx_in_block | Int |
16 | | citation_idx_in_author_group | Int |
17 | | doi | String |
18 | | pid | Int |
19 | | author_position | Int |
20 | | author_name | String |
21 | | author_affiliation | String |
22 | | coauthors | String Array |
23 | | coauthor_affliations | String Array |
24 | | venue | String |
25 | | pub_year | Int |
26 | | paper_title | String |
27 | | paper_abstract | String |
28 |
29 | "block_fullname" is taken from the credible full name (CFN) from the ORCID system, it is used to represent the block. Due to the fact that more than one authors can exist in a block, "author_group_orcid" is the ORCID iD of a specific author in a block, it is used to represent a group of citations (CG) that authored by this author, and "author_group_idx_in_block" denotes the order of CGs in a block. Similarly, "citation_idx_in_author_group" denotes the order of citation in a CG. "pid" is the paper ID in Microsoft Academic and Microsoft Academic Graph. "author_position" is identified by heuristics.
30 |
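For a quick look at the block-based form, it can be loaded with pandas. The snippet below is only a minimal sketch: it assumes the table has been exported to a tab-separated file named `lagos_and_block.tsv` with the columns listed above (the actual file names and packaging in the Zenodo archive may differ).

```python
import pandas as pd

# Minimal sketch; 'lagos_and_block.tsv' is a placeholder file name.
df = pd.read_csv('lagos_and_block.tsv', sep='\t')

# A block is identified by the shared full name; each author group (CG)
# within a block is identified by its ORCID iD.
print('blocks:', df['block_fullname'].nunique())
print('author groups:', df.groupby(['block_fullname', 'author_group_orcid']).ngroups)
print('citations:', len(df))

# Citations of one author group, in the order given by the dataset.
block, orcid = df.iloc[0][['block_fullname', 'author_group_orcid']]
group = df[(df['block_fullname'] == block) & (df['author_group_orcid'] == orcid)]
print(group.sort_values('citation_idx_in_author_group')[['pid', 'doi', 'paper_title']])
```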
31 | ### Citation
32 | If you use the dataset, method, or model, please consider citing it.
33 | ```bibtex
34 | @article{zhang2021lagos,
35 | title={LAGOS-AND: A Large, Gold Standard Dataset for Scholarly Author Name Disambiguation},
36 | author={Zhang, Li and Lu, Wei and Yang, Jinqing},
37 | journal={arXiv preprint arXiv:2104.01821},
38 | year={2021}
39 | }
40 | ```
41 |
--------------------------------------------------------------------------------
/src/eutilities/train_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import numpy as np
4 | import torch
5 | from sklearn import metrics
6 | from tqdm import tqdm
7 |
8 |
9 | def train(model, train_loader, criterion, optimizer, epoch, epochs, train_vector, logs_per_epoch=10,
10 | device=torch.device('cuda')):
11 | model.train()
12 | train_loss = 0
13 | num_batches = len(train_loader)
14 | start = time.time()
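    # each batch yields metadata (MT, unused during training), the two tokenized inputs (XL, XR) and the label (Y)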
15 | for batch_idx, (MT, XL, XR, Y) in enumerate(train_loader):
16 | # HF, XL, XR, Y = HF.to(device), XL.to(device), XR.to(device), Y.to(device)
17 | XL, XR, Y = XL.to(device), XR.to(device), Y.to(device)
18 | optimizer.zero_grad()
19 | # output = model([HF, XL, XR])
20 | output = model([XL, XR])
21 | loss = criterion(output, Y)
22 | train_loss += loss.item()
23 | loss.backward()
24 | optimizer.step()
25 |
26 |         if batch_idx % max(1, num_batches // logs_per_epoch) == 0 and batch_idx > 0:
27 | now = time.time()
28 | batch_size = len(Y)
29 | inputs_per_sec = ((batch_idx + 1) * batch_size) / (now - start)
30 | eta_min = (epochs * num_batches - (epoch - 1) * num_batches - (
31 | batch_idx + 1)) * batch_size / inputs_per_sec / 60
32 | print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tInputs/s: {:.1f}\tRemaining: {:.1f} min'.format(
33 | epoch, batch_idx * len(Y), len(train_loader.dataset),
34 | 100. * batch_idx / len(train_loader), loss.data.item(), inputs_per_sec, eta_min))
35 |
36 | train_loss /= len(train_loader)
37 | train_vector.append(train_loss)
38 |
39 |
40 | def validate(model, test_loader, criterion, loss_vector, f1_vector=[], device=torch.device('cuda'), switch_input=False):
41 | model.eval()
42 | val_loss = 0
43 | metadata = []
44 | prediction = torch.tensor([], device=device)
45 | true_labels = torch.tensor([], device=device)
46 | print('\nValidating...')
47 | with torch.no_grad():
48 | for (MT, XL, XR, Y) in tqdm(test_loader):
49 | metadata.append(np.array([n.cpu().numpy() for n in MT]))
50 | # HF, XL, XR, Y = HF.to(device), XL.to(device), XR.to(device), Y.to(device)
51 | XL, XR, Y = XL.to(device), XR.to(device), Y.to(device)
52 | # output = model([HF, XL, XR])
53 | if switch_input:
54 | output = model([XR, XL])
55 | else:
56 | output = model([XL, XR])
57 |
58 | val_loss += criterion(output, Y).data.item()
59 |
60 |             # apply a sigmoid when the criterion expects raw logits (BCEWithLogitsLoss); otherwise use the output as-is
61 |             pred = output.sigmoid() if isinstance(criterion, torch.nn.BCEWithLogitsLoss) else output
62 |
63 | prediction = torch.cat((prediction, pred))
64 | true_labels = torch.cat((true_labels, Y))
65 |
66 | if output.size(-1) == 2:
67 | true_label_numpy = [int(n[1]) for n in true_labels.cpu().numpy()]
68 | pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()]
69 | pred_prob = [n[1] for n in prediction.cpu().numpy()]
70 | else:
71 | true_label_numpy = [int(n) for n in true_labels.cpu().numpy()]
72 | pred_label_numpy = [1 if n > 0.5 else 0 for n in prediction.cpu().numpy()]
73 | pred_prob = [n[-1] for n in prediction.cpu().numpy()]
74 | print(pred_prob[:100])
75 | accuracy = metrics.accuracy_score(true_label_numpy, pred_label_numpy)
76 | f1_score = metrics.f1_score(true_label_numpy, pred_label_numpy)
77 | macro_f1_score = metrics.f1_score(true_label_numpy, pred_label_numpy, average='macro')
78 | val_loss /= len(test_loader)
79 | loss_vector.append(val_loss)
80 | f1_vector.append(f1_score)
81 | print('Validation set: Average loss: {:.4f}\t Accuracy: {:.4f}\t F1-score: {:.4f}\t Macro-F1-score: {:.4f}\n'.format(val_loss,
82 | accuracy,
83 | f1_score,
84 | macro_f1_score))
85 | metadata = np.hstack(metadata)
86 | return metadata, true_label_numpy, pred_label_numpy, pred_prob
87 |
--------------------------------------------------------------------------------
/src/eutilities/MAGdata/parse_fos_from_mag_kg.py:
--------------------------------------------------------------------------------
1 | # download MAG KG from https://zenodo.org/record/3930398#.X9YvjnYzY5ll
2 | import traceback
3 |
4 | fos_names = set(map(lambda x: x.lower(),
5 | ['Medicine', 'Biology', 'Chemistry', 'Computer Science', 'Engineering', 'Physics',
6 | 'Materials Science',
7 | 'Psychology', 'Mathematics', 'History', 'Sociology', 'Art', 'Political Science', 'Geography',
8 | 'Economics',
9 | 'Business', 'Geology', 'Philosophy', 'Environmental Science']))
10 | file_name = 'FieldsOfStudy.nt'
11 | fos_id_name_dict = {}
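# each N-Triples line has the form '<subject> <predicate> "object"^^<datatype> .', so splitting on '>' yields four parts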
12 | for line in open(file_name):
13 | splt = line.strip().split('>')
14 | assert len(splt) == 4
15 | if 'name' in splt[1]:
16 | temp = splt[2]
17 | fos = temp[:temp.index('^')].replace('\"', '').strip().lower()
18 | if fos in fos_names:
19 | fos_id = splt[0][splt[0].index('entity/') + 7:].strip()
20 | fos_id_name_dict[fos_id] = fos
21 |
22 | # cat FieldsOfStudy.nt | grep 'level> "0"'
23 | # "0"^^ .
24 | # "0"^^ .
25 | # "0"^^ .
26 | # "0"^^ .
27 | # "0"^^ .
28 | # "0"^^ .
29 | # "0"^^ .
30 | # "0"^^ .
31 | # "0"^^ .
32 | # "0"^^ .
33 | # "0"^^ .
34 | # "0"^^ .
35 | # "0"^^ .
36 | # "0"^^ .
37 | # "0"^^ .
38 | # "0"^^ .
39 | # "0"^^ .
40 | # "0"^^ .
41 | # "0"^^ .
42 |
43 | # assert len(fos_id_name_dict) == 19
44 | # {'95457728': 'history', '127313418': 'geology', '162324750': 'economics', '205649164': 'geography', '185592680': 'chemistry',
45 | # '138885662': 'philosophy', '144024400': 'sociology', '192562407': 'materials science', '33923547': 'mathematics',
46 | # '86803240': 'biology', '41008148': 'computer science', '17744445': 'political science', '127413603': 'engineering',
47 | # '15744967': 'psychology', '39432304': 'environmental science', '144133560': 'business', '121332964': 'physics',
48 | # '71924100': 'medicine', '142362112': 'art'}
49 |
50 | file_name1 = 'paper_fos_parsed_using_awk.txt'
51 | with open('mag_paper_top_level_fos.tsv.1', 'w') as fw:
52 |     for line in open(file_name1):
53 |         try:
54 |             pid, fos_id = line.strip().split(' ')
55 |             if fos_id in fos_id_name_dict:
56 |                 fw.write('\t'.join([pid, fos_id_name_dict[fos_id]]) + '\n')
57 |         except Exception:
58 |             traceback.print_exc()
59 |
60 | # file_name1 = 'mag_kg/PaperFieldsOfStudy.nt'
61 | # with open('mag_paper_top_level_fos.tsv', 'w') as fw:
62 | # for line in open(file_name1):
63 | # try:
64 | # splt = line.strip().split('>')
65 | # assert len(splt) == 4
66 | # pid = splt[0][splt[0].index('entity/') + 7:].strip()
67 | # fos_id = splt[2][splt[2].index('entity/') + 7:].strip()
68 | # if fos_id in fos_id_name_dict:
69 | # fw.write('\t'.join([pid, fos_id_name_dict[fos_id]]) + '\n')
70 | # except Exception as e:
71 | # traceback.print_exc()
--------------------------------------------------------------------------------
/src/feature/cluster/sparse_tfidf_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import joblib
4 | import numpy as np
5 | import torch
6 | from mytookit.data_reader import DBReader
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from tqdm import tqdm
9 |
10 | from myconfig import device, cached_dir
11 |
12 | sql_block = r'''
13 | select block_fullname as block_name,
14 | arrayMap(x->x[1],
15 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos,
16 | arrayMap(x->x[2], tmp) as ground_truths,
17 | arrayMap(x->x[3], tmp) as mag_preds
18 | from (select block_fullname,
19 | author_group_orcid,
20 | -- Note has verified all mag_author_id is successfully matched
21 | toString(aid) as mag_author_id,
22 | concat(toString(pid), '_', toString(author_position)) as pid_ao
23 | from and_ds.our_and_dataset_block any
24 | left join (
25 | select pid, aid, author_position
26 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
27 | from mag.paper_author_affiliation) any
28 | inner join and_ds.our_and_dataset_block using pid, author_position
29 | ) using pid, author_position)
30 | group by block_name
31 | having xxHash32(block_name) %% 10=%d
32 | order by length(pid_aos) desc;
33 | ;'''
34 |
35 | sql_metadata = r'''
36 | select concat(toString(pid), '_', toString(author_position)) as pid_ao,
37 | block_fullname,
38 | author_group_orcid as orcid,
39 | -- -- Note has verified all mag_author_id is successfully matched
40 | -- lowerUTF8(author_name) as author_name,
41 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation,
42 | -- coauthors,
43 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue,
44 | -- pub_year,
45 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content
46 | from and_ds.our_and_dataset_block any
47 | left join (
48 | select pid, aid, author_position
49 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
50 | from mag.paper_author_affiliation) any
51 | inner join and_ds.our_and_dataset_block using pid, author_position
52 | ) using pid, author_position
53 | where xxHash32(block_fullname) %% 10=%d
54 | '''
55 |
56 |
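# Note TfidfVectorizer L2-normalises each row by default, so X * X.T directly yields pairwise cosine similarities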
57 | def sparse_tensor_tfidf_similarity(documents):
58 | tfidf_csr = vectorizer.transform(documents)
59 |
60 | if len(documents) < 300:
61 |         # Note computed on the CPU
62 | m = tfidf_csr * tfidf_csr.T
63 | similarity = m.A
64 | else:
65 |         # Note computed on the GPU
66 | coo = tfidf_csr.tocoo()
67 | indices = np.vstack((coo.row, coo.col))
68 | st = torch.sparse.FloatTensor(torch.LongTensor(indices),
69 | torch.FloatTensor(coo.data),
70 | torch.Size(coo.shape)).to(device)
71 | # Note this feature require a high version of pytorch
72 | multipled_st = torch.sparse.mm(st, torch.transpose(st, 0, 1))
73 | similarity = multipled_st.to_dense().cpu().numpy()
74 |
75 | return similarity
76 |
77 |
78 | for seg in range(0, 10, 1):
79 | sql = sql_metadata % seg
80 | print(sql)
81 | # Note prepare the paper metadata dict
82 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False)
83 | print(df_metadata.shape)
84 | print(df_metadata.head())
85 |
86 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values))
87 | md_content_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].values))
88 |
89 | del df_metadata
90 |
91 | # Note generate the pairwise similarity
92 | documents = md_content_dict.values()
93 | vectorizer = TfidfVectorizer() # tokenizer=normalize, stop_words='english'
94 | print('fit tfidf model')
95 | vectorizer = vectorizer.fit(documents)
96 |
97 | all_block_feature = {}
98 | sql = sql_block % seg
99 | print(sql)
100 | df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False)
101 | print(df_block.shape)
102 |
103 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]):
104 | block_name, pid_aos, ground_truths, mag_preds = row
105 | documents = [md_content_dict[pid_ao] for pid_ao in pid_aos]
106 | tfidf_similarity = sparse_tensor_tfidf_similarity(documents)
107 | tfidf_similarity = np.array(tfidf_similarity, dtype=np.float16)
108 | all_block_feature[block_name] = tfidf_similarity
109 |
110 | joblib.dump(all_block_feature,
111 | filename=os.path.join(cached_dir, 'cluster_feature/tfidf-feature-%d.pkl' % seg))
112 |
--------------------------------------------------------------------------------
/src/feature/doc2vec_trainer.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import logging
3 | import os
4 | import random
5 |
6 | import numpy as np
7 | from gensim.models import doc2vec
8 | from mytookit.data_reader import DBReader
9 | from sklearn.model_selection import train_test_split
10 |
11 | from myconfig import cached_dir
12 |
13 | logging.basicConfig(
14 |     format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
15 | base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
16 | base_path = os.path.dirname(os.path.abspath(base_file_path))
17 | project_dir_path = os.path.dirname(os.path.abspath(base_path))
18 | classifiers_path = os.path.join(project_dir_path, 'classifiers')
30 |
31 |
32 | class doc2VecModel():
33 |
34 | def __init__(self):
35 | super().__init__()
36 |
37 | def initialize_model(self, corpus):
38 | logging.info("Building Doc2Vec vocabulary")
39 | self.corpus = corpus
40 | self.model = doc2vec.Doc2Vec(
41 | epochs=8,
42 | min_count=1,
43 | # Ignores all words with
44 | # total frequency lower than this
45 | window=10,
46 | # The maximum distance between the current
47 | # and predicted word within a sentence
48 | vector_size=200, # Dimensionality of the
49 | # generated feature vectors
50 | workers=12, # Number of worker threads to
51 | # train the model
52 | alpha=0.025, # The initial learning rate
53 | min_alpha=0.00025,
54 | # Learning rate will linearly drop to
55 | # min_alpha as training progresses
56 | dm=1)
57 | # dm defines the training algorithm.
58 | # If dm=1 means 'distributed memory' (PV-DM)
59 | # and dm =0 means 'distributed bag of words' (PV-DBOW)
60 | self.model.build_vocab(self.corpus)
61 |
62 | def train_model(self):
63 | logging.info("Training Doc2Vec model")
64 | for epoch in range(2):
65 | logging.info('Training iteration #{0}'.format(epoch))
66 | self.model.train(
67 | self.corpus, total_examples=self.model.corpus_count,
68 | epochs=self.model.epochs)
69 | # shuffle the corpus
70 | random.shuffle(self.corpus)
71 | # decrease the learning rate
72 | self.model.alpha -= 0.0002
73 | # fix the learning rate, no decay
74 | self.model.min_alpha = self.model.alpha
75 |
76 | def get_vectors(self, corpus_size, vectors_size, vectors_type):
77 | """
78 | Get vectors from trained doc2vec model
79 | :param doc2vec_model: Trained Doc2Vec model
80 | :param corpus_size: Size of the data
81 | :param vectors_size: Size of the embedding vectors
82 | :param vectors_type: Training or Testing vectors
83 | :return: list of vectors
84 | """
85 | vectors = np.zeros((corpus_size, vectors_size))
86 | for i in range(0, corpus_size):
87 | prefix = vectors_type + '_' + str(i)
88 | vectors[i] = self.model.docvecs[prefix]
89 | return vectors
90 |
91 | def save_model(self, model_path):
92 | logging.info("Doc2Vec model saved at: " + model_path)
93 | self.model.save(model_path)
94 |
95 | def label_sentences(corpus, label_type):
96 | """
97 | Gensim's Doc2Vec implementation requires each
98 | document/paragraph to have a label associated with it.
99 | We do this by using the LabeledSentence method.
100 | The format will be "TRAIN_i" or "TEST_i" where "i" is
101 | a dummy index of the review.
102 | """
103 | labeled = []
104 | for i, v in enumerate(corpus):
105 | label = label_type + '_' + str(i)
106 | labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
107 | return labeled
108 |
109 |
110 | def prepare_all_data(ds):
111 | x_train, x_test, y_train, y_test = train_test_split(ds.review, ds.sentiment, random_state=0, test_size=0.1)
112 |     x_train = label_sentences(x_train, 'Train')
113 |     x_test = label_sentences(x_test, 'Test')
114 | all_data = x_train + x_test
115 | return x_train, x_test, y_train, y_test, all_data
116 |
117 |
118 | if __name__ == "__main__":
119 | ds = DBReader.tcp_model_cached_read(os.path.join(cached_dir, 'doc2vec_train_corpus.pkl'),
120 | """select content from and_ds.doc2vec_train_corpus""",
121 | cached=False)
122 | print(ds.shape)
123 | print(ds.head())
124 | ds_content = list(ds['content'])
125 | ds_content = [item.split() for item in ds_content]
126 | print('training samples size:', len(ds_content))
127 | print('first 3 training samples:', ds_content[:3])
128 | corpus = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(ds_content)]
129 | # print(corpus[0:4])
130 | d2v = doc2VecModel()
131 | d2v.initialize_model(corpus)
132 | d2v.train_model()
133 | d2v.save_model(os.path.join(cached_dir, 'doc2vec_model'))
134 |
--------------------------------------------------------------------------------
/src/feature/cluster/doc2vec_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | from multiprocessing import Pool
3 |
4 | import joblib
5 | import torch
6 | from gensim.models import Doc2Vec
7 | from mytookit.data_reader import DBReader
8 | from nltk.corpus import stopwords
9 | from tqdm import tqdm
10 |
11 | from myconfig import cached_dir, device
12 |
13 | en_stopwords_set = set(stopwords.words('english'))
14 |
15 | sql_block = r'''
16 | select block_fullname as block_name,
17 | arrayMap(x->x[1],
18 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos,
19 | arrayMap(x->x[2], tmp) as ground_truths,
20 | arrayMap(x->x[3], tmp) as mag_preds
21 | from (select block_fullname,
22 | author_group_orcid,
23 | -- Note has verified all mag_author_id is successfully matched
24 | toString(aid) as mag_author_id,
25 | concat(toString(pid), '_', toString(author_position)) as pid_ao
26 | from and_ds.our_and_dataset_block any
27 | left join (
28 | select pid, aid, author_position
29 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
30 | from mag.paper_author_affiliation) any
31 | inner join and_ds.our_and_dataset_block using pid, author_position
32 | ) using pid, author_position)
33 | group by block_name
34 | having xxHash32(block_name) %% 10=%d
35 | order by length(pid_aos) desc;
36 | ;'''
37 |
38 | sql_metadata = r'''
39 | select concat(toString(pid), '_', toString(author_position)) as pid_ao,
40 | block_fullname,
41 | author_group_orcid as orcid,
42 | -- -- Note has verified all mag_author_id is successfully matched
43 | -- lowerUTF8(author_name) as author_name,
44 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation,
45 | -- coauthors,
46 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue,
47 | -- pub_year,
48 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content
49 | from and_ds.our_and_dataset_block any
50 | left join (
51 | select pid, aid, author_position
52 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
53 | from mag.paper_author_affiliation) any
54 | inner join and_ds.our_and_dataset_block using pid, author_position
55 | ) using pid, author_position
56 | where xxHash32(block_fullname) %% 10=%d
57 | '''
58 |
59 | # load doc2vec model
60 | print('begin load models... ')
61 | doc2vec_model = Doc2Vec.load(os.path.join(cached_dir, 'doc2vec_model'))
62 | print('end load models... ')
63 |
64 |
65 | def sim_matrix(a, b, eps=1e-8):
66 | """
67 | added eps for numerical stability
68 | """
69 | a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
70 | a_norm = a / torch.clamp(a_n, min=eps)
71 | b_norm = b / torch.clamp(b_n, min=eps)
72 | b_norm = b_norm.transpose(0, 1)
73 | # print(a_norm.shape, b_norm.shape)
74 | sim_mt = torch.mm(a_norm, b_norm)
75 | return sim_mt
76 |
77 |
78 | for seg in range(0, 10, 1):
79 | sql = sql_metadata % seg
80 | print(sql)
81 | # Note prepare the paper metadata dict
82 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='XXX', sql=sql, cached=False)
83 | print(df_metadata.shape)
84 | print(df_metadata.head())
85 |
86 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values))
87 |
88 | # md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values))
89 | # md_content_word_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(
90 | # lambda x: set([w for w in x.split(' ') if not w in en_stopwords_set])).values))
91 |
92 | # for index, (pid_ao, content) in tqdm(df_metadata[['pid_ao', 'content']].iterrows(), total=df_metadata.shape[0]):
93 | # doc2vec_model.infer_vector(content.split(' '), steps=12, alpha=0.025)
94 | # Note this step will be very slow
95 | # md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(
96 | # lambda x: doc2vec_model.infer_vector(x.split(' '), steps=12, alpha=0.025)).values))
97 |
98 | def infer_vector_worker(document):
99 | vector = doc2vec_model.infer_vector(document.split(' '), steps=12, alpha=0.025)
100 | return vector
101 |
102 |
103 | with Pool(processes=14) as pool:
104 | doc2vec_emds = pool.map(infer_vector_worker, df_metadata['content'].values)
105 |
106 | md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, doc2vec_emds))
107 | del df_metadata
108 |
109 | all_block_feature = {}
110 | sql = sql_block % seg
111 | print(sql)
112 | df_block = DBReader.tcp_model_cached_read(cached_file_path='XXX', sql=sql, cached=False)
113 | print(df_block.shape)
114 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]):
115 | block_name, pid_aos, ground_truths, mag_preds = row
116 |
117 | # Note calculate the similarity between different metadata according to pid_ao
118 | num_instances = len(pid_aos)
119 | # if ij % 10 == 0:
120 | # print(num_instances)
121 | embeddings = torch.tensor([md_doc2vec_emd_dict[pid_ao] for pid_ao in pid_aos], device=device)
122 | pairwise_feature_matrix = sim_matrix(embeddings, embeddings)
123 | pairwise_feature_matrix = pairwise_feature_matrix.cpu().numpy()
124 |
125 | all_block_feature[block_name] = pairwise_feature_matrix
126 |
127 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir, 'cluster_feature/doc2vec-feature-%d.pkl' % seg))
128 |
--------------------------------------------------------------------------------
/src/feature/pairwise/our_dataset_to_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | from multiprocessing import Pool
3 |
4 | import sys
5 |
6 | import joblib
7 | from scipy import stats
8 |
9 | sys.path.append('../../')
10 |
11 | import pandas as pd
12 | from gensim.models import Doc2Vec
13 | from mytookit.data_reader import DBReader
14 | from scipy.spatial.distance import cosine
15 | from sklearn.feature_extraction.text import TfidfVectorizer
16 |
17 | from eutilities.string_utils import jaccard_similarity, extract_word_list, ngram_sequence, \
18 | convert_unicode_to_ascii
19 | from myconfig import cached_dir
20 |
21 | df = DBReader.tcp_model_cached_read("XXXX",
22 | sql="select * from and_ds.our_and_dataset_pairwise_gold_standard;",
23 | cached=False)
24 | print('df.shape', df.shape)
25 |
26 | # Expected column order (it must match the tuple unpacking in extract_pairwise_feature below):
27 | # fullname, pid1, ao1, pid2, ao2, coauthor1, aid1, author_names1, aff_arr1, aff_id_arr1, paper_title1, abstract1, venue1, pub_year1,
28 | # coauthor2, aid2, author_names2, aff_arr2, aff_id_arr2, paper_title2, abstract2, venue2, pub_year2, same_author, train1_test0_val2
29 | columns = df.columns.values
30 | print(len(columns), columns)
31 | h, w = df.shape
32 |
33 |
34 | def concat_title_abstract(row):
35 | return ' '.join([str(n) for n in row.values]).lower()
36 |
37 |
38 | documents = list(
39 | df[['paper_title1', 'abstract1']].apply(concat_title_abstract, axis=1).values) + list(
40 | df[['paper_title2', 'abstract2']].apply(concat_title_abstract, axis=1).values)
41 | vectorizer = TfidfVectorizer() # tokenizer=normalize, stop_words='english'
42 | print('fit tfidf model')
43 | vectorizer = vectorizer.fit(documents)
44 |
45 |
46 | def cosine_sim(text1, text2):
47 | tfidf = vectorizer.transform([text1, text2])
48 | return ((tfidf * tfidf.T).A)[0, 1]
49 |
50 |
51 | # load doc2vec model
52 | model = Doc2Vec.load(os.path.join(cached_dir, 'doc2vec_model'))
53 | print('load doc2vec model')
54 |
55 |
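# Note build one feature vector per candidate pair: name n-gram Jaccard, identical bibliographic author id, publication-year gap,
# title+abstract word Jaccard, Doc2Vec cosine, TF-IDF cosine, venue Jaccard and affiliation Jaccard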
56 | def extract_pairwise_feature(pairwise_citation):
57 | task_id, (fullname, pid1, ao1, pid2, ao2,
58 | coauthor1, aid1, author_names1, aff_arr1, aff_id_arr1, paper_title1, abstract1, venue1, pub_year1,
59 | coauthor2, aid2, author_names2, aff_arr2, aff_id_arr2, paper_title2, abstract2, venue2, pub_year2,
60 | same_author, train1_test0_val2) = pairwise_citation
61 |
62 | try:
63 | if task_id % 10000 == 0:
64 | print(task_id * 100.0 / h)
65 |
66 | author_names1, author_names2 = author_names1.lower(), author_names2.lower()
67 |
68 | # if author_names1 != convert_unicode_to_ascii(author_names1):
69 | # print(author_names1, convert_unicode_to_ascii(author_names1))
70 |
71 | # name similarity
72 | name_similarity = jaccard_similarity(ngram_sequence(convert_unicode_to_ascii(author_names1)),
73 | ngram_sequence(convert_unicode_to_ascii(author_names2)))
74 |
75 | same_biblio_aid = 1 if aid1 == aid2 else 0
76 | pub_year_diff = abs(pub_year1 - pub_year2) if pub_year1 > 0 and pub_year2 > 0 else -1
77 |
78 | try:
79 | content1, content2 = (paper_title1 + ' ' + str(abstract1)).lower(), (paper_title2 + ' ' + str(abstract2)).lower()
80 | except Exception as e:
81 | print(e)
82 | content1, content2 = paper_title1, paper_title2
83 |
84 | word_list1 = extract_word_list(content1)
85 | word_list2 = extract_word_list(content2)
86 | paper_title_abstract_similarity = jaccard_similarity(
87 | word_list1,
88 | word_list2,
89 | remove_stop_word=True)
90 |
91 |         # doc2vec similarity
92 | content_cosin_sim = 0
93 | try:
94 | v1 = model.infer_vector(word_list1, steps=12, alpha=0.025)
95 | v2 = model.infer_vector(word_list2, steps=12, alpha=0.025)
96 | # Compute the Cosine distance between 1-D arrays.
97 | # distance cosine([1, 2],[3,4]) = 1 - (1*3+2*4)/(sqrt(1*1+2*2) * sqrt(3*3+4*4))
98 | content_cosin_sim = 1 - cosine(v1, v2)
99 | except Exception as e:
100 | print(e)
101 |
102 | # tfidf similarity
103 | tfidf_cosin_sim = 0
104 | try:
105 | tfidf_cosin_sim = cosine_sim(content1, content2)
106 | except Exception as e:
107 | print(e)
108 |
109 | venue_similarity = jaccard_similarity(extract_word_list(str(venue1).lower()),
110 | extract_word_list(str(venue2).lower()))
111 |
112 | aff_similarity = jaccard_similarity(extract_word_list(' '.join(str(aff_arr1).split('|')).lower()),
113 | extract_word_list(' '.join(str(aff_arr2).split('|')).lower()))
114 |
115 | feature_item = [fullname, pid1, ao1, pid2, ao2, same_author, train1_test0_val2,
116 | name_similarity, same_biblio_aid, pub_year_diff,
117 | paper_title_abstract_similarity,
118 | content_cosin_sim, tfidf_cosin_sim,
119 | venue_similarity,
120 | aff_similarity,
121 | content1,
122 | content2]
123 |
124 | return feature_item
125 | except Exception as e:
126 | print(e)
127 | return [fullname, pid1, ao1, pid2, ao2, same_author, train1_test0_val2,
128 | 0, 0, 0, 0, 0, 0, 0, 0, "", ""]
129 |
130 |
131 | task_pools = [(i, row) for i, row in df.iterrows()]
132 |
133 | with Pool(processes=14) as pool:
134 | features = pool.map(extract_pairwise_feature, task_pools)
135 |
136 | joblib.dump(features, 'tmp.pkl')
137 |
138 | pd.DataFrame(features,
139 | columns=['fullname', 'pid1', 'ao1', 'pid2', 'ao2',
140 | 'same_author', 'train1_test0_val2',
141 | 'name_similarity', 'same_biblio_aid', 'pub_year_diff',
142 | 'paper_title_abstract_similarity', 'content_cosin_similarity', 'tfidf_cosin_similarity',
143 | 'venue_similarity', 'aff_similarity', 'content1', 'content2']).to_csv(
144 | os.path.join(cached_dir, 'pairwise_and_dataset_feature_full.tsv'), sep='\t', index=False)
145 |
--------------------------------------------------------------------------------
/dataset/PubMed-Kim/Kim_Authority_ORCID_linkage_dataset.sql:
--------------------------------------------------------------------------------
1 | create table if not exists and_ds.AUT_NIH
2 | (
3 | Year String,
4 | PMID String,
5 | BylinePosition String,
6 | MEDLINE_Name String,
7 | NIH_ID String,
8 | NIH_Name String,
9 | Authority2009_ID String,
10 | Ethnea String,
11 | Genni String,
12 | AINI String,
13 | FINI String
14 | ) ENGINE = Log;
15 |
16 | create table if not exists and_ds.AUT_ORC
17 | (
18 | Year String,
19 | PMID String,
20 | BylinePosition String,
21 | MEDLINE_Name String,
22 | ORCID String,
23 | ORCID_Name String,
24 | Authority2009_ID String,
25 | Ethnea String,
26 | Genni String,
27 | AINI String,
28 | FINI String
29 | ) ENGINE = Log;
30 |
31 | create table if not exists and_ds.AUT_SCT_info
32 | (
33 | Year String,
34 | PMID String,
35 | BylinePosition String,
36 | MEDLINE_Name String,
37 | Authority2009_ID String,
38 | Ethnea String,
39 | Genni String,
40 | AINI String,
41 | FINI String
42 | ) ENGINE = Log;
43 |
44 | create table if not exists and_ds.AUT_SCT_pairs
45 | (
46 | PMID_1 String,
47 | Byline_Position_1 String,
48 | PMID_2 String,
49 | Byline_Position_2 String
50 | ) ENGINE = Log;
51 |
52 | -- 312952 AUT_NIH.txt
53 | -- 3076502 AUT_ORC.txt
54 | -- 4732531 AUT_SCT_info.txt
55 | -- 6214200 AUT_SCT_pairs.txt
56 |
57 | -- 312951
58 | -- 6214199
59 | -- 3076501
60 | -- 4732530
61 | select count()
62 | from and_ds.AUT_NIH
63 | union all
64 | select count()
65 | from and_ds.AUT_ORC
66 | union all
67 | select count()
68 | from and_ds.AUT_SCT_info
69 | union all
70 | select count()
71 | from and_ds.AUT_SCT_pairs;
72 |
73 | -- cat AUT_NIH.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_NIH FORMAT TSVWithNames'
74 | -- cat AUT_ORC.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_ORC FORMAT TSVWithNames'
75 | -- cat AUT_SCT_info.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_SCT_info FORMAT TSVWithNames'
76 | -- cat AUT_SCT_pairs.txt | dos2unix | clickhouse-client --password=root --input_format_allow_errors_ratio=0.01 --input_format_skip_unknown_fields=true --port=9001 --query='insert into and_ds.AUT_SCT_pairs FORMAT TSVWithNames'
77 |
78 | drop table if exists and_ds.AUT_NIH;
79 | drop table if exists and_ds.AUT_ORC;
80 | drop table if exists and_ds.AUT_SCT_info;
81 | drop table if exists and_ds.AUT_SCT_pairs;
82 |
83 | select *
84 | from (
85 | select *
86 | from (select concat(PMID_1, '_', Byline_Position_1) as pm_ao1, concat(PMID_2, '_', Byline_Position_2) as pm_ao2
87 | from and_ds.AUT_SCT_pairs) any
88 | inner join (select concat(PMID, '_', BylinePosition) as pm_ao1,
89 | MEDLINE_Name as MEDLINE_Name1,
90 | AINI as AINI1,
91 | FINI as FINI1
92 | from and_ds.AUT_SCT_info) using pm_ao1
93 | ) any
94 | inner join (select concat(PMID, '_', BylinePosition) as pm_ao2,
95 | MEDLINE_Name as MEDLINE_Name2,
96 | AINI as AINI2,
97 | FINI as FINI2
98 | from and_ds.AUT_SCT_info) using pm_ao2;
99 |
100 | -- 6214199 paired_authors
101 | -- 1680310 num_citations
102 | select count() as cnt, 'paired_authors' as name
103 | from and_ds.AUT_SCT_pairs
104 | union all
105 | select arrayUniq(arrayConcat(groupArray(PMID_1), groupArray(PMID_2))) as cnt, 'num_citations' as name
106 | from and_ds.AUT_SCT_pairs;
107 |
108 | -- 3076501 num_citations
109 | -- 268631 number_full_initial_based_blocks
110 | -- 245754 num_author_groups
111 | -- 197379 number_first_initial_based_blocks
112 | select count() as cnt, 'num_citations' as name
113 | from and_ds.AUT_ORC
114 | union all
115 | select count(distinct lowerUTF8(AINI)) as cnt, 'number_full_initial_based_blocks' as name
116 | from and_ds.AUT_ORC
117 | union all
118 | select count(distinct lowerUTF8(FINI)) as cnt, 'number_first_initial_based_blocks' as name
119 | from and_ds.AUT_ORC
120 | union all
121 | select count(distinct ORCID) as cnt, 'num_author_groups' as name
122 | from and_ds.AUT_ORC
123 | ;
124 |
125 | -- 312951 num_citations
126 | -- 34206 num_author_groups
127 | -- 37185 number_full_initial_based_blocks
128 | -- 29243 number_first_initial_based_blocks
129 | select count() as cnt, 'num_citations' as name
130 | from and_ds.AUT_NIH
131 | union all
132 | select count(distinct lowerUTF8(AINI)) as cnt, 'number_full_initial_based_blocks' as name
133 | from and_ds.AUT_NIH
134 | union all
135 | select count(distinct lowerUTF8(FINI)) as cnt, 'number_first_initial_based_blocks' as name
136 | from and_ds.AUT_NIH
137 | union all
138 | select count(distinct NIH_ID) as cnt, 'num_author_groups' as name
139 | from and_ds.AUT_NIH
140 | ;
141 |
142 | -- name variations
143 | -- 226588
144 | select count()
145 | from (
146 | select lowerUTF8(trimBoth(splitByString(',', MEDLINE_Name)[1])) as medline_lastname,
147 | lowerUTF8(trimBoth(splitByString('|', ORCID_Name)[1])) as orcid_lastname,
148 | MEDLINE_Name,
149 | ORCID_Name
150 | from and_ds.AUT_ORC
151 | where medline_lastname != orcid_lastname)
152 | union all
153 | select count()
154 | from and_ds.AUT_ORC
155 | ;
156 |
157 |
158 | select count()
159 | from (
160 | select lowerUTF8(trimBoth(splitByString(',', MEDLINE_Name)[1])) as medline_lastname,
161 | lowerUTF8(splitByChar('_', AINI)[1]) as block_lastname
162 | from and_ds.AUT_ORC
163 | where medline_lastname != block_lastname)
164 | ;
165 | select count()
166 | from and_ds.AUT_ORC;
167 |
168 | select arrayUniq(arrayConcat(groupArray(PMID1), groupArray(PMID2))) as cnt, 'num_citations' as name
169 | from and.GS;
--------------------------------------------------------------------------------
/src/eutilities/string_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import re
4 | import string
5 | import unicodedata
6 |
7 | import geograpy
8 | import jaro
9 | from nltk import ngrams
10 |
11 | os.environ['JAVAHOME'] = "/usr/local/jdk-11.0.1"
12 | from Levenshtein.StringMatcher import StringMatcher
13 | from nltk.tag import StanfordNERTagger
14 | from nltk.corpus import stopwords
15 |
16 | def extract_email(affi):
17 | match = re.search(r'[\w\.-]+@[\w\.-]+', affi)
18 | if match is not None:
19 | result = match.group(0)
20 |
21 | if result[-1] == '.':
22 | result = result[:len(result) - 1]
23 | return result
24 | return None
25 |
26 |
27 | def extract_inner_words(string):
28 | replaced = re.sub('[^a-z]', " ", string)
29 | splts = replaced.split(' ')
30 | return [s for s in splts if len(s) > 2]
31 |
32 |
33 | def extract_word_list(string):
34 | return re.findall(r'\w+', string)
35 |
36 |
37 | def extract_key_wods_list(key_words_str):
38 | key_words = []
39 | key_words_dict = json.loads(key_words_str)
40 |     if key_words_dict is None:
41 | return []
42 | for item in key_words_dict:
43 | if 'keyword' in item:
44 | keyword_ = item['keyword']
45 | keyword_ = extract_inner_words(keyword_)
46 | key_words += keyword_
47 | return key_words
48 |
49 |
50 | # there are 28,895 distinct MeSH headings
51 | def extract_mesh_headings(raw_str: str):
52 | s = json.loads(raw_str)
53 |     if s is None:
54 | return []
55 | desc_name_list = []
56 | for item in s:
57 | if 'descriptorName' in item:
58 | # TODO 'qualifierNameList'
59 | descriptorname_ = item['descriptorName']
60 | descriptorname_ = extract_inner_words(descriptorname_)
61 | desc_name_list += descriptorname_
62 | return desc_name_list
63 |
64 |
65 | def edit_distinct_diff_chars(str1, str2):
66 | str_matcher = StringMatcher()
67 | if len(str1) < len(str2):
68 | str1, str2 = str2, str1
69 | str_matcher.set_seqs(str1, str2)
70 | editops = str_matcher.get_editops()
71 | # print(editops)
72 | diff_chars = []
73 | for model, pos1, pos2 in editops:
74 | if model == 'delete':
75 | # print('delete: ', str1[pos1])
76 | diff_chars.append(str1[pos1])
77 | elif model == 'replace':
78 | # print('replace: ', str1[pos1])
79 | diff_chars.append(str1[pos1])
80 | elif model == 'insert':
81 | # print('insert: ', str2[pos2])
82 | diff_chars.append(str2[pos2])
83 | return diff_chars
84 |
85 |
86 | def jaro_winkler_similarity(s1, s2):
87 | if s1 is None or s2 is None:
88 | return 0.0
89 | return jaro.jaro_winkler_metric(s1, s2)
90 |
91 |
92 | # Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
93 | all_letters = string.ascii_letters + " -"
94 | all_letters = set([c for c in all_letters])
95 | n_letters = len(all_letters)
96 |
97 |
98 | def convert_unicode_to_ascii(s):
99 | s = s.lower()
100 | return ''.join(
101 | c for c in unicodedata.normalize('NFD', s)
102 | if unicodedata.category(c) != 'Mn'
103 | )
104 |
105 |
106 | def ngram_sequence(s, n=2):
107 | grams = ngrams(list(s), n)
108 | grams = [''.join(gram) for gram in grams]
109 | return grams
110 |
111 |
112 | en_stopwords_set = set(stopwords.words('english'))
113 |
114 |
115 | def intersection(a, b, remove_stop_word=False):
116 | if a is None or b is None:
117 | return 0
118 | if remove_stop_word:
119 | a = [n for n in a if n not in en_stopwords_set]
120 | b = [n for n in b if n not in en_stopwords_set]
121 | intersections = len(set(a).intersection(set(b)))
122 | return intersections
123 |
124 |
125 | def jaccard_similarity(a, b, remove_stop_word=False):
126 | if a is None or b is None:
127 | return 0.0
128 | if remove_stop_word:
129 | a = [n for n in a if n not in en_stopwords_set]
130 | b = [n for n in b if n not in en_stopwords_set]
131 | unions = len(set(a).union(set(b)))
132 | if unions == 0:
133 | return 0.0
134 | intersections = len(set(a).intersection(set(b)))
135 | return 1. * intersections / unions
136 |
137 |
138 | # 3 class: Location, Person, Organization
139 | # 4 class: Location, Person, Organization, Misc
140 | # 7 class: Location, Person, Organization, Money, Percent, Date, Time
141 | # english.all.3class.caseless.distsim.crf.ser.gz
142 | # english.conll.4class.caseless.distsim.crf.ser.gz
143 | # english.muc.7class.caseless.distsim.crf.ser.gz
144 | stanford_ner_base_path = '/home/zhangli/mydisk-2t/apps/stanford-ner-4.0.0/'
145 | st = StanfordNERTagger(
146 | model_filename=('%sclassifiers/english.all.3class.distsim.crf.ser.gz' % stanford_ner_base_path),
147 | path_to_jar=('%sstanford-ner.jar' % stanford_ner_base_path))
148 |
149 |
150 | def ner(s):
151 | if s is None or len(s) == 0:
152 | return [], []
153 | res = st.tag(s.split())
154 | print(res)
155 | l = len(res)
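    # group consecutive tokens that share the same NER tag into a single entity span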
156 | broken_point = [i + 1 for i in range(l - 1) if res[i][1] != res[i + 1][1]]
157 | start = [0] + broken_point
158 | end = broken_point + [l]
159 | locs, orgs = [], []
160 | for s, e in zip(start, end):
161 | if e <= s:
162 | continue
163 | entities_with_class = res[s:e]
164 | cls = entities_with_class[0][1]
165 | entity = ' '.join([n[0] for n in entities_with_class])
166 | if cls == 'ORGANIZATION':
167 | orgs.append(entity)
168 | elif cls == 'LOCATION':
169 | locs.append(entity)
170 | return locs, orgs
171 |
172 |
173 | cached_extracted_geo = dict()
174 |
175 |
176 | def extract_geo(s):
177 | if s is None or len(s) == 0:
178 | return [[], [], [], []]
179 | if s not in cached_extracted_geo:
180 | # places = geograpy.Extractor(text=s).find_geoEntities()
181 | places = geograpy.get_geoPlace_context(text=s)
182 | cached_extracted_geo[s] = [
183 | [n.lower() for n in places.countries],
184 | [n.lower() for n in places.regions],
185 | [n.lower() for n in places.cities],
186 | [n.lower() for n in places.other]
187 | ]
188 | # print(places)
189 | return cached_extracted_geo[s]
190 |
191 |
192 | if __name__ == '__main__':
193 | import time
194 |
195 | t1 = time.time()
196 | # s = "University of Minnesota, Minneapolis, Minnesota 55455, USA."
197 | s = "University of California, San Diego, La Jolla, California 92093, USA."
198 | for _ in range(10):
199 | chars = ner(s)
200 | t2 = time.time()
201 | print(t2 - t1)
202 | print(chars)
203 | print()
204 | for _ in range(10):
205 | chars = extract_geo(s)
206 | print(chars)
207 | t3 = time.time()
208 | print(t3 - t2)
209 |
--------------------------------------------------------------------------------
/src/feature/cluster/fast_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import joblib
4 | import numpy as np
5 | from mytookit.data_reader import DBReader
6 | from nltk.corpus import stopwords
7 | from tqdm import tqdm
8 |
9 | from eutilities.string_utils import jaccard_similarity, ngram_sequence, convert_unicode_to_ascii
10 | from myconfig import cached_dir
11 |
12 | en_stopwords_set = set(stopwords.words('english'))
13 |
14 | sql_block = r'''
15 | select block_fullname as block_name,
16 | arrayMap(x->x[1],
17 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos,
18 | arrayMap(x->x[2], tmp) as ground_truths,
19 | arrayMap(x->x[3], tmp) as mag_preds
20 | from (select block_fullname,
21 | author_group_orcid,
22 | -- Note has verified all mag_author_id is successfully matched
23 | toString(aid) as mag_author_id,
24 | concat(toString(pid), '_', toString(author_position)) as pid_ao
25 | from and_ds.our_and_dataset_block any
26 | left join (
27 | select pid, aid, author_position
28 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
29 | from mag.paper_author_affiliation) any
30 | inner join and_ds.our_and_dataset_block using pid, author_position
31 | ) using pid, author_position)
32 | group by block_name
33 | having xxHash32(block_name) %% 10=%d
34 | order by length(pid_aos) desc;
35 | ;'''
36 |
37 | sql_metadata = r'''
38 | select concat(toString(pid), '_', toString(author_position)) as pid_ao,
39 | block_fullname,
40 | author_group_orcid as orcid,
41 | -- -- Note has verified all mag_author_id is successfully matched
42 | lowerUTF8(author_name) as author_name,
43 | arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation,
44 | -- coauthors,
45 | arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue,
46 | pub_year,
47 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content
48 | from and_ds.our_and_dataset_block any
49 | left join (
50 | select pid, aid, author_position
51 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
52 | from mag.paper_author_affiliation) any
53 | inner join and_ds.our_and_dataset_block using pid, author_position
54 | ) using pid, author_position
55 | where xxHash32(block_fullname) %% 10=%d
56 | '''
57 |
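# Note five cheap pairwise features: name n-gram Jaccard, publication-year gap, title+abstract word Jaccard, venue word Jaccard and affiliation word Jaccard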
58 | num_features = 5
59 |
60 | for seg in range(0, 10, 1):
61 | sql = sql_metadata % seg
62 | print(sql)
63 | # Note prepare the paper metadata dict
64 | df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False)
65 | print(df_metadata.shape)
66 | print(df_metadata.head())
67 |
68 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values))
69 | md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values))
70 | md_author_name_dict = dict(zip(df_metadata['pid_ao'].values,
71 | df_metadata['author_name'].apply(
72 | lambda x: ngram_sequence(convert_unicode_to_ascii(x))).values))
73 | md_author_affiliation_dict = dict(
74 | zip(df_metadata['pid_ao'].values, df_metadata['author_affiliation'].apply(lambda x: x.split(' ')).values))
75 | # md_coauthors_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['coauthors'].values))
76 | md_venue_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['venue'].apply(lambda x: x.split(' ')).values))
77 | md_pub_year_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['pub_year'].values))
78 | # md_content_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].values))
79 | md_content_word_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(
80 | lambda x: set([w for w in x.split(' ') if not w in en_stopwords_set])).values))
81 | # md_doc2vec_emd_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(
82 | # lambda x: doc2vec_model.infer_vector(x.split(' '), steps=12, alpha=0.025)).values))
83 |
84 | del df_metadata
85 |
86 | all_block_feature = {}
87 | sql = sql_block % seg
88 | print(sql)
89 | df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False)
90 | print(df_block.shape)
91 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]):
92 | block_name, pid_aos, ground_truths, mag_preds = row
93 |
94 | # Note calculate the similarity between different metadata according to pid_ao
95 | num_instances = len(pid_aos)
96 | # if ij % 10 == 0:
97 | # print(num_instances)
98 |
99 | pairwise_feature_matrix = np.zeros(shape=(num_instances, num_instances, num_features), dtype=np.float16)
100 | for i, pid_ao_i in enumerate(pid_aos):
101 | for j, pid_ao_j in enumerate(pid_aos):
102 | author_names1, author_names2 = md_author_name_dict[pid_ao_i], md_author_name_dict[pid_ao_j]
103 | aff_arr1, aff_arr2 = md_author_affiliation_dict[pid_ao_i], md_author_affiliation_dict[pid_ao_j]
104 |
105 | orcid1, orcid2 = md_orcid_dict[pid_ao_i], md_orcid_dict[pid_ao_j]
106 | content_word1, content_word2 = md_content_word_dict[pid_ao_i], md_content_word_dict[pid_ao_j]
107 |
108 | venue1, venue2 = md_venue_dict[pid_ao_i], md_venue_dict[pid_ao_j]
109 | pub_year1, pub_year2 = md_pub_year_dict[pid_ao_i], md_pub_year_dict[pid_ao_j]
110 |
111 | # if author_names1 != convert_unicode_to_ascii(author_names1):
112 | # print(author_names1, convert_unicode_to_ascii(author_names1))
113 |
114 | name_similarity = jaccard_similarity(author_names1, author_names2)
115 |
116 | pub_year_diff = 1.0 * (abs(pub_year1 - pub_year2) if pub_year1 > 0 and pub_year2 > 0 else -1)
117 |
118 | paper_title_abstract_similarity = jaccard_similarity(content_word1, content_word2, remove_stop_word=False)
119 |
120 | venue_similarity = jaccard_similarity(venue1, venue2)
121 |
122 | aff_similarity = jaccard_similarity(aff_arr1, aff_arr2)
123 |
124 | pairwise_feature_matrix[i][j] = [name_similarity,
125 | pub_year_diff,
126 | paper_title_abstract_similarity,
127 | venue_similarity,
128 | aff_similarity]
129 | all_block_feature[block_name] = pairwise_feature_matrix
130 |
131 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir, 'cluster_feature/five-fast-features-%d.pkl' % seg))
132 |
--------------------------------------------------------------------------------
/src/statistics/last_name_variation_considering_transliterating.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | from unidecode import unidecode
3 |
4 | from mytookit.data_reader import DBReader
5 |
6 | which_dataset = ['pairwise', 'block'][1]
7 | applying_transliterating = False
8 |
9 | sources, methods = ['S2', 'MAG'], ['derek73', 'joshfraser-NameParser']
10 | # sources, methods = ['S2'], ['derek73']
11 | # sources, methods = ['S2'], ['joshfraser-NameParser']
12 | # sources, methods = ['MAG'], ['derek73']
13 | # sources, methods = ['MAG'], ['joshfraser-NameParser']
14 |
15 | field_map = {'derek73': 'derek73_top_lastname', 'joshfraser-NameParser': 'joshfraser_top_lastname'}
16 |
17 |
18 | def clean_name(s):
19 | return unidecode(s).lower()
20 |
21 |
22 | sql_template = '''
23 | select matched_biblio_author_name, biblio_author_split_lastname, orcid_lastname, top_lastname
24 | from (
25 | select pid,
26 | orcid,
27 | matched_biblio_author_name,
28 | lowerUTF8(matched_biblio_author_split_names[3]) as biblio_author_split_lastname
29 | from and_ds.orcid_mag_s2_author_name_split_by_various_algorithms_with_author_position
30 | where source = 'SOURCE' and method = 'METHOD'
31 | ) any
32 | join (select orcid,
33 | orcid_lastname,
34 | FIELD as top_lastname
35 | from and_ds.orcid_mag_s2_actual_author_name
36 | where source = 'SOURCE'
37 | ) using orcid;
38 | '''
39 |
40 |
41 | for source in sources:
42 | for method in methods:
43 | sql = sql_template.replace('SOURCE', source).replace('METHOD', method).replace('FIELD', field_map[method])
44 | print(sql)
45 | df = DBReader.tcp_model_cached_read("cached/XXXXX", sql, cached=False)
46 | print(df.shape)
47 | num_instances = df.shape[0]
48 | df['matched_biblio_author_name'] = df['matched_biblio_author_name'].apply(clean_name)
49 | df['biblio_author_split_lastname'] = df['biblio_author_split_lastname'].apply(clean_name)
50 | df['orcid_lastname'] = df['orcid_lastname'].apply(clean_name)
51 | df['top_lastname'] = df['top_lastname'].apply(clean_name)
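        # count how often the bibliographic name disagrees with the ORCID last name and with the most frequent (top) last name,
        # both as a suffix ('ends with') test and as an exact-match test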
52 | not_endwith_orcid_lastname, not_endwith_top_lastname, not_identicalwith_orcid_lastname, not_identicalwith_top_lastname = 0, 0, 0, 0
53 | for i, (biblio_author_name, biblio_author_lastname, orcid_lastname, top_lastname) in df.iterrows():
54 | if i % 100000 == 0:
55 | print(i * 1.0 / num_instances)
56 | # end with orcid lastname
57 | if not biblio_author_name.endswith(orcid_lastname):
58 | not_endwith_orcid_lastname += 1
59 | # end with top lastname
60 | if not biblio_author_name.endswith(top_lastname):
61 | not_endwith_top_lastname += 1
62 | # identical with orcid lastname
63 | if biblio_author_lastname != orcid_lastname:
64 | not_identicalwith_orcid_lastname += 1
65 | # identical with top lastname
66 | if biblio_author_lastname != top_lastname:
67 | not_identicalwith_top_lastname += 1
68 | print(source, method, not_endwith_orcid_lastname, not_endwith_top_lastname, not_identicalwith_orcid_lastname,
69 | not_identicalwith_top_lastname)
70 | print(source, method, not_endwith_orcid_lastname * 1.0 / num_instances,
71 | not_endwith_top_lastname * 1.0 / num_instances, not_identicalwith_orcid_lastname * 1.0 / num_instances,
72 | not_identicalwith_top_lastname * 1.0 / num_instances)
73 |
74 | # PubMed
75 | df = DBReader.tcp_model_cached_read("cached/XXXXX", """
76 | select matched_biblio_author_lastname, orcid_lastname, top_lastname
77 | from and_ds.orcid_pubmed_author_linkage_with_author_position_with_topname;
78 | """, cached=False)
79 | print(df.shape)
80 | num_instances = df.shape[0]
81 | df['matched_biblio_author_lastname'] = df['matched_biblio_author_lastname'].apply(clean_name)
82 | df['orcid_lastname'] = df['orcid_lastname'].apply(clean_name)
83 | df['top_lastname'] = df['top_lastname'].apply(clean_name)
84 | not_identicalwith_orcid_lastname, not_identicalwith_top_lastname = 0, 0
85 |
86 | for i, (biblio_author_lastname, orcid_lastname, top_lastname) in df.iterrows():
87 | if i % 100000 == 0:
88 | print(i * 1.0 / num_instances)
89 | if biblio_author_lastname != orcid_lastname:
90 | not_identicalwith_orcid_lastname += 1
91 | # identical with top lastname
92 | if biblio_author_lastname != top_lastname:
93 | not_identicalwith_top_lastname += 1
94 | print('PubMed', '-', not_identicalwith_orcid_lastname, not_identicalwith_top_lastname)
95 | print(not_identicalwith_orcid_lastname * 1.0 / num_instances, not_identicalwith_top_lastname * 1.0 / num_instances)
96 |
97 | # Our dataset
98 | df = DBReader.tcp_model_cached_read("cached/XXXXX", """
99 | select tupleElement(item, 2) as author_biblio_name,
100 | tupleElement(item, 3) as orcid_last_name,
101 | toInt64(tupleElement(arrayJoin(paper_orcid_lastname_bib_name) as item, 1) as pid) in
102 | (select arrayJoin(flatten(groupArray([pid1, pid2])))
103 | from and_ds.our_and_dataset_pairwise_gold_standard) as for_pairwise_dataset
104 | from (
105 | select arrayJoin(
106 | -- full_name_blocks: (num_work, orcid, same_orcidauthor_paper_positions, lastname_variations, same_orcidauthor_paper_repres)
107 | -- same_orcidauthor_paper_repres: [(pid, author_position, orcid, orcid_names, matched_biblio_author, ethnic_seer, ethnea, genni, sex_mac, ssn_gender, pub_year, fos_arr), ..., ]
108 | arrayZip(
109 | arrayMap(x->arrayMap(y->y[1], x.3), full_name_blocks) as tmp_pids,
110 | arrayMap(x->arrayMap(y->
111 | y.4, x.5), full_name_blocks) as tmp_orcid_names,
112 | arrayMap(x->arrayMap(y->
113 | y.5, x.5), full_name_blocks) as tmp_bib_names
114 | )) as paper_orcid_names,
115 |
116 | tupleElement(paper_orcid_names, 1) as pids,
117 | arrayMap(x->lowerUTF8(x[2]), tupleElement(paper_orcid_names, 2)) as orcid_last_names,
118 | tupleElement(paper_orcid_names, 3) as author_biblio_names,
119 | length(pids) = length(orcid_last_names) as is_valid,
120 | arrayZip(pids, author_biblio_names, orcid_last_names) as paper_orcid_lastname_bib_name
121 | from and_ds.orcid_mag_matched_fullname_block)
122 | ;
123 | """, cached=False)
124 |
125 | if which_dataset == 'pairwise':
126 | df = df[df['for_pairwise_dataset'] == 1]
127 |
128 | del df['for_pairwise_dataset']
129 | print(df.shape)
130 |
131 | if applying_transliterating:
132 | df['author_biblio_name'] = df['author_biblio_name'].apply(clean_name)
133 | df['orcid_last_name'] = df['orcid_last_name'].apply(clean_name)
134 |
135 | not_endwith_orcid_lastname = 0
136 | for i, (author_biblio_name, orcid_last_name) in tqdm(df.iterrows(), total=df.shape[0]):
137 | if not author_biblio_name.endswith(orcid_last_name):
138 | not_endwith_orcid_lastname += 1
139 |
140 | print(which_dataset, '%s transliterating' % ('with' if applying_transliterating else 'without'), 'Our dataset', 'Endwith',
141 | not_endwith_orcid_lastname)
142 | num_instances = df.shape[0]
143 | print(not_endwith_orcid_lastname * 1.0 / num_instances)
144 | # Note result
145 | # pairwise without transliterating Our dataset Endwith 181348; 0.09719875824336029
146 | # pairwise with transliterating Our dataset Endwith 122208; 0.06550094761124785
147 |
148 | # block without transliterating Our dataset Endwith 722965; 0.09626437527720622
149 | # block with transliterating Our dataset Endwith 485079; 0.06458933267183324
150 |
151 |
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/src/feature/simple_matching_network_trainer_evaluator.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from matplotlib import pyplot as plt
9 | from mytookit.data_reader import DBReader
10 | from torch.utils.data.dataset import Dataset
11 | from torchtext import vocab
12 |
13 | from eutilities import train_utils
14 | from eutilities.preprocessor import down_sample
15 | from model.nn import MatchGRU
16 | from myconfig import cached_dir, glove840b300d_path
17 |
18 | underlying_dataset = 'pairwise-gold-standard'
19 | print(underlying_dataset)
20 | glove_vocab_size = ['6B', '840B'][1]
21 |
22 | need_balance_dataset = True
23 |
24 | # # Note we use the 840B model as the word embedding
25 | glove = vocab.GloVe(name=glove_vocab_size, dim=300, cache=glove840b300d_path)
26 | pad_idx = 0
27 | batch_size = 128
28 | epochs = 30
29 | lr = 5e-5
30 | max_sql_len = 300
31 | print(max_sql_len)
32 |
33 | # device = torch.device('cpu')
34 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
35 | print('use device: ', device)
36 |
37 |
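# map text to a fixed-length sequence of GloVe vocabulary indices: out-of-vocabulary words are dropped,
# long texts are truncated and short ones are padded with pad_idx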
38 | def word_token(txt):
39 | words = txt.lower().split()
40 | tokens = [glove.stoi[word] for word in words if word in glove.stoi]
41 | tokens = tokens[:max_sql_len] if len(tokens) >= max_sql_len else tokens + [pad_idx] * (
42 | max_sql_len - len(tokens))
43 | return tokens
44 |
45 |
46 | class ANDDataset(Dataset):
47 | def __init__(self, df):
48 | # self.num_hand_craft_feature_set = ['name_similarity', 'same_biblio_aid', 'pub_year_diff', 'venue_similarity',
49 | # 'aff_similarity']
50 | # df[self.num_hand_craft_feature_set] = MinMaxScaler().fit_transform(df[self.num_hand_craft_feature_set])
51 | # df[self.num_hand_craft_feature_set] = StandardScaler().fit_transform(df[self.num_hand_craft_feature_set])
52 | self.df = df
53 |
54 | def __getitem__(self, index):
55 | data_item = self.df.iloc[index]
56 | pid1 = data_item.pid1
57 | ao1 = data_item.ao1
58 | pid2 = data_item.pid2
59 | ao2 = data_item.ao2
60 | same_author = data_item.same_author
61 | # train1_test0_val2 = data_item.train1_test0_val2
62 | content1 = data_item.content1
63 | content2 = data_item.content2
64 | # hand-craft features
65 | # HF = torch.tensor([data_item[n] for n in self.num_hand_craft_feature_set],
66 | # dtype=torch.float)
67 | XL, XR = torch.tensor(word_token(content1)), torch.tensor(word_token(content2))
68 | Y = torch.tensor([0], dtype=torch.float) if same_author == 0 else torch.tensor([1], dtype=torch.float)
69 | MT = [int(pid1), int(ao1), int(pid2), int(ao2), int(same_author)]
70 | return MT, XL, XR, Y
71 |
72 | def __len__(self):
73 | return len(self.df)
74 |
75 |
76 | df = DBReader.tcp_model_cached_read("XXXX",
77 | sql="""select * from and_ds.matching_network_train_corpus""",
78 | cached=False)
79 | print(df.shape)
80 | # df = df.dropna(0)
81 |
82 | if underlying_dataset == 'pairwise-gold-standard':
83 | data_split_field = 'train1_test0_val2'
84 | print(set(df[data_split_field].values))
85 | df_train_set = df[df[data_split_field].astype(int) == 1]
86 | # df_train_set = df_train_set.sample(frac=0.8, random_state=42)
87 | df_val_set = df[df[data_split_field].astype(int) == 2]
88 | # Note: because we need to assign a similarity score to every instance, the infer_set used here is the whole dataset
89 | df_infer_set = df
90 | elif underlying_dataset == 'block-gold-standard':
91 | pass
92 |
93 | # Note for the training dataset, try to balance the dataset
94 | if need_balance_dataset:
95 | print('pos_samples_num: ', df_train_set[df_train_set['same_author'] == 1].shape[0])
96 | print('neg_samples_num: ', df_train_set[df_train_set['same_author'] == 0].shape[0])
97 | df_train_set = down_sample(df_train_set, percent=4)
98 | print('after balancing dataset shape: ', df_train_set.shape)
99 | print('pos_samples_num: ', df_train_set[df_train_set['same_author'] == 1].shape[0])
100 | print('neg_samples_num: ', df_train_set[df_train_set['same_author'] == 0].shape[0])
101 |
102 | df_train_set.reset_index(inplace=True, drop=True)
103 | df_val_set.reset_index(inplace=True, drop=True)
104 | df_infer_set.reset_index(inplace=True, drop=True)
105 |
106 | print('df_train shape:', df_train_set.shape, 'df_val shape:', df_val_set.shape, 'df_test shape:', df_infer_set.shape)
107 | train_set = ANDDataset(df_train_set)
108 | val_set = ANDDataset(df_val_set)
109 | infer_set = ANDDataset(df_infer_set)
110 |
111 | # Instantiate the datasets and build data loaders for the train, validation, and inference splits.
112 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True,
113 | num_workers=8) # collate_fn=pad_collate
114 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False,
115 | num_workers=8) # collate_fn=pad_collate
116 | infer_loader = torch.utils.data.DataLoader(dataset=infer_set, batch_size=batch_size, shuffle=False,
117 | num_workers=8) # collate_fn=pad_collate
118 |
119 | model = MatchGRU(glove, hidden_dim=64, num_layers=2,
120 | # num_hand_craft_feature=len(train_set.num_hand_craft_feature_set),
121 | bidirectional=True, output_dim=2).to(device)
122 | print(model)
123 |
124 | # pos_weight (Tensor, optional): a weight of positive examples. Must be a vector with length equal to the number of classes.
125 | pos_weight = len(df_train_set[df_train_set['same_author'] == 0]) * 1.0 / len(df_train_set[df_train_set['same_author'] == 1])
126 |
127 | # criterion = nn.MSELoss()
128 | criterion = nn.BCEWithLogitsLoss()
129 | # criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))
130 |
131 | parameters = model.parameters()
132 | optimizer = optim.Adam(parameters, lr=lr)
133 |
134 | losst, lossv = [], []
135 | for epoch in range(1, epochs + 1):
136 | train_utils.train(model, train_loader, criterion, optimizer, epoch, epochs, losst)
137 | train_utils.validate(model, val_loader, criterion, lossv)
138 | if lossv[-1] == min(lossv): # Current best model, push to disk
139 | torch.save({
140 | 'epoch': epoch,
141 | 'model_state_dict': model.state_dict(),
142 | 'optimizer_state_dict': optimizer.state_dict(),
143 | 'losst': losst[-1],
144 | 'lossv': lossv[-1]
145 | }, os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset)))
146 |
147 | plt.figure(figsize=(5, 3))
148 | plt.plot(np.arange(1, len(losst) + 1), losst, label="training")
149 | plt.plot(np.arange(1, len(lossv) + 1), lossv, label="validation")
150 | plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
151 | plt.grid()
152 | plt.title('loss vs epoch')
153 | plt.savefig(os.path.join(cached_dir, 'match-network-training-loss.png'))  # save before show(), since show() clears the current figure
154 | plt.show()
155 |
156 | checkpoint = torch.load(os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset)))
157 | model.load_state_dict(checkpoint['model_state_dict'])
158 | optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
159 | print('Epoch:', checkpoint['epoch'])
160 | print('losst:', checkpoint['losst'])
161 | print('lossv:', checkpoint['lossv'])
162 | model.eval()
163 |
164 | # The following code infers similarity scores for the pairwise dataset; the simple text matching neural network acts as a content-based feature generator.
165 | # In doing so, we find that passing the paired input to the network in different orders (LEFT-RIGHT or RIGHT-LEFT) may yield different results,
166 | # so we simply score the same paired input twice with the two orders and use the averaged score as the final score.
167 |
168 | # Note inferring LEFT_RIGHT input
169 | d1 = dict()
170 | metadata, true_label_numpy, pred_label_numpy, pred_prob = train_utils.validate(model, infer_loader, criterion, [],
171 | switch_input=False)
172 | print(metadata.shape)
173 | assert metadata.shape[1] == len(true_label_numpy) == len(pred_label_numpy)
174 |
175 | same_author_metadata = metadata[4]
176 | for i, n in enumerate(true_label_numpy):
177 | k = '-'.join(list(map(lambda x: str(x), [metadata[0][i], metadata[1][i], metadata[2][i], metadata[3][i]])))
178 | m = same_author_metadata[i]
179 | assert n == m
180 | prob = pred_prob[i]
181 | print(n, pred_label_numpy[i], prob)
182 | d1[k] = str(prob)
183 |
184 | # Note inferring RIGHT_LEFT input
185 | d2 = dict()
186 | metadata, true_label_numpy, pred_label_numpy, pred_prob = train_utils.validate(model, infer_loader, criterion, [],
187 | switch_input=True)
188 | print(metadata.shape)
189 | assert metadata.shape[1] == len(true_label_numpy) == len(pred_label_numpy)
190 |
191 | same_author_metadata = metadata[4]
192 | for i, n in enumerate(true_label_numpy):
193 | k = '-'.join(list(map(lambda x: str(x), [metadata[0][i], metadata[1][i], metadata[2][i], metadata[3][i]])))
194 | m = same_author_metadata[i]
195 | assert n == m
196 | prob = pred_prob[i]
197 | print(n, pred_label_numpy[i], prob)
198 | d2[k] = str(prob)
199 |
200 | d1_keys, d2_keys = set(d1.keys()), set(d2.keys())
201 | print('number of exclusive elements: %d; %d' % (
202 | len(d1_keys.difference(d1_keys.intersection(d2_keys))), len(d2_keys.difference(d1_keys.intersection(d2_keys)))))
203 |
204 | d = {}
205 | for k in d1_keys:
206 | d[k] = [d1[k], d2[k]]
207 |
208 | with open(os.path.join(cached_dir, 'matching-score-glove%s-%s.json' % (glove_vocab_size, underlying_dataset)), 'w') as fw:
209 | fw.write(json.dumps(d) + '\n')
210 |
--------------------------------------------------------------------------------
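The script above writes, for every citation pair, both directional matching probabilities under a 'pid1-ao1-pid2-ao2' key. A minimal sketch of that handoff (with made-up ids and scores), mirroring how the downstream classification_metrics.py averages the two directions:

    import json

    # Hypothetical record in the same shape as the file written above:
    # key = 'pid1-ao1-pid2-ao2', value = [LEFT-RIGHT prob, RIGHT-LEFT prob] stored as strings.
    record = {'101-2-202-1': ['0.91', '0.87']}
    blob = json.dumps(record)

    # Downstream consumer: parse the record and average the two directional scores.
    for key, (score_lr, score_rl) in json.loads(blob).items():
        final_score = (float(score_lr) + float(score_rl)) / 2
        print(key, final_score)  # averaged score, ~0.89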
/src/comparison/pairwise/classification_metrics.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import warnings
4 | from random import random
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import seaborn as sb
9 |
10 | custom_params = {"axes.spines.right": False, "axes.spines.top": False}
11 | sb.set_theme(style="ticks", rc=custom_params)
12 |
13 | from matplotlib import pyplot as plt
14 | from sklearn.utils import shuffle
15 | from eutilities.customized_print import pprint
16 | from eutilities.preprocessor import drop_missing_items
17 | from eutilities.metric import calc_metrics, metric_names
18 | from model.available_model import ModelName
19 | from model.classification import use_classifier
20 | from myconfig import cached_dir, latex_doc_base_dir
21 | from mytookit.data_reader import DBReader
22 |
23 | warnings.filterwarnings('ignore')
24 |
25 | glove_vocab_size = '840B'
26 | underlying_dataset = ['pairwise-gold-standard', 'block-gold-standard'][0]
27 | cluster_uniq_author_gt_1 = False
28 | # cluster_uniq_author_gt_1 = True
29 |
30 | print(underlying_dataset)
31 |
32 | feature_file = os.path.join(cached_dir, 'pairwise_and_dataset_feature_full.tsv')
33 | df = pd.read_csv(feature_file, sep='\t')
34 | column_names = df.columns.values.tolist()
35 | print('column_names: ', column_names)
36 | del df['content1'], df['content2']
37 |
38 | # if we use the pairwise AND model to disambiguate the trimmed block-based dataset, the training data should only contain fullnames from "large" blocks (those with more than one unique author)
39 | if cluster_uniq_author_gt_1:
40 | num_instances = len(df)
41 | block_uniq_author_gt_1 = set(DBReader.tcp_model_cached_read(cached_file_path='xxxxx',
42 | sql='''select block_fullname from and_ds.our_and_dataset_block where num_unique_author_inblock = 1;''',
43 | cached=False)['block_fullname'].values)
44 | df = df[df['fullname'].apply(lambda x: x not in block_uniq_author_gt_1) == 1]
45 | num_instances1 = len(df)
46 | print('removed %d instances which are in small blocks' % (num_instances - num_instances1))
47 |
48 | print('dataset size before deduplication', df.shape)
49 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0])
50 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0])
51 | df.drop_duplicates(keep='first', inplace=True)
52 | print('dataset size after deduplication', df.shape)
53 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0])
54 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0])
55 |
56 | mode_names = ModelName.available_modes()
57 | print('available_modes: ', mode_names)
58 |
59 | matching_score_dict = json.loads(open(os.path.join(cached_dir,
60 | 'matching-score-glove%s-%s.json' % (glove_vocab_size, underlying_dataset))
61 | ).readline())
62 | print(len(matching_score_dict))
63 |
64 |
65 | def get_score(row):
66 | try:
67 | k = '-'.join(map(lambda x: str(x), row.values.astype(int)))
68 | if k in matching_score_dict:
69 | # print('hit')
70 | v_left_right, v_right_left = matching_score_dict[k]
71 | v_left_right, v_right_left = float(v_left_right), float(v_right_left)
72 | # v = v if v > 0.5 else 0
73 | v = (v_left_right + v_right_left) / 2
74 | return v
75 | else:
76 | # print('nan')
77 | return np.nan
78 | except Exception as e:
79 | print('error: ', e)
80 | return np.nan
81 |
82 |
83 | df['random'] = df['pid1'].apply(lambda x: random())
84 |
85 | df['match_score'] = df[['pid1', 'ao1', 'pid2', 'ao2']].apply(
86 | lambda row: get_score(row), axis=1)
87 |
88 | print(df.shape, df[['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity',
89 | 'tfidf_cosin_similarity', 'content_cosin_similarity', 'match_score']].mean())
90 |
91 | feature_names_groups = [
92 | ['rand', ['random']],
93 | ['magaid', ['same_biblio_aid']],
94 | ['match_score', ['match_score']],
95 | ['name', ['name_similarity']],
96 | ['bf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity']],
97 | ['bf-cfjaccard',
98 | ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity']],
99 | ['bf-cftfidf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'tfidf_cosin_similarity']],
100 | ['bf-cfdoc2vec', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'content_cosin_similarity']],
101 | ['bf-cfnn', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'match_score']]
102 | ]
103 |
104 | formal_feature_name_dict = {'same_biblio_aid': 'MAG Author ID', 'name_similarity': 'Name Similarity',
105 | 'pub_year_diff': 'Publication Year Gap',
106 | 'venue_similarity': 'Venue Similarity', 'aff_similarity': 'Affiliation Similarity',
107 | 'paper_title_abstract_similarity': r'Content Similarity $cf_{jaccard}$',
108 | 'tfidf_cosin_similarity': r'Content Similarity $cf_{tfidf}$',
109 | 'content_cosin_similarity': r'Content Similarity $cf_{doc2vec}$',
110 | 'match_score': r'Content Similarity $cf_{nn}$'}
111 |
112 | if __name__ == '__main__':
113 | # df.to_csv('tmp.tsv', sep='\t', index=False)
114 | df = shuffle(df)
115 | print(df.head())
116 |
117 | print('original shape: ', df.shape)
118 | df = drop_missing_items(df)
119 | print('after dropping none shape: ', df.shape)
120 |
121 | print('pos_samples_num: ', df[df['same_author'] == 1].shape[0])
122 | print('neg_samples_num: ', df[df['same_author'] == 0].shape[0])
123 | # df = down_sample(df)
124 | # print('after balancing dataset shape: ', df.shape)
125 | # print('pos_samples_num: ', df[df['same_author'] == 1].shape[0])
126 | # print('neg_samples_num: ', df[df['same_author'] == 0].shape[0])
127 |
128 | for feature_group_name, feature_names in feature_names_groups:
129 | for idx, model_switch in enumerate(mode_names):
130 | df_copy = df.copy(deep=True)
131 | print('-' * 160)
132 | print(str(model_switch) + '\tused features:\n', '\t'.join(feature_names))
133 | Y = np.array(df_copy['same_author'].astype('int'))
134 | X = df_copy[feature_names]
135 | # X = scale(X)  # TODO scaling will improve the performance slightly
136 | X = np.array(X)
137 |
138 | avg_metrics = []
139 |
140 | # Note we do not use cross-validation because the test set is very large
141 | train_test_index = df_copy['train1_test0_val2'].astype('int')
142 | indx_split = [
143 | ([i for i, n in enumerate(train_test_index) if n == 1],
144 | [i for i, n in enumerate(train_test_index) if n == 0])
145 | ]
146 |
147 | for round_idx, (train_index, test_index) in enumerate(indx_split):
148 | train_X, train_y = X[train_index], Y[train_index]
149 | test_X, test_y = X[test_index], Y[test_index]
150 |
151 | if len(feature_names) == 1:
152 | # Note if there is only one feature, no classifier is needed; 0.5 is the cut-off value
153 | pred_y, model = test_X, None
154 | else:
155 | # Note if there are multiple features, use a classifier
156 | pred_y, model = use_classifier(train_X, train_y, test_X, model_switch=model_switch)
157 |
158 | # pred_y, model = use_regression(train_X, train_y, test_X, model_switch=model_switch)
159 | # save the model
160 | # file_name = 'cached/lagos-and-rf-model.pkl'
161 | # pickle.dump(model, open(file_name, 'wb'))
162 | importances = model.feature_importances_ if model is not None else [1.0]  # model is None when a single feature is used directly as the prediction
163 | pprint(list(zip(feature_names, importances)), sep='\t')
164 |
165 | if round_idx == 0 and model is not None:
166 | # Note save the model
167 | # joblib.dump(model, os.path.join(cached_dir,
168 | # 'pairwise_and_models/rf-model-with-feature-%s-trained-on-lagos-and-%s-%s-based-dataset.pkl' % (
169 | # feature_group_name, underlying_dataset,
170 | # 'trimmed' if cluster_uniq_author_gt_1 else 'original')))
171 |
172 | std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
173 | plt.figure(figsize=(6, 4), dpi=300)
174 | plt.grid(linestyle='dashed', linewidth=1, axis='y')
175 |
176 | plt.errorbar([formal_feature_name_dict[n] for n in feature_names], importances, yerr=std,
177 | fmt='D',
178 | # mfc='#C9A66B',
179 | # mec='#662E1C',
180 | ms=3,
181 | mew=3,
182 | ecolor='#AF4425',
183 | lw=3,
184 | ls=':',
185 | color='#AF4425',
186 | capsize=6)
187 | plt.ylabel('Feature Contribution', loc='center') # 'top'
188 | plt.xticks(fontsize=8, rotation=10, ha='center')
189 | plt.tight_layout()
190 | if not cluster_uniq_author_gt_1 and feature_group_name == 'bf-cfnn':
191 | plt.savefig(os.path.join(cached_dir, 'feature-contributions.png'), dpi=600)
192 | plt.savefig(os.path.join(latex_doc_base_dir, 'figs/feature-contributions.png'), dpi=600)
193 | plt.show()
194 |
195 | df_test = pd.DataFrame(df_copy.values[test_index], columns=df_copy.columns.values.tolist())
196 | df_test[feature_names] = test_X
197 | df_test['test_y'] = test_y
198 | df_test['pred_y'] = pred_y
199 | df_test.to_csv(feature_group_name + '_test_instance_predictions.tsv', sep='\t', index=False)
200 | metric_dict = calc_metrics(test_y, pred_y)
201 | metric_tuple = [(m, metric_dict[m]) for m in metric_names]
202 | # pprint(metric_tuple, pctg=True, sep='\t')
203 | avg_metrics.append(metric_dict)
204 |
205 | avg_metric_vals = [np.average([item[m] for item in avg_metrics]) for m in metric_names]
206 | print(metric_names)
207 | pprint(list(zip(metric_names, avg_metric_vals)), pctg=True, sep='\t')
208 |
--------------------------------------------------------------------------------
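The plotting block above derives error bars for the random-forest feature importances from the spread across individual trees. A minimal, self-contained sketch of that computation on synthetic toy data (not the paper's dataset):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    # Synthetic toy data: 200 samples, 4 features, binary labels.
    rng = np.random.RandomState(42)
    X = rng.rand(200, 4)
    y = (X[:, 0] > 0.5).astype(int)

    rf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)

    importances = rf.feature_importances_  # mean decrease in impurity, averaged over trees
    std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)  # spread across trees
    for name, imp, s in zip(['f0', 'f1', 'f2', 'f3'], importances, std):
        print(name, round(imp, 3), '+/-', round(s, 3))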
/src/feature/cluster/network_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../../')
5 |
6 | from myconfig import glove840b300d_path, cached_dir, device
7 | import joblib
8 | import numpy as np
9 | import torch
10 | from torchtext import vocab
11 | from tqdm import tqdm
12 | from model.nn import MatchGRU
13 | from mytookit.data_reader import DBReader
14 | from nltk.corpus import stopwords
15 | from torch.utils.data.dataset import Dataset
16 |
17 | en_stopwords_set = set(stopwords.words('english'))
18 |
19 | glove_vocab_size = ['6B', '840B'][1]
20 | underlying_dataset = ['pairwise-gold-standard', 'block-gold-standard'][0]
21 |
22 | sql_block = r'''
23 | select block_fullname as block_name,
24 | arrayMap(x->x[1],
25 | arraySort(x->x[1], groupArray([pid_ao, author_group_orcid, toString(mag_author_id)])) as tmp) as pid_aos,
26 | arrayMap(x->x[2], tmp) as ground_truths,
27 | arrayMap(x->x[3], tmp) as mag_preds
28 | from (select block_fullname,
29 | author_group_orcid,
30 | -- Note has verified all mag_author_id is successfully matched
31 | toString(aid) as mag_author_id,
32 | concat(toString(pid), '_', toString(author_position)) as pid_ao
33 | from and_ds.our_and_dataset_block any
34 | left join (
35 | select pid, aid, author_position
36 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
37 | from mag.paper_author_affiliation) any
38 | inner join and_ds.our_and_dataset_block using pid, author_position
39 | ) using pid, author_position)
40 | group by block_name
41 | having xxHash32(block_name) %% 10=%d
42 | order by length(pid_aos) desc
43 | ;'''
44 |
45 | sql_metadata = r'''
46 | select concat(toString(pid), '_', toString(author_position)) as pid_ao,
47 | block_fullname,
48 | author_group_orcid as orcid,
49 | -- -- Note has verified all mag_author_id is successfully matched
50 | -- lowerUTF8(author_name) as author_name,
51 | -- arrayStringConcat(extractAll(lowerUTF8(author_affiliation), '\\w{1,}'), ' ') as author_affiliation,
52 | -- coauthors,
53 | -- arrayStringConcat(extractAll(lowerUTF8(venue), '\\w{1,}'), ' ') as venue,
54 | -- pub_year,
55 | arrayStringConcat(extractAll(lowerUTF8(concat(paper_title, ' ', paper_abstract)), '\\w{1,}'), ' ') as content
56 | from and_ds.our_and_dataset_block any
57 | left join (
58 | select pid, aid, author_position
59 | from (select PaperId as pid, AuthorId as aid, toInt64(AuthorSequenceNumber) as author_position
60 | from mag.paper_author_affiliation) any
61 | inner join and_ds.our_and_dataset_block using pid, author_position
62 | ) using pid, author_position
63 | where xxHash32(block_fullname) %% 10=%d
64 | '''
65 |
66 | # Note #############################################################################################
67 | # Note test the performance of the learnable method.
68 | print('begin load models... ')
69 | glove = vocab.GloVe(name=glove_vocab_size, dim=300, cache=glove840b300d_path)
70 | pad_idx = 0
71 | batch_size = 640
72 | max_sql_len = 300
73 | print(max_sql_len)
74 |
75 | print('use device: ', device)
76 |
77 |
78 | def word_token(txt):
79 | words = txt.lower().split()
80 | tokens = [glove.stoi[word] for word in words if word in glove.stoi]
81 | tokens = tokens[:max_sql_len] if len(tokens) >= max_sql_len else tokens + [pad_idx] * (
82 | max_sql_len - len(tokens))
83 | tokens = np.array(tokens)
84 | return tokens
85 |
86 |
87 | # the model accepts the GloVe pretrained word embeddings
88 | model = MatchGRU(glove, hidden_dim=64, num_layers=2,
89 | # num_hand_craft_feature=len(train_set.num_hand_craft_feature_set),
90 | bidirectional=True, output_dim=2).to(device)
91 |
92 | model_path = os.path.join(cached_dir, 'match-checkpoint-glove%s-%s.pkl' % (glove_vocab_size, underlying_dataset))
93 | print('model_path: %s' % model_path)
94 | checkpoint = torch.load(model_path)
95 | model.load_state_dict(checkpoint['model_state_dict'])
96 | model.eval()
97 | print('end load models... ')
98 |
99 |
100 | class MyDataset(Dataset):
101 | def __init__(self, all_XL, all_XR):
102 | self.all_XL = all_XL
103 | self.all_XR = all_XR
104 |
105 | def __getitem__(self, index):
106 | XL = self.all_XL[index]
107 | XR = self.all_XR[index]
108 | return XL, XR
109 |
110 | def __len__(self):
111 | return self.all_XL.size(0)
112 |
113 |
114 | def compute_batch_pairwise_similarity(pairwise_dataset):
115 | all_input_loader = torch.utils.data.DataLoader(dataset=pairwise_dataset,
116 | batch_size=batch_size,
117 | # Note shuffle must not be True:
118 | # Note the output must stay aligned with the order of the input samples
119 | shuffle=False)
120 |
121 | prediction1 = torch.tensor([], device=device)
122 | prediction2 = torch.tensor([], device=device)
123 |
124 | for batch_idx, (XL, XR) in enumerate(all_input_loader):
125 | # Note matching similarity
126 | # XL, XR = torch.tensor(word_token(content1), device=device), torch.tensor(word_token(content2), device=device)
127 | with torch.no_grad():
128 | output = model([XL, XR])
129 |
130 | # Note when training with BCEWithLogitsLoss, the model has no final activation layer, so apply sigmoid here
131 | pred = output.sigmoid()
132 |
133 | prediction1 = torch.cat((prediction1, pred))
134 | # prediction2 = torch.cat((prediction2, pred2))
135 |
136 | # pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()]
137 |
138 | # for i, pid_ao_i in enumerate(pid_aos):
139 | # for j, pid_ao_j in enumerate(pid_aos):
140 | # content_word1, content_word2 = md_content_word_dict[pid_ao_i], md_content_word_dict[pid_ao_j]
141 | # # content1, content2 = md_content_dict[pid_ao_i], md_content_dict[pid_ao_j]
142 | # # doc2vec_v1, doc2vec_v2 = md_doc2vec_emd_dict[pid_ao_i], md_doc2vec_emd_dict[pid_ao_j]
143 | #
144 | # # Note matching similarity
145 | # XL, XR = torch.tensor(word_token(content1), device=device), torch.tensor(word_token(content2), device=device)
146 | # prediction = torch.tensor([], device=device)
147 | # with torch.no_grad():
148 | # output = model([XL, XR])
149 | # pred = output.sigmoid()
150 | # prediction = torch.cat((prediction, pred))
151 | #
152 | # pred_label_numpy = [1 if n[1] > 0.5 else 0 for n in prediction.cpu().numpy()]
153 | # pred_prob = [n[1] for n in prediction.cpu().numpy()]
154 |
155 | return prediction1, prediction2
156 |
157 |
158 | for seg in list(range(0, 10, 1))[::-1]:
159 | sql = sql_metadata % seg
160 | print(sql)
161 | # Note prepare the paper metadata dict
162 | # df_metadata = DBReader.tcp_model_cached_read(cached_file_path='yyy', sql=sql, cached=False)
163 | df_metadata = DBReader.tcp_model_cached_read(
164 | cached_file_path=os.path.join(cached_dir, 'block_data/block_metadata_%d.pkl' % seg), sql=sql, cached=True)
165 |
166 | print(df_metadata.shape)
167 | # print(df_metadata[['pid_ao', 'content']].values[:100])
168 |
169 | md_block_fullname_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['block_fullname'].values))
170 | md_orcid_dict = dict(zip(df_metadata['pid_ao'].values, df_metadata['orcid'].values))
171 |
172 | md_content_word_embedding = dict(
173 | zip(df_metadata['pid_ao'].values, df_metadata['content'].apply(lambda x: word_token(x)).values))
174 |
175 | del df_metadata
176 |
177 | all_block_feature = {}
178 | sql = sql_block % seg
179 | print(sql)
180 | # df_block = DBReader.tcp_model_cached_read(cached_file_path='xxx', sql=sql, cached=False)
181 | df_block = DBReader.tcp_model_cached_read(cached_file_path=os.path.join(cached_dir, 'block_data/block_data_%d.pkl' % seg),
182 | sql=sql, cached=True)
183 | for ij, row in tqdm(df_block.iterrows(), total=df_block.shape[0]):
184 | block_name, pid_aos, ground_truths, mag_preds = row
185 |
186 | # Note calculate the similarity between different metadata according to pid_ao
187 | num_instances = len(pid_aos)
188 |
189 | if num_instances > 700:
190 | block_content_term_ids = torch.tensor(np.array([md_content_word_embedding[pid_ao] for pid_ao in pid_aos]),
191 | device=device)
192 | # Note this block is too large to fit into GPU RAM at once; thus, we process each XL individually.
193 | all_XR = block_content_term_ids
194 |
195 | prediction1, prediction2 = torch.tensor([], device=device), torch.tensor([], device=device)
196 | for i in range(num_instances):
197 | one_XL = block_content_term_ids[i]
198 | one_XL = one_XL.unsqueeze(0).repeat(num_instances, 1)
199 | assert one_XL.shape == all_XR.shape
200 |
201 | pairwised_dataset = MyDataset(one_XL, all_XR)
202 | tmp_prediction1, tmp_prediction2 = compute_batch_pairwise_similarity(pairwised_dataset)
203 | prediction1 = torch.cat((prediction1, tmp_prediction1))
204 | prediction2 = torch.cat((prediction2, tmp_prediction2))
205 | else:
206 | block_content_term_ids = torch.tensor(np.array([md_content_word_embedding[pid_ao] for pid_ao in pid_aos]),
207 | device=device)
208 | all_XL = block_content_term_ids.repeat(1, block_content_term_ids.size(0)).view(-1, block_content_term_ids.size(-1))
209 | all_XR = block_content_term_ids.repeat(block_content_term_ids.size(0), 1)
210 | pairwised_dataset = MyDataset(all_XL, all_XR)
211 | prediction1, prediction2 = compute_batch_pairwise_similarity(pairwised_dataset)
212 |
213 | pred_prob = [prediction1.reshape(num_instances, -1).cpu().numpy().astype(np.float16),
214 | prediction2.reshape(num_instances, -1).cpu().numpy().astype(np.float16) if prediction2.numel() > 0 else np.zeros((num_instances, num_instances), dtype=np.float16)]
215 |
216 | all_block_feature[block_name] = pred_prob
217 |
218 | joblib.dump(all_block_feature, filename=os.path.join(cached_dir,
219 | 'cluster_feature/matching-features-glove840B-%d-with-model-trained-on-%s.pkl' % (
220 | seg, underlying_dataset)))
221 |
--------------------------------------------------------------------------------
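The pairing logic above (the repeat/view calls for blocks that fit in GPU memory) builds the full Cartesian product of citations so that the flattened predictions can be reshaped into an N x N similarity matrix. A minimal sketch with toy tensors showing why row i*N + j of the flattened batch corresponds to the pair (t_i, t_j):

    import torch

    # Toy "block": N = 3 citations, each represented by L = 2 token ids.
    T = torch.tensor([[10, 11],
                      [20, 21],
                      [30, 31]])
    N, L = T.shape

    all_XL = T.repeat(1, N).view(-1, L)  # rows: t0,t0,t0, t1,t1,t1, t2,t2,t2
    all_XR = T.repeat(N, 1)              # rows: t0,t1,t2, t0,t1,t2, t0,t1,t2

    # Row k = i*N + j of the flattened batch is the pair (t_i, t_j).
    i, j = 1, 2
    assert torch.equal(all_XL[i * N + j], T[i]) and torch.equal(all_XR[i * N + j], T[j])

    # Per-pair model outputs (faked here) reshape back into the N x N pairwise matrix.
    fake_scores = torch.arange(N * N, dtype=torch.float)
    print(fake_scores.reshape(N, N))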
/src/comparison/block/clustering_metrics_other_baselines.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../../')
5 | import joblib
6 | import numpy as np
7 | from beard import metrics
8 | from mytookit.data_reader import DBReader
9 | from sklearn.cluster import AgglomerativeClustering
10 | from tqdm import tqdm
11 |
12 | from myconfig import cached_dir, cli_args, tuned_best_cluster_setting
13 |
14 | ''' Note
15 | This script evaluates several semi-supervised learning baseline methods. To do this, we
16 | 1) train pairwise disambiguation models on the TRAINING set;
17 | 2) use these models to search for the most effective clustering parameters on the DEV set;
18 | 3) finally, use the pairwise AND models and the optimal clustering parameters to perform clustering on the TEST set.
19 | 
20 | Note that we also conduct experiments on the slimmer LAGOS-AND dataset created by removing blocks that contain only one author.
21 | '''
22 |
23 | # Note hyper-parameters ################################################################################
24 | underlying_dataset = 'pairwise-gold-standard'
25 | # cluster_uniq_author_gt_1 = False
26 | cluster_uniq_author_gt_1 = True
27 | which_model = cli_args.which_model
28 | print(which_model)
29 | HAC_distance_threshold_trials = range(1, 11, 1)
30 |
31 | # Note step 1 ##########################################################################################
32 | # Note load the trained model, which is trained on the block-based LAGOS-AND training set
33 | cached_file_base_dir = os.path.join(cached_dir, 'cluster_feature')
34 | available_model_names = ['name', 'bf', 'bf-cfjaccard', 'bf-cftfidf', 'bf-cfdoc2vec', 'bf-cfnn']
35 | available_feature_masks = [[0], [0, 1, 2, 3], [0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6], [0, 1, 2, 3, 7]]
36 |
37 | # Note load all the pairwise AND models
38 | available_models = []
39 | for n in available_model_names:
40 | model_path = os.path.join(cached_dir,
41 | 'pairwise_and_models/rf-model-with-feature-%s-trained-on-lagos-and-%s-%s-based-dataset.pkl' % (
42 | n, underlying_dataset, 'trimmed' if cluster_uniq_author_gt_1 else 'original'))
43 | print(model_path)
44 | available_models.append(joblib.load(model_path))
45 |
46 | current_model = available_model_names[which_model]
47 | ml_model = available_models[which_model]
48 | feature_mask = available_feature_masks[which_model]
49 | print('current_model: ', current_model)
50 |
51 | # Note step 2 ##########################################################################################
52 | # Note load the DEV and TEST set
53 | df_blocks = DBReader.tcp_model_cached_read(cached_file_path=os.path.join(cached_dir, 'lagos-and-block-info.pkl'),
54 | sql=r'''select block_name, pid_aos, ground_truths, mag_preds, seg, train1_test0_val2, num_unique_author_inblock, num_citaion_in_block from and_ds.our_and_dataset_block_with_block_info;''',
55 | cached=True)
56 | print(df_blocks.shape)
57 |
58 | # Note this is very important here as it will greatly reduce the size of the dataset
59 | if cluster_uniq_author_gt_1:
60 | num_instances = len(df_blocks)
61 | df_blocks = df_blocks[df_blocks['num_unique_author_inblock'] > 1]
62 | num_instances1 = len(df_blocks)
63 | print('removed %d instances, ensuring that each block contains more than one unique author' % (num_instances - num_instances1))
64 | # del df_blocks['num_unique_author_inblock'], df_blocks['num_citaion_in_block']
65 |
66 | df_train_blocks = df_blocks[df_blocks['train1_test0_val2'] == 1]
67 | df_test_blocks = df_blocks[df_blocks['train1_test0_val2'] == 0]
68 | df_val_blocks = df_blocks[df_blocks['train1_test0_val2'] == 2]
69 | print('train/val/test block sizes', df_train_blocks.shape, df_val_blocks.shape, df_test_blocks.shape)
70 | del df_blocks, df_train_blocks['train1_test0_val2'], df_test_blocks['train1_test0_val2'], df_val_blocks['train1_test0_val2']
71 |
72 |
73 | # Note step 3 ##########################################################################################
74 | # Note eval on the DEV set, trying to find the best clustering parameters
75 | def merge_feature(five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature):
76 | # set1 == set2 compares the elements of both sets,
77 | # and evaluates to True if and only if the sets are exactly the same.
78 | assert five_fast_feature.keys() == tfidf_feature.keys() == dov2vec_feature.keys() == matching_feature.keys()
79 | avg_feature_values = []
80 | merged_feature_map = {}
81 | for k in matching_feature.keys():
82 | fv1, fv2, fv3, (fv41, fv42) = five_fast_feature[k], tfidf_feature[k], dov2vec_feature[k], matching_feature[k]
83 | assert fv1.shape[:2] == fv2.shape[:2] == fv3.shape[:2] == fv41.shape[:2]
84 | num_authors = fv1.shape[0]
85 | # Note all of these are numpy arrays; we permute the feature order here,
86 | # Note so that it is aligned with the feature order used when training the AND models
87 |
88 | # feature_names_groups = [
89 | # # ['rand', ['random']],
90 | # # ['magaid', ['same_biblio_aid']],
91 | # ['name', ['name_similarity']],
92 | # ['bf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity']],
93 | # ['bf-cfjaccard',
94 | # ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'paper_title_abstract_similarity']],
95 | # ['bf-cftfidf', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'tfidf_cosin_similarity']],
96 | # ['bf-cfdoc2vec', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'content_cosin_similarity']],
97 | # ['bf-cfnn', ['name_similarity', 'pub_year_diff', 'venue_similarity', 'aff_similarity', 'match_score']]
98 | # ]
99 |
100 | # fv41[fv41 <= 0.5] = 0
101 |
102 | # Note convert the 2D numpy array into a symmetric 2D matrix
103 | fv41 = (fv41 + fv41.T) / 2
104 |
105 | tmp_concat_feature = np.concatenate(
106 | (np.expand_dims(fv1[:, :, 0], axis=2), # name_similarity, 0
107 | np.expand_dims(fv1[:, :, 1], axis=2), # pub_year_diff, 1
108 | np.expand_dims(fv1[:, :, 3], axis=2), # venue_similarity, 2
109 | np.expand_dims(fv1[:, :, 4], axis=2), # aff_similarity, 3
110 | np.expand_dims(fv1[:, :, 2], axis=2), # paper_title_abstract_similarity, 4
111 | np.expand_dims(fv2, axis=2), # tfidf, 5
112 | np.expand_dims(fv3, axis=2), # dov2vec, 6
113 | np.expand_dims(fv41, axis=2), # nn1 sigmoid, 7
114 | ),
115 | axis=2)
116 |
117 | tmp_avg_feature_value = [[num_authors * num_authors, np.sum(tmp_concat_feature[:, :, i].view().reshape(-1))] for i in
118 | range(0, 8, 1)]
119 | avg_feature_values.append(tmp_avg_feature_value)
120 |
121 | # print(tmp_concat_feature.shape)
122 | merged_feature_map[k] = tmp_concat_feature
123 |
124 | # avg_feature_values = np.array(avg_feature_values)
125 | # avg_feature_values = [np.sum(avg_feature_values[:, i, 1]) / np.sum(avg_feature_values[:, i, 0]) for i in range(0, 8, 1)]
126 | # print('feature average values: ', avg_feature_values)
127 | # feature average values: [0.8576142289367379, 5.90870462549071, 0.17193027950163528, 0.32441071932990906, 0.08016590954082886, 0.13715612273098102, 0.2845889716223027, 0.7829814895889327, 0.8295835544720577]
128 | return merged_feature_map
129 |
130 |
131 | def data_precision_round(arr, precision=2, pctg=True):
132 | return [round(x * 100 if pctg else x, precision) for x in arr]
133 |
134 |
135 | def clustering_over_input_blocks(cluster_algo, input_df_blocks):
136 | all_clustering_metrics = []
137 | all_clustering_predictions = {}
138 | segments = range(0, 10, 1)
139 | for seg in segments:
140 | # Note loading DEV block information
141 | df_seg = input_df_blocks[input_df_blocks['seg'] == seg]
142 | del df_seg['seg']
143 |
144 | # Note loading the cached feature data
145 | merged_feature_path = os.path.join(cached_file_base_dir, 'merged_features-gold-standard-%d.pkl' % seg)
146 | # merged_feature_path = os.path.join(cached_dir, 'temp/merged_features-%d.pkl' % seg)
147 | if os.path.exists(merged_feature_path):
148 | merged_feature_map = joblib.load(merged_feature_path)
149 | else:
150 | # Note consolidating the features into one feature file
151 | five_fast_feature = joblib.load(os.path.join(cached_file_base_dir, 'five-fast-features-%d.pkl' % seg))
152 | tfidf_feature = joblib.load(os.path.join(cached_file_base_dir, 'tfidf-feature-%d.pkl' % seg))
153 | dov2vec_feature = joblib.load(os.path.join(cached_file_base_dir, 'doc2vec-feature-%d.pkl' % seg))
154 | # matching_feature = joblib.load(os.path.join(cached_file_base_dir, 'matching-features-%d.pkl' % seg))
155 | # matching_feature = joblib.load(os.path.join(cached_file_base_dir, 'matching-features-glove840B-%d.pkl' % seg))
156 | matching_feature = joblib.load(os.path.join(cached_file_base_dir,
157 | 'matching-features-glove840B-%d-with-model-trained-on-%s.pkl' % (
158 | seg, underlying_dataset)))
159 |
160 | print(len(five_fast_feature), len(dov2vec_feature), len(matching_feature))
161 | merged_feature_map = merge_feature(five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature)
162 | del five_fast_feature, tfidf_feature, dov2vec_feature, matching_feature
163 | joblib.dump(merged_feature_map, merged_feature_path)
164 |
165 | for ij, row in tqdm(df_seg.iterrows(), total=df_seg.shape[0]):
166 | block_name, pid_aos, ground_truths, mag_preds, num_unique_author_inblock, num_citaiton_in_block = row
167 | num_authors = len(pid_aos)
168 |
169 | block_feature_matrix = merged_feature_map[block_name]
170 | assert block_feature_matrix.shape[:2] == (num_authors, num_authors)
171 |
172 | # Note squared predictions based on the given features
173 | block_feature_matrix = block_feature_matrix[:, :, feature_mask]
174 | block_flatten_feature_vector = block_feature_matrix.view().reshape(-1, block_feature_matrix.shape[-1])
175 | block_flatten_predictions = ml_model.predict_proba(block_flatten_feature_vector)[:, 1]
176 |
177 | # ground_truths_1D = np.array([[1 if aa == bb else 0 for aa in ground_truths] for bb in ground_truths]).reshape(-1)
178 | # for k, _ in enumerate(feature_mask):
179 | # print(k, stats.spearmanr(block_flatten_feature_vector[:, k], ground_truths_1D)[0])
180 | # print(stats.spearmanr(block_flatten_predictions, ground_truths_1D)[0])
181 |
182 | block_flatten_predictions = 1 - block_flatten_predictions # convert to distance matrix
183 | block_squared_predictions = block_flatten_predictions.reshape(num_authors, num_authors)
184 |
185 | # block_squared_predictions = 1 - block_feature_matrix[:, :, 8]
186 |
187 | # Note clustering on the block_squared_predictions using DBSCAN
188 | # cluster = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed') # , n_jobs=-1 ``-1`` means using all processors
189 | # cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, affinity='precomputed',
190 | # linkage='single')
191 |
192 | # Note the input of clustering algorithm is the distance matrix
193 | cluster_labels = cluster_algo.fit_predict(X=block_squared_predictions)
194 | all_clustering_predictions[block_name] = [cluster_labels, ground_truths]
195 |
196 | # print(block_name, len(ground_truths), len(set(ground_truths)), cluster_labels)
197 |
198 | # Note compare the cluster_labels with the ground truth and calculate the metrics
199 | block_metrics_b3 = metrics.b3_precision_recall_fscore(labels_true=ground_truths, labels_pred=cluster_labels)
200 | block_metrics_pairwisef = metrics.paired_precision_recall_fscore(labels_true=ground_truths,
201 | labels_pred=cluster_labels)
202 | all_clustering_metrics.append(
203 | [block_name] + data_precision_round(list(block_metrics_b3 + block_metrics_pairwisef), pctg=False))
204 |
205 | # if np.random.random() < 0.001:
206 | # print('intermediate results: ', np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0))
207 |
208 | return all_clustering_metrics, all_clustering_predictions
209 |
210 |
211 | if tuned_best_cluster_setting is None:
212 | print('evaluating ...')
213 | best_cluster_setting = None
214 | best_metric = -1
215 | metric_tendencies = []
216 | for cluster_setting in HAC_distance_threshold_trials:
217 | distance_threshold = 0.2 + cluster_setting * 0.01
218 |
219 | cluster_algo = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold,
220 | affinity='precomputed', linkage='single')
221 | all_clustering_metrics, all_clustering_predictions = clustering_over_input_blocks(cluster_algo, df_val_blocks)
222 |
223 | # Note compute average metrics
224 | avg_metrics = np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0)
225 | print(avg_metrics)
226 | bp, br, bf, pp, pr, pf = avg_metrics
227 | metric_tendencies.append([distance_threshold, bf])  # record (threshold, B3-F1) pairs
228 | if best_metric < bf:
229 | print('updated the best clustering B3-F1 metric from %f to %f; the corresponding clustering setting is %f' % (
230 | best_metric, bf, distance_threshold))
231 | best_metric = bf
232 | best_cluster_setting = distance_threshold
233 |
234 | # plt.plot([n[0] for n in metric_tendencies], [n[1] for n in metric_tendencies])
235 | # plt.title(current_model)
236 | # plt.savefig(os.path.join(cached_dir, 'cluster_parameter_tuning/%s.png' % current_model), dpi=600)
237 | # plt.show()
238 |
239 | print('the best_cluster_setting for current_model: %s is %f' % (current_model, best_cluster_setting))
240 | tuned_best_cluster_setting = best_cluster_setting
241 |
242 | # Note step 4 ##########################################################################################
243 | # Note cluster the block-based LAGOS-AND test set and calculate the metrics
244 | print('evaluating on the test set using the parameter %f ...' % tuned_best_cluster_setting)
245 | tuned_cluster_algo = AgglomerativeClustering(n_clusters=None, distance_threshold=tuned_best_cluster_setting,
246 | affinity='precomputed', linkage='single')
247 | all_clustering_metrics, all_clustering_predictions = clustering_over_input_blocks(tuned_cluster_algo,
248 | # Note must use the TEST set
249 | df_test_blocks)
250 | # Note compute average metrics
251 | avg_metrics = np.array([n[1:] for n in all_clustering_metrics]).mean(axis=0)
252 | print('avg_metrics: ', avg_metrics)
253 | bp, br, bf, pp, pr, pf = avg_metrics
254 |
255 | joblib.dump([avg_metrics, all_clustering_metrics, all_clustering_predictions],
256 | os.path.join(cached_dir, 'cluster_metrics/all-metrics-predictions-%s-%f.pkl' %
257 | (current_model, tuned_best_cluster_setting))
258 | )
259 |
--------------------------------------------------------------------------------
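The evaluation above clusters each block with hierarchical agglomerative clustering over a precomputed distance matrix and scores the result with B3 metrics. A minimal sketch on a hand-made 4 x 4 distance matrix, using the same AgglomerativeClustering arguments and the same beard metric call as the script (note that recent scikit-learn releases rename the affinity argument to metric):

    import numpy as np
    from beard import metrics
    from sklearn.cluster import AgglomerativeClustering

    # Hand-made symmetric distance matrix for 4 citations: {0, 1} are close, {2, 3} are close.
    D = np.array([[0.0, 0.1, 0.9, 0.8],
                  [0.1, 0.0, 0.8, 0.9],
                  [0.9, 0.8, 0.0, 0.2],
                  [0.8, 0.9, 0.2, 0.0]])

    hac = AgglomerativeClustering(n_clusters=None, distance_threshold=0.25,
                                  affinity='precomputed', linkage='single')
    pred = hac.fit_predict(D)  # expected: two clusters, {0, 1} and {2, 3}

    truth = np.array(['a', 'a', 'b', 'b'])  # toy ground-truth author labels
    b3_p, b3_r, b3_f = metrics.b3_precision_recall_fscore(labels_true=truth, labels_pred=pred)
    print(pred, b3_p, b3_r, b3_f)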
/src/datacheck/checking_multi_facets.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import os
3 |
4 | import numpy as np
5 | from matplotlib import pyplot as plot
6 |
7 | from myconfig import cached_dir, latex_doc_base_dir
8 |
9 | plot.rcParams['font.family'] = 'serif'
10 | plot.rcParams['font.serif'] = ['Times New Roman'] + plot.rcParams['font.serif']
11 |
12 | from mytookit.data_reader import DBReader
13 |
14 | colors = ['green', 'gold', 'red', 'black', 'cyan', 'blue', 'magenta', 'purple', 'gray', 'fuchsia', 'orange', 'yellow']
15 | linestyles = ['--', '-.', ':', '--']
16 | line_markers = ['<', '>', '^', 'v']
17 | linewidth = 4
18 | tick_font_size = 14
19 | df_whole = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "whole_mag_representativeness_distribution.pkl"),
20 | "select * from and_ds.whole_mag_representativeness_distribution;",
21 | cached=True)
22 | print('df_whole.shape', df_whole.shape)
23 | # ['check_item' 'distribution']
24 | print(df_whole['check_item'].values)
25 | # for i, (check_item, distribution) in df_whole.iterrows():
26 | # print(check_item, len(distribution), distribution[:5])
27 | # pub_year, author_position, lastname_popularity, ssn_gender-sex, sex_mac-sex, ethnic-seer, genni-sex, ethnea
28 | whole_pub_year_dist = df_whole[df_whole['check_item'] == 'pub_year']['distribution'].values[0]
29 | whole_author_position_dist = df_whole[df_whole['check_item'] == 'author_position']['distribution'].values[0]
30 | whole_mac_gender_dist = df_whole[df_whole['check_item'] == 'sex_mac-sex']['distribution'].values[0]
31 | whole_genni_gender_dist = df_whole[df_whole['check_item'] == 'genni-sex']['distribution'].values[0]
32 | whole_ssn_gender_dist = df_whole[df_whole['check_item'] == 'ssn_gender-sex']['distribution'].values[0]
33 | whole_lastname_popularity_dist = df_whole[df_whole['check_item'] == 'lastname_popularity']['distribution'].values[0]
34 | whole_lastname_first_initial_popularity_dist = \
35 | df_whole[df_whole['check_item'] == 'lastname_first_initial_popularity']['distribution'].values[0]
36 | # whole_lastname_popularity_dist = df_whole[df_whole['check_item'] == 'lastname']['distribution'].values[0]
37 | # whole_lastname_first_initial_popularity_dist = df_whole[df_whole['check_item'] == 'lastname_first_initial']['distribution'].values[0]
38 | whole_ethnic_seer_dist = df_whole[df_whole['check_item'] == 'ethnic-seer']['distribution'].values[0]
39 | whole_ethnea_dist = df_whole[df_whole['check_item'] == 'ethnea']['distribution'].values[0]
40 | whole_fos_dist = df_whole[df_whole['check_item'] == 'fos']['distribution'].values[0]
41 |
42 | print(whole_pub_year_dist[:5])
43 | print(whole_author_position_dist[:5])
44 | print(whole_mac_gender_dist[:5])
45 | print(whole_genni_gender_dist[:5])
46 | print(whole_ssn_gender_dist[:5])
47 | print(whole_lastname_popularity_dist[:5])
48 | print(whole_ethnic_seer_dist[:5])
49 | print(whole_ethnea_dist[:5])
50 |
51 | df_block = DBReader.tcp_model_cached_read(os.path.join(cached_dir, "orcid_mag_matched_representativeness.pkl"),
52 | sql="""select * from and_ds.our_dataset_representativeness;""",
53 | cached=True)
54 | # ['pid' 'orcid' 'author_position' 'lastname' 'ethnic_seer' 'ethnea' 'genni', 'sex_mac' 'ssn_gender' 'pub_year']
55 | print(len(df_block.columns.values), df_block.columns.values)
56 | # df_sample = df_block[:10]
57 |
58 | # Note distribution of various aspects of the block-based dataset
59 | pub_year_counter_block = sorted(collections.Counter(df_block['pub_year'].values).items(), key=lambda x: x[0], reverse=False)
60 | author_position_counter_block = sorted(collections.Counter(df_block['author_position'].values).items(), key=lambda x: x[0],
61 | reverse=False)
62 | author_genni_gender_counter_block = sorted(collections.Counter(df_block['genni'].values).items(), key=lambda x: x[0],
63 | reverse=False)
64 |
65 | author_sex_mac_counter_block = sorted(collections.Counter(df_block['sex_mac'].values).items(), key=lambda x: x[0],
66 | reverse=False)
67 | author_ssn_gender_counter_block = sorted(collections.Counter(df_block['ssn_gender'].values).items(), key=lambda x: x[0],
68 | reverse=False)
69 |
70 | author_ethnic_seer_counter_block = sorted(collections.Counter(df_block['ethnic_seer'].values).items(), key=lambda x: x[0],
71 | reverse=False)
72 | author_ethnea_counter_block = sorted(collections.Counter(df_block['ethnea'].values).items(), key=lambda x: x[0],
73 | reverse=False)
74 | author_lastname_counter_block = sorted(collections.Counter(df_block['lastname'].values).items(), key=lambda x: x[1],
75 | reverse=True)
76 | author_lastname_counter_block = sorted(collections.Counter([n[1] for n in author_lastname_counter_block]).items(),
77 | key=lambda x: x[0],
78 | reverse=True)
79 | lastname_first_initial_counter_block = sorted(collections.Counter(df_block['lastname_first_initial'].values).items(),
80 | key=lambda x: x[1], reverse=True)
81 | lastname_first_initial_counter_block = sorted(collections.Counter([n[1] for n in lastname_first_initial_counter_block]).items(),
82 | key=lambda x: x[0],
83 | reverse=True)
84 | fos_counter_block = sorted(
85 | collections.Counter([n for n in np.hstack(df_block['fos_arr'].values) if len(n) > 0]).items(), key=lambda x: x[1],
86 | reverse=False)
87 |
88 | # Note distribution of various aspects of the pairwise-based dataset
89 | df_pairwise = DBReader.tcp_model_cached_read("xxx",
90 | sql="""select * from and_ds.our_dataset_pairwise_representativeness;""",
91 | cached=False)
92 | print('df_pairwise.shape before adjustment', df_pairwise.shape)
93 |
94 | pub_year_counter_pairwise = sorted(collections.Counter(df_pairwise['pub_year'].values).items(), key=lambda x: x[0], reverse=False)
95 | author_position_counter_pairwise = sorted(collections.Counter(df_pairwise['author_position'].values).items(), key=lambda x: x[0],
96 | reverse=False)
97 | author_genni_gender_counter_pairwise = sorted(collections.Counter(df_pairwise['genni'].values).items(), key=lambda x: x[0],
98 | reverse=False)
99 | author_sex_mac_counter_pairwise = sorted(collections.Counter(df_pairwise['sex_mac'].values).items(), key=lambda x: x[0],
100 | reverse=False)
101 | author_ssn_gender_counter_pairwise = sorted(collections.Counter(df_pairwise['ssn_gender'].values).items(), key=lambda x: x[0],
102 | reverse=False)
103 | author_ethnic_seer_counter_pairwise = sorted(collections.Counter(df_pairwise['ethnic_seer'].values).items(), key=lambda x: x[0],
104 | reverse=False)
105 | author_ethnea_counter_pairwise = sorted(collections.Counter(df_pairwise['ethnea'].values).items(), key=lambda x: x[0],
106 | reverse=False)
107 | author_lastname_counter_pairwise = sorted(collections.Counter(df_pairwise['lastname'].values).items(), key=lambda x: x[1],
108 | reverse=True)
109 | author_lastname_counter_pairwise = sorted(collections.Counter([n[1] for n in author_lastname_counter_pairwise]).items(),
110 | key=lambda x: x[0],
111 | reverse=True)
112 | lastname_first_initial_counter_pairwise = sorted(collections.Counter(df_pairwise['lastname_first_initial'].values).items(),
113 | key=lambda x: x[1], reverse=True)
114 | lastname_first_initial_counter_pairwise = sorted(
115 | collections.Counter([n[1] for n in lastname_first_initial_counter_pairwise]).items(),
116 | key=lambda x: x[0],
117 | reverse=True)
118 | fos_counter_pairwise = sorted(
119 | collections.Counter([n for n in np.hstack(df_pairwise['fos_arr'].values) if len(n) > 0]).items(), key=lambda x: x[1],
120 | reverse=False)
121 |
122 |
123 | def plot_pub_year(whole_pub_year_dist, counter_block, counter_pairwise, check_item):
124 | whole_pub_year_dist = [n for n in whole_pub_year_dist if 1970 <= int(n[0]) <= 2018]
125 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
126 | whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
127 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
128 |
129 | pub_year_counter1 = [n for n in counter_block if 1970 <= n[0] <= 2018]
130 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
131 | pub_year1 = [n[0] for n in pub_year_counter1]
132 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
133 |
134 | pub_year_counter2 = [n for n in counter_pairwise if 1970 <= n[0] <= 2018]
135 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
136 | pub_year2 = [n[0] for n in pub_year_counter2]
137 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
138 |
139 | # plot.figure()
140 | idx = 0
141 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
142 | # marker=line_markers[idx], markersize=8, markevery=0.2,
143 | color=colors[idx], label='MAG', linewidth=linewidth)
144 | idx = 1
145 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
146 | # marker=line_markers[idx], markersize=8, markevery=0.2,
147 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
148 | idx = 2
149 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
150 | # marker=line_markers[idx], markersize=8, markevery=0.2,
151 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
152 |
153 | # plot.yscale('log')
154 | plot.title(check_item, fontsize=18)
155 | plot.xlabel('Year', loc='right', fontsize=18)
156 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
157 | plot.xticks(fontsize=tick_font_size)
158 | plot.yticks(fontsize=tick_font_size)
159 | plot.legend(loc='best') # 'lower right'
160 |
161 |
162 | def plot_author_position(whole_author_position_dist, counter_block, counter_pairwise, check_item):
163 | whole_pub_year_dist = [n for n in whole_author_position_dist if int(n[0]) <= 15]
164 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
165 | whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
166 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
167 |
168 | pub_year_counter1 = [n for n in counter_block if n[0] <= 15]
169 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
170 | pub_year1 = [n[0] for n in pub_year_counter1]
171 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
172 |
173 | pub_year_counter2 = [n for n in counter_pairwise if n[0] <= 15]
174 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
175 | pub_year2 = [n[0] for n in pub_year_counter2]
176 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
177 |
178 | # plot.figure()
179 | idx = 0
180 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
181 | # marker=line_markers[idx], markersize=8, markevery=0.2,
182 | color=colors[idx], label='MAG', linewidth=linewidth)
183 | idx = 1
184 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
185 | # marker=line_markers[idx], markersize=8, markevery=0.2,
186 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
187 | idx = 2
188 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
189 | # marker=line_markers[idx], markersize=8, markevery=0.2,
190 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
191 |
192 | plot.yscale('log')
193 | plot.title(check_item, fontsize=18)
194 | plot.xlabel('Author Position', loc='right', fontsize=18)
195 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
196 | plot.xticks(fontsize=tick_font_size)
197 | plot.yticks(fontsize=tick_font_size)
198 | plot.legend(loc='best') # 'lower right'
199 |
200 |
201 | def plot_author_gender(whole_genni_gender_dist, counter_block, counter_pairwise, check_item):
202 | x_label_map = {'-': 'Unsure', 'F': 'Female', 'M': 'Male', '': ''}
203 | whole_pub_year_dist = sorted(whole_genni_gender_dist, key=lambda x: x[0], reverse=False)
204 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
205 | whole_pub_year = [x_label_map[n[0]] for n in whole_pub_year_dist]
206 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
207 |
208 | pub_year_counter1 = sorted([n for n in counter_block if x_label_map[n[0]] in set(whole_pub_year)],
209 | key=lambda x: x[0], reverse=False)
210 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
211 | pub_year1 = [x_label_map[n[0]] for n in pub_year_counter1]
212 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
213 |
214 | pub_year_counter2 = sorted([n for n in counter_pairwise if x_label_map[n[0]] in set(whole_pub_year)],
215 | key=lambda x: x[0], reverse=False)
216 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
217 | pub_year2 = [x_label_map[n[0]] for n in pub_year_counter2]
218 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
219 |
220 | # plot.figure()
221 | idx = 0
222 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
223 | # marker=line_markers[idx], markersize=8, markevery=0.2,
224 | color=colors[idx], label='MAG', linewidth=linewidth)
225 | idx = 1
226 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
227 | # marker=line_markers[idx], markersize=8, markevery=0.2,
228 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
229 | idx = 2
230 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
231 | # marker=line_markers[idx], markersize=8, markevery=0.2,
232 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
233 |
234 | plot.title(check_item, fontsize=18)
235 | plot.xlabel('Gender', loc='right', fontsize=18)
236 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
237 | plot.xticks(fontsize=tick_font_size)
238 | plot.yticks(fontsize=tick_font_size)
239 |
240 | plot.legend(loc='best') # 'lower right'
241 |
242 |
243 | def plot_ethnic_seer(whole_ethnic_seer_dist, counter_block, counter_pairwise, check_item):
244 | whole_pub_year_dist = sorted(whole_ethnic_seer_dist, key=lambda x: x[1], reverse=False)
245 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
246 | whole_pub_year = [n[0] for n in whole_pub_year_dist]
247 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
248 |
249 | keys1 = [n[0] for n in counter_block]
250 | pub_year_counter1 = [counter_block[keys1.index(n)] if n in keys1 else (n, 0) for n in whole_pub_year]
251 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
252 | pub_year1 = [n[0] for n in pub_year_counter1]
253 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
254 |
255 | keys2 = [n[0] for n in counter_pairwise]
256 | pub_year_counter2 = [counter_pairwise[keys2.index(n)] if n in keys2 else (n, 0) for n in whole_pub_year]
257 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
258 | pub_year2 = [n[0] for n in pub_year_counter2]
259 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
260 |
261 | # plot.figure()
262 | idx = 0
263 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
264 | # marker=line_markers[idx], markersize=8, markevery=0.2,
265 | color=colors[idx], label='MAG', linewidth=linewidth)
266 | idx = 1
267 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
268 | # marker=line_markers[idx], markersize=8, markevery=0.2,
269 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
270 | idx = 2
271 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
272 | # marker=line_markers[idx], markersize=8, markevery=0.2,
273 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
274 |
275 | # plot.xscale('log')
276 | plot.yscale('log')
277 | plot.title(check_item, fontsize=18)
278 | plot.xlabel('Ethnicity', loc='right', fontsize=18)
279 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
280 | plot.xticks(fontsize=tick_font_size - 4)
281 | plot.yticks(fontsize=tick_font_size)
282 |
283 | plot.legend(loc='best') # 'lower right'
284 |
285 |
286 | # def plot_ethnea(whole_ethnic_seer_dist, author_ethnic_seer_counter, check_item):
287 | # whole_pub_year_dist = sorted(whole_ethnic_seer_dist, key=lambda x: x[1], reverse=False)
288 | # all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
289 | # whole_pub_year = [n[0] for n in whole_pub_year_dist]
290 | # whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
291 | #
292 | # keys = [n[0] for n in author_ethnic_seer_counter]
293 | # pub_year_counter = [author_ethnic_seer_counter[keys.index(n)] if n in keys else (n, 0) for n in whole_pub_year]
294 | # pub_cnt = sum([n[1] for n in pub_year_counter])
295 | # pub_year = [n[0] for n in pub_year_counter]
296 | # pub_count = [n[1] * 1.0 / pub_cnt for n in pub_year_counter]
297 | #
298 | # # plot.figure()
299 | # idx = 0
300 | # plot.loglog(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
301 | # # marker=line_markers[idx], markersize=8, markevery=0.2,
302 | # color=colors[idx], label='MAG', linewidth=linewidth)
303 | # idx = 1
304 | # plot.loglog(pub_year, pub_count, linestyle=linestyles[idx],
305 | # # marker=line_markers[idx], markersize=8, markevery=0.2,
306 | # color=colors[idx], label='LAGOS-AND', linewidth=linewidth)
307 | # plot.title(check_item, fontsize=18)
308 | # plot.xlabel('ethnicity', loc='right', fontsize=18)
309 | # plot.ylabel('ethnicity proportion', loc='center', fontsize=18) # 'top'
310 | # plot.legend(loc='best') # 'lower right'
311 |
312 |
313 | def plot_lastname_popularity(whole_lastname_popularity_dist, counter_block, counter_pairwise, check_item):
314 | # ratio = len(author_lastname_counter) * 1.0 / len(whole_lastname_popularity_dist)
315 | # used_for_plot_ratio = 1
316 | whole_pub_year_dist = sorted([n for n in whole_lastname_popularity_dist],
317 | # if random() <= used_for_plot_ratio * ratio
318 | key=lambda x: int(x[0]), reverse=False)
319 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
320 | whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
321 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
322 |
323 | pub_year_counter1 = sorted([n for n in counter_block], # if random() <= used_for_plot_ratio
324 | key=lambda x: int(x[0]), reverse=False)
325 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
326 | pub_year1 = [int(n[0]) for n in pub_year_counter1]
327 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
328 |
329 | pub_year_counter2 = sorted([n for n in counter_pairwise], # if random() <= used_for_plot_ratio
330 | key=lambda x: int(x[0]), reverse=False)
331 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
332 | pub_year2 = [int(n[0]) for n in pub_year_counter2]
333 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
334 |
335 | print(whole_pub_year_dist[0], whole_pub_year_dist[-1])
336 | print(pub_year_counter1[0], pub_year_counter1[-1])
337 | print(pub_year_counter2[0], pub_year_counter2[-1])
338 |
339 | print(list(zip(whole_pub_year, whole_pub_dist)))
340 | print(list(zip(pub_year1, pub_count1)))
341 | print(list(zip(pub_year2, pub_count2)))
342 |
343 | # plot.figure()
344 | idx = 0
345 | plot.scatter(whole_pub_year, # [n * 100.0 / len(whole_pub_dist) for n in range(len(whole_pub_dist))],
346 | whole_pub_dist,
347 | marker='.',
348 | color=colors[idx], label='MAG', s=4)
349 | idx = 1
350 | plot.scatter(pub_year1, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
351 | pub_count1,
352 | marker='o',
353 | color=colors[idx], label='LAGOS-AND-BLOCK', s=4)
354 | idx = 2
355 | plot.scatter(pub_year2, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
356 | pub_count2,
357 | marker='s',
358 | color=colors[idx], label='LAGOS-AND-PAIRWISE', s=4)
359 |
360 | plot.xscale('log')
361 | plot.yscale('log')
362 | plot.title(check_item, fontsize=18)
363 | plot.xlabel('LN Popularity', loc='right', fontsize=18)
364 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
365 | plot.xticks(fontsize=tick_font_size)
366 | plot.yticks(fontsize=tick_font_size)
367 |
368 | plot.legend(loc='best') # 'lower right'
369 |
370 |
371 | def plot_namespace_popularity(whole_lastname_popularity_dist, counter_block, counter_pairwise, check_item):
372 | # ratio = len(author_lastname_counter) * 1.0 / len(whole_lastname_popularity_dist)
373 | # used_for_plot_ratio = 1
374 | whole_pub_year_dist = sorted([n for n in whole_lastname_popularity_dist],
375 | # if random() <= used_for_plot_ratio * ratio
376 | key=lambda x: int(x[0]), reverse=False)
377 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
378 | whole_pub_year = [int(n[0]) for n in whole_pub_year_dist]
379 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
380 |
381 | pub_year_counter1 = sorted([n for n in counter_block], # if random() <= used_for_plot_ratio
382 | key=lambda x: int(x[0]), reverse=False)
383 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
384 | pub_year1 = [int(n[0]) for n in pub_year_counter1]
385 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
386 |
387 | pub_year_counter2 = sorted([n for n in counter_pairwise], # if random() <= used_for_plot_ratio
388 | key=lambda x: int(x[0]), reverse=False)
389 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
390 | pub_year2 = [int(n[0]) for n in pub_year_counter2]
391 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
392 |
393 | print(whole_pub_year_dist[0], whole_pub_year_dist[-1])
394 | print(pub_year_counter1[0], pub_year_counter1[-1])
395 | print(pub_year_counter2[0], pub_year_counter2[-1])
396 |
397 | print(list(zip(whole_pub_year, whole_pub_dist)))
398 | print(list(zip(pub_year1, pub_count1)))
399 | print(list(zip(pub_year2, pub_count2)))
400 |
401 | # plot.figure()
402 | idx = 0
403 | plot.scatter(whole_pub_year, # [n * 100.0 / len(whole_pub_dist) for n in range(len(whole_pub_dist))],
404 | whole_pub_dist,
405 | marker='.',
406 | color=colors[idx], label='MAG', s=4)
407 | idx = 1
408 | plot.scatter(pub_year1, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
409 | pub_count1,
410 | marker='o',
411 | color=colors[idx], label='LAGOS-AND-BLOCK', s=4)
412 | idx = 2
413 | plot.scatter(pub_year2, # [n * 100.0 / len(pub_year) for n in range(len(pub_year))],
414 | pub_count2,
415 | marker='s',
416 | color=colors[idx], label='LAGOS-AND-PAIRWISE', s=4)
417 |
418 | plot.xscale('log')
419 | plot.yscale('log')
420 | plot.title(check_item, fontsize=18)
421 | plot.xlabel('LNFI Popularity', loc='right', fontsize=18)
422 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
423 | plot.xticks(fontsize=tick_font_size)
424 | plot.yticks(fontsize=tick_font_size)
425 |
426 | plot.legend(loc='best') # 'lower right'
427 |
428 |
429 | def plot_fos(whole_fos_dist, counter_block, counter_pairwise, check_item):
430 | whole_pub_year_dist = sorted(whole_fos_dist, key=lambda x: x[1], reverse=False)
431 | all_pub_cnt = sum([n[1] for n in whole_pub_year_dist])
432 | whole_pub_year = [n[0] for n in whole_pub_year_dist]
433 | whole_pub_dist = [n[1] * 1.0 / all_pub_cnt for n in whole_pub_year_dist]
434 |
435 | keys1 = [n[0] for n in counter_block]
436 | print(keys1)
437 | pub_year_counter1 = [counter_block[keys1.index(n)] if n in keys1 else (n, 0) for n in whole_pub_year]
438 | pub_cnt1 = sum([n[1] for n in pub_year_counter1])
439 | pub_year1 = [n[0] for n in pub_year_counter1]
440 | pub_count1 = [n[1] * 1.0 / pub_cnt1 for n in pub_year_counter1]
441 |
442 | keys2 = [n[0] for n in counter_pairwise]
443 | print(keys2)
444 | pub_year_counter2 = [counter_pairwise[keys2.index(n)] if n in keys2 else (n, 0) for n in whole_pub_year]
445 | pub_cnt2 = sum([n[1] for n in pub_year_counter2])
446 | pub_year2 = [n[0] for n in pub_year_counter2]
447 | pub_count2 = [n[1] * 1.0 / pub_cnt2 for n in pub_year_counter2]
448 |
449 | # plot.figure()
450 | idx = 0
451 | plot.plot(whole_pub_year, whole_pub_dist, linestyle=linestyles[idx],
452 | # marker=line_markers[idx], markersize=8, markevery=0.2,
453 | color=colors[idx], label='MAG', linewidth=linewidth)
454 | idx = 1
455 | plot.plot(pub_year1, pub_count1, linestyle=linestyles[idx],
456 | # marker=line_markers[idx], markersize=8, markevery=0.2,
457 | color=colors[idx], label='LAGOS-AND-BLOCK', linewidth=linewidth)
458 | idx = 2
459 | plot.plot(pub_year2, pub_count2, linestyle=linestyles[idx],
460 | # marker=line_markers[idx], markersize=8, markevery=0.2,
461 | color=colors[idx], label='LAGOS-AND-PAIRWISE', linewidth=linewidth)
462 | # plot.yscale('log')
463 | plot.xticks(fontsize=10, rotation=45, ha='right')
464 | # plot.autofmt_xdate(bottom=0.2, rotation=30, ha='center')
465 | plot.title(check_item, fontsize=18)
466 | # plot.xlabel('domain', loc='right')
467 | plot.ylabel('Proportion', loc='center', fontsize=18) # 'top'
468 | # plot.xticks(fontsize=tick_font_size)
469 | plot.yticks(fontsize=tick_font_size)
470 |
471 | plot.legend(loc='best') # 'lower right'
472 |
473 |
474 | plot.figure(42, figsize=(12, 18), dpi=300)
475 | plot.subplot(421)
476 | # plot.grid(True)
477 | plot_pub_year(whole_pub_year_dist, pub_year_counter_block, pub_year_counter_pairwise,
478 | check_item='(a) Publication Distribution')
479 | plot.subplot(422)
480 | plot_author_position(whole_author_position_dist, author_position_counter_block, author_position_counter_pairwise,
481 | check_item='(b) Author Position Distribution')
482 | plot.subplot(423)
483 |
484 | plot_author_gender(whole_genni_gender_dist, author_genni_gender_counter_block, author_genni_gender_counter_pairwise,
485 | check_item='(c) Gender Distribution')
486 | plot.subplot(424)
487 | # plot_author_gender(whole_mac_gender_dist, author_sex_mac_counter, check_item='mac_gender')
488 | # plot_author_gender(whole_ssn_gender_dist, author_ssn_gender_counter, check_item='ssn_gender')
489 |
490 | plot_ethnic_seer(whole_ethnic_seer_dist, author_ethnic_seer_counter_block, author_ethnic_seer_counter_pairwise,
491 | check_item='(d) Ethnicity Distribution')
492 |
493 | # plot_ethnea(whole_ethnea_dist, author_ethnea_counter, check_item='(d) ethnicity distribution')
494 | # plot.subplot(425)
495 |
496 | plot.subplot(425)
497 | plot_lastname_popularity(whole_lastname_popularity_dist, author_lastname_counter_block, author_lastname_counter_pairwise,
498 | check_item='(e) LN Popularity Distribution')
499 | plot.subplot(426)
500 | plot_namespace_popularity(whole_lastname_first_initial_popularity_dist, lastname_first_initial_counter_block,
501 | lastname_first_initial_counter_pairwise,
502 | check_item='(f) LNFI Popularity Distribution')
503 |
504 | plot.subplot(427)
505 | plot_fos(whole_fos_dist, fos_counter_block, fos_counter_pairwise, check_item='(g) Domain Distribution')
506 |
507 | plot.tight_layout()
508 | plot.savefig(os.path.join(cached_dir, 'data-distribution.png'), dpi=600)
509 | plot.savefig(os.path.join(latex_doc_base_dir, 'figs/data-distribution.png'), dpi=600)
510 | # plot.savefig(os.path.join(cached_dir, 'gold-standard-check.pdf'), dpi=500)
511 | plot.show()
512 |
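513 | # The plot_* helpers above rely on module-level names defined earlier in this file and
514 | # not visible in this excerpt: `plot` (matplotlib.pyplot), `colors`, `linestyles`,
515 | # `line_markers`, `linewidth`, `tick_font_size`, `cached_dir`, and `latex_doc_base_dir`,
516 | # plus the pre-computed distribution/counter lists passed to each call. A minimal sketch
517 | # of that assumed setup, with illustrative values only (kept commented out so it does not
518 | # shadow the real definitions), would look like:
519 | #
520 | #     import os
521 | #     import matplotlib.pyplot as plot
522 | #     colors = ['tab:blue', 'tab:orange', 'tab:green']
523 | #     linestyles = ['-', '--', ':']
524 | #     line_markers = ['.', 'o', 's']
525 | #     linewidth = 2
526 | #     tick_font_size = 14
527 | #     cached_dir = './cached'
528 | #     latex_doc_base_dir = './latex-doc'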
--------------------------------------------------------------------------------