├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── doc ├── API list.xlsx ├── Expert_Recommendation.md ├── Gender_Prediction.md ├── Identity_Prediction.md ├── Jobhopping_Prediction.md ├── NSFC_AI_Subject_Classifier.md ├── NSFC_Subject_Classifier.md ├── Paper_Ranker.md └── paper_ranker.png ├── model └── README.md ├── src ├── __init__.py ├── aiclassifier.py ├── classifier.py ├── config.py ├── expertrec.py ├── gender.py ├── jobhopping.py ├── paperranker.py ├── tors.py └── utils │ ├── __init__.py │ ├── acautomaton.py │ ├── crawler.py │ └── translator.py └── test ├── nsfc_test.py └── pageranker_test.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/* 2 | .vscode 3 | .idea/* 4 | .idea 5 | *.pkl 6 | *.csv 7 | *.bin 8 | *.json 9 | *.html 10 | *.pk 11 | jobhopping/* 12 | __pycache__ 13 | __pycache__/* 14 | model/student/model.ckpt-1000.meta 15 | model/student/model.ckpt-1000.index 16 | model/student/model.ckpt-1000.data-00000-of-00001 17 | model/student/graph.pbtxt 18 | model/student/eval/events.out.tfevents.1539419480.Juliuss-MacBook-Pro.local 19 | model/student/eval/events.out.tfevents.1539416954.Juliuss-MacBook-Pro.local 20 | model/student/checkpoint 21 | model/jobhopping/orgID2orgname 22 | model/jobhopping/model 23 | model/student/orgID2orgname 24 | model/student/model 25 | doc/~$API list.xlsx 26 | model/expert/model_aminer 27 | test/baidu_translator.py 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 AMiner Open Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prediction API 2 | ![](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg) 3 | 4 | ## Introduction 5 | 6 | _AMiner Prediction API_ is a toolkit for science data prediction, such as scholar portrait property prediction. 
The toolkit aims to combine science data with machine learning algorithms to provide more intelligent functionality for researchers worldwide. All algorithms and models used by the toolkit are derived from [AMiner](https://aminer.cn).

## Prerequisites

[`Anaconda`](https://www.anaconda.com/) is strongly recommended for environment configuration. Additionally, several libraries are required. Use the following commands to install them:

| Library | Command |
| :----------------------------------------------------------: | :-----------------------------------------------: |
| [`fastText`](https://fasttext.cc/) | `conda install -c mbednarski fasttext` ***** |
| [`Scikit-learn`](https://scikit-learn.org/) | `conda install -c anaconda scikit-learn` |
| [`Jieba`](https://github.com/fxsjy/jieba) | `conda install -c conda-forge jieba` |
| [`Requests`](https://2.python-requests.org/) | `conda install -c conda-forge requests` |
| [`Tensorflow`](http://tensorflow.org/) | `conda install -c conda-forge tensorflow` |
| [`Pytorch`](http://pytorch.org/) | `conda install -c pytorch pytorch` |
| [`Numpy`](http://numpy.scipy.org/) | `conda install -c conda-forge numpy` |
| [`Pandas`](http://pandas.pydata.org/) | `conda install -c conda-forge pandas` |
| [`BeautifulSoup4`](http://www.crummy.com/software/BeautifulSoup/) | `conda install -c conda-forge beautifulsoup4` |
| [`Scrapy`](https://scrapy.org/) | `conda install -c conda-forge scrapy` |
| `Levenshtein` | `conda install -c conda-forge python-levenshtein` |

> ***** If you are using **macOS**, use the following command to install `fastText`:
>
> ```bash
> conda install -c conda-forge fasttext
> ```

## Model Download

The toolkit depends on several pre-trained model files, which can be downloaded from the following address:

[Download](https://lfs.aminer.cn/misc/model.zip)

Extract the archive and move all files into the `model` directory before running the test code.
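As an optional sanity check of the setup, the sketch below verifies that a few of the libraries from the table import correctly and that the model files are in place. It is illustrative only (not part of the repository) and assumes it is run from the repository root:

```python
import importlib
import os

# Note: the import name for fastText differs from its conda package name.
for lib in ['fastText', 'sklearn', 'jieba', 'requests', 'numpy', 'pandas', 'bs4']:
    try:
        importlib.import_module(lib)
        print('ok      ' + lib)
    except ImportError:
        print('missing ' + lib)

# model.zip should have been extracted into the repository's model/ directory.
print('model directory present:', os.path.isdir('model'))
```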

## Documentation

[NSFC Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_Subject_Classifier.md)

[NSFC AI Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_AI_Subject_Classifier.md)

[Gender Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Gender_Prediction.md)

[Identity Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Identity_Prediction.md)

[Jobhopping Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Jobhopping_Prediction.md)

[Expert Recommendation](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Expert_Recommendation.md)

[Paper Ranker](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Paper_Ranker.md)

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/__init__.py
--------------------------------------------------------------------------------
/doc/API list.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/doc/API list.xlsx
--------------------------------------------------------------------------------
/doc/Expert_Recommendation.md:
--------------------------------------------------------------------------------
# Expert recommendation

## Introduction

ExpertRec is a class that recommends experts working in the field of a given text.

## Method

### search

```python
search(text, num=20)
```

#### Introduction

Recommend experts working in the field of the given text.

#### Parameters

##### text

The input text describing a research topic or field.

##### num

The number of experts to recommend.

#### Return value

A list of dictionaries:

```python
{
    'id': The expert's ID in AMiner (http://www.aminer.cn/),
    'url': The expert's AMiner homepage,
    'L2 distance': Similarity. The smaller the L2 distance is, the more likely the expert is interested in the given text's field.
38 | } 39 | ``` 40 | 41 | #### An example 42 | 43 | ```python 44 | e = ExpertRec() 45 | rt = e.search('natural language processing') 46 | ``` 47 | 48 | `rt`: 49 | 50 | ```python 51 | [ 52 | { 53 | 'id': '544572eddabfae862da1d4e0', 54 | 'url': 'http://www.aminer.cn/profile/544572eddabfae862da1d4e0', 55 | 'L2 distance': 0.0 56 | }, 57 | { 58 | 'id': '53f438eadabfaee0d9b7cce4', 59 | 'url': 'http://www.aminer.cn/profile/53f438eadabfaee0d9b7cce4', 60 | 'L2 distance': 0.26824072 61 | }, 62 | { 63 | 'id': '53f432cbdabfaeb1a7bcfd9a', 64 | 'url': 'http://www.aminer.cn/profile/53f432cbdabfaeb1a7bcfd9a', 65 | 'L2 distance': 0.31506824 66 | }, 67 | { 68 | 'id': '53f432b7dabfaeb2ac02dc61', 69 | 'url': 'http://www.aminer.cn/profile/53f432b7dabfaeb2ac02dc61', 70 | 'L2 distance': 0.3284118 71 | }, 72 | { 73 | 'id': '53f43757dabfaeecd696742f', 74 | 'url': 'http://www.aminer.cn/profile/53f43757dabfaeecd696742f', 75 | 'L2 distance': 0.34276736 76 | } 77 | ] 78 | ``` 79 | 80 | ## API 81 | 82 | ### https://innovaapi.aminer.cn/tools/v1/predict/experts 83 | 84 | ![](https://img.shields.io/badge/http-post-blue.svg) 85 | 86 | An online version of method `search` 87 | 88 | #### Request body 89 | 90 | ##### text 91 | 92 | The text 93 | 94 | ##### num 95 | 96 | The number of the recommended experts. 97 | 98 | #### Return value 99 | 100 | In the `Response` object, there will be three fields. 101 | 102 | ##### status 103 | 104 | `0`: Success 105 | 106 | `1`: There are some errors. 107 | 108 | ##### message 109 | 110 | `success`: Success 111 | 112 | If there are some errors, you will get the error information. 113 | 114 | ##### data 115 | 116 | The return value from the method. 117 | 118 | #### An example 119 | 120 | ```http 121 | POST /tools/v1/predict/experts? HTTP/1.1 122 | Host: innovaapi.aminer.cn 123 | Content-Type: application/json 124 | User-Agent: PostmanRuntime/7.13.0 125 | Accept: */* 126 | Cache-Control: no-cache 127 | Postman-Token: 05b4af12-9cf0-4cf9-a45c-6f8fd2a9d0a9,867a4a93-f753-4799-9aa9-96b8208f5067 128 | Host: innovaapi.aminer.cn 129 | accept-encoding: gzip, deflate 130 | content-length: 60 131 | Connection: keep-alive 132 | cache-control: no-cache 133 | 134 | { 135 | "text": "natural language processing", 136 | "num": 20 137 | } 138 | ``` 139 | 140 | `Response`: 141 | 142 | ```json 143 | { 144 | "status": 0, 145 | "message": "success", 146 | "data": [ 147 | { 148 | "id": "544572eddabfae862da1d4e0", 149 | "url": "http://www.aminer.cn/profile/544572eddabfae862da1d4e0", 150 | "L2 distance": 0 151 | }, 152 | { 153 | "id": "53f438eadabfaee0d9b7cce4", 154 | "url": "http://www.aminer.cn/profile/53f438eadabfaee0d9b7cce4", 155 | "L2 distance": 0.27 156 | }, 157 | { 158 | "id": "53f432cbdabfaeb1a7bcfd9a", 159 | "url": "http://www.aminer.cn/profile/53f432cbdabfaeb1a7bcfd9a", 160 | "L2 distance": 0.32 161 | }, 162 | { 163 | "id": "53f432b7dabfaeb2ac02dc61", 164 | "url": "http://www.aminer.cn/profile/53f432b7dabfaeb2ac02dc61", 165 | "L2 distance": 0.33 166 | }, 167 | { 168 | "id": "53f43757dabfaeecd696742f", 169 | "url": "http://www.aminer.cn/profile/53f43757dabfaeecd696742f", 170 | "L2 distance": 0.34 171 | }, 172 | { 173 | "id": "53f556d8dabfae963d25e88d", 174 | "url": "http://www.aminer.cn/profile/53f556d8dabfae963d25e88d", 175 | "L2 distance": 0.35 176 | }, 177 | { 178 | "id": "53f4dc08dabfaef7e077b586", 179 | "url": "http://www.aminer.cn/profile/53f4dc08dabfaef7e077b586", 180 | "L2 distance": 0.35 181 | }, 182 | { 183 | "id": "5448db69dabfae87b7e87eb5", 184 | "url": 
"http://www.aminer.cn/profile/5448db69dabfae87b7e87eb5", 185 | "L2 distance": 0.38 186 | }, 187 | { 188 | "id": "53f430c6dabfaeb2ac014a3a", 189 | "url": "http://www.aminer.cn/profile/53f430c6dabfaeb2ac014a3a", 190 | "L2 distance": 0.39 191 | }, 192 | { 193 | "id": "53f42cebdabfaee02ac5a471", 194 | "url": "http://www.aminer.cn/profile/53f42cebdabfaee02ac5a471", 195 | "L2 distance": 0.4 196 | }, 197 | { 198 | "id": "53f43940dabfaefedbae3ddb", 199 | "url": "http://www.aminer.cn/profile/53f43940dabfaefedbae3ddb", 200 | "L2 distance": 0.4 201 | }, 202 | { 203 | "id": "53f44514dabfaee43ec789c9", 204 | "url": "http://www.aminer.cn/profile/53f44514dabfaee43ec789c9", 205 | "L2 distance": 0.41 206 | }, 207 | { 208 | "id": "53f4616fdabfaee4dc839eba", 209 | "url": "http://www.aminer.cn/profile/53f4616fdabfaee4dc839eba", 210 | "L2 distance": 0.41 211 | }, 212 | { 213 | "id": "53f7c250dabfae938c6d865c", 214 | "url": "http://www.aminer.cn/profile/53f7c250dabfae938c6d865c", 215 | "L2 distance": 0.41 216 | }, 217 | { 218 | "id": "53f43403dabfaee1c0a86645", 219 | "url": "http://www.aminer.cn/profile/53f43403dabfaee1c0a86645", 220 | "L2 distance": 0.41 221 | }, 222 | { 223 | "id": "53f44194dabfaee2a1d254c8", 224 | "url": "http://www.aminer.cn/profile/53f44194dabfaee2a1d254c8", 225 | "L2 distance": 0.41 226 | }, 227 | { 228 | "id": "53f430dddabfaeb1a7bb7664", 229 | "url": "http://www.aminer.cn/profile/53f430dddabfaeb1a7bb7664", 230 | "L2 distance": 0.41 231 | }, 232 | { 233 | "id": "542c4a3bdabfae2b4e1fe347", 234 | "url": "http://www.aminer.cn/profile/542c4a3bdabfae2b4e1fe347", 235 | "L2 distance": 0.41 236 | }, 237 | { 238 | "id": "53f482cedabfaec09f2a3dfb", 239 | "url": "http://www.aminer.cn/profile/53f482cedabfaec09f2a3dfb", 240 | "L2 distance": 0.42 241 | }, 242 | { 243 | "id": "53f42945dabfaeb22f3d3d86", 244 | "url": "http://www.aminer.cn/profile/53f42945dabfaeb22f3d3d86", 245 | "L2 distance": 0.42 246 | } 247 | ] 248 | } 249 | ``` 250 | 251 | -------------------------------------------------------------------------------- /doc/Gender_Prediction.md: -------------------------------------------------------------------------------- 1 | # Gender Prediction 2 | 3 | ## Introduction 4 | 5 | Gender is a class which is used to predict a person's gender. 6 | 7 | If you want to use face detection to help you predict a person's gender, you can get an api-key from [Face++]( https://console.faceplusplus.com/documents/7079083), and then put the api-key into the `config.py` 8 | 9 | ## Method 10 | 11 | ### predict 12 | 13 | ```python 14 | predict(self, name, org, source='google', image_url=None, image_file=None) 15 | ``` 16 | 17 | #### Introduction 18 | 19 | Predict a person's gender. We use name, results from search engine and the person's photo as features to predict his or her gender. About the photos, you can choose either online photos or local photos. 20 | 21 | #### Parameters 22 | 23 | ##### name 24 | 25 | **string**, the person's name 26 | 27 | ##### org 28 | 29 | **string**, the person's organization 30 | 31 | ##### source 32 | 33 | Use `google` or `baidu` as the search engine. 34 | 35 | It is strongly recommended to use Google because the model is trained accoring to the results from Google. 36 | 37 | ##### image_url 38 | 39 | The photo's online url. 40 | 41 | ##### image_file 42 | 43 | The photo's local path. 

#### An example

```python
g = Gender()
gen = g.predict(name='Jie Tang', org='Tsinghua University', image_url='http://www.cs.tsinghua.edu.cn/publish/cs/4616/20110330101939787483549/20190321114128398502759.jpg')
```

`gen`:

```python
{
    'name': {
        'male': 0.5,
        'female': 0.5
    },
    'search': {
        'male': 0.9173952287088033,
        'female': 0.0826047712911967
    },
    'face': {
        'male': 1,
        'female': 0
    },
    'male': 0.96,
    'female': 0.04
}
```

## API

### https://innovaapi.aminer.cn/tools/v1/predict/gender

![](https://img.shields.io/badge/http-get-brightgreen.svg)

An online version of the method `predict`

#### Parameters

##### name

**string**, the person's name

##### org

**string**, the person's organization

##### image_url

The photo's online URL.

#### Return value

In the `Response` object, there will be three fields.

##### status

`0`: Success

`1`: There are some errors.

##### message

`success`: Success

If there are some errors, you will get the error information.

##### data

The return value from the method.

#### An example

https://innovaapi.aminer.cn/tools/v1/predict/gender?name=Feifei%20Li&org=Stanford%20University

`Response`:

```json
{
    "status": 0,
    "message": "success",
    "data": {
        "male": 0.07,
        "female": 0.93,
        "name": {
            "male": 0,
            "female": 1
        },
        "search": {
            "male": 0.13,
            "female": 0.87
        },
        "face": {
            "male": 0.5,
            "female": 0.5
        }
    }
}
```

--------------------------------------------------------------------------------
/doc/Identity_Prediction.md:
--------------------------------------------------------------------------------
# Identity Prediction

## Introduction

Predict a scholar's identity (teacher or student) and his or her degree.

## Method

### predict

```python
predict(pc=0, cn=0, hi=0, gi=0, year_range=0)
```

#### Introduction

Predict whether a scholar is a teacher or a student, and then predict his or her degree.

#### Parameters

##### pc

Number of papers

##### cn

Citation number

##### hi

H-index. E.g., an h-index of 25 means the researcher has 25 papers, each of which has been cited 25+ times.

##### gi

G-index. Given a set of articles ranked in decreasing order of the number of citations that they received, the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.

##### year_range

The time span of the scholar's publications, in years.
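For reference, both indices defined above can be computed directly from a list of per-paper citation counts. A minimal sketch; these helper functions are illustrative and not part of the toolkit:

```python
def h_index(citations):
    # Largest h such that h papers have at least h citations each.
    cs = sorted(citations, reverse=True)
    return sum(1 for i, c in enumerate(cs, start=1) if c >= i)

def g_index(citations):
    # Largest g such that the top g papers together have at least g^2 citations.
    cs = sorted(citations, reverse=True)
    total, g = 0, 0
    for i, c in enumerate(cs, start=1):
        total += c
        if total >= i * i:
            g = i
    return g

print(h_index([25] * 25))           # 25, matching the example above
print(g_index([30, 20, 15, 5, 1]))  # 4: the top 4 papers have 70 >= 16 citations
```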

#### Return value

A dictionary:

```python
{
    'label': 'student' or 'teacher',
    'degree': 'undergraduate', 'master' or 'doctor',
    'p': probability
}
```

#### An example

```python
identity = TorS()
i = identity.predict(pc=10, cn=10000, hi=40, gi=0, year_range=14)
```

`i`:

```python
{'label': 'teacher', 'degree': 'doctor', 'p': 0.9993}
```

## API

### https://innovaapi.aminer.cn/tools/v1/predict/identity

![](https://img.shields.io/badge/http-get-brightgreen.svg)

An online version of the method `predict`

#### Parameters

##### pc

Number of papers

##### cn

Citation number

##### hi

H-index. E.g., an h-index of 25 means the researcher has 25 papers, each of which has been cited 25+ times.

##### gi

G-index. Given a set of articles ranked in decreasing order of the number of citations that they received, the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.

##### year_range

The time span of the scholar's publications, in years.
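A minimal sketch of calling this endpoint from Python with `requests` (the response format is described below):

```python
import requests

params = {'pc': 10, 'cn': 10000, 'hi': 40, 'gi': 0, 'year_range': 14}
r = requests.get('https://innovaapi.aminer.cn/tools/v1/predict/identity', params=params)
body = r.json()
if body['status'] == 0:
    # e.g. {'label': 'teacher', 'degree': 'doctor', 'p': 0.9993}
    print(body['data'])
```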
28 | 29 | #### Return value 30 | 31 | A list of dictionaries 32 | 33 | ```python 34 | { 35 | 'name': the most likely future affiliation's name 36 | 'p': the probability 37 | } 38 | ``` 39 | 40 | #### An example 41 | 42 | ```python 43 | j = JobHopping() 44 | aff = j.predict(['tsinghua university','mazandaran university','birsa agricultural university']) 45 | ``` 46 | 47 | `aff`: 48 | 49 | ```python 50 | [ 51 | { 52 | 'name': 'university of michigan', 53 | 'p': 0.33 54 | }, 55 | { 56 | 'name': 'university of cambridge', 57 | 'p': 0.33 58 | }, 59 | { 60 | 'name': 'university of california berkeley', 61 | 'p': 0.33 62 | } 63 | ] 64 | ``` 65 | 66 | ## API 67 | 68 | ### https://innovaapi.aminer.cn/tools/v1/predict/career 69 | 70 | ![](https://img.shields.io/badge/http-get-brightgreen.svg) 71 | 72 | An online version of method `predict` 73 | 74 | #### Parameters 75 | ##### per_name 76 | 77 | The scholar's name 78 | 79 | ##### org_name 80 | 81 | The scholar's affiliation name 82 | 83 | #### Return value 84 | 85 | In the `Response` object, there will be three fields. 86 | 87 | ##### status 88 | 89 | `0`: Success 90 | 91 | `1`: There are some errors. 92 | 93 | ##### message 94 | 95 | `success`: Success 96 | 97 | If there are some errors, you will get the error infomation. 98 | 99 | ##### data 100 | 101 | The return value from the method. 102 | 103 | #### An example 104 | 105 | https://innovaapi.aminer.cn/tools/v1/predict/career?per_name=XXX&org_name=XXX 106 | 107 | Return Value: 108 | 109 | ```json 110 | { 111 | "status": 0, 112 | "message": "success", 113 | "data": [ 114 | { 115 | "name": "university of michigan", 116 | "p": 0.33 117 | }, 118 | { 119 | "name": "university of california berkeley", 120 | "p": 0.33 121 | }, 122 | { 123 | "name": "stanford university", 124 | "p": 0.33 125 | } 126 | ] 127 | } 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /doc/NSFC_AI_Subject_Classifier.md: -------------------------------------------------------------------------------- 1 | # NSFC AI Subject Classifier 2 | 3 | ## Introduction 4 | 5 | AIClassifier is a class which is used to classify AI subjects according to some keywords. It depends on the classification of [Natural Science Foundation of China(NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html). 6 | 7 | ## Method 8 | 9 | ### get_tree 10 | 11 | ```python 12 | get_tree(words): 13 | ``` 14 | 15 | #### Introduction 16 | 17 | Get a subject tree according to some keywords. 18 | 19 | #### Parameters 20 | 21 | ##### words 22 | 23 | A **list** of keywords. 24 | 25 | #### Return value 26 | 27 | A list of **dictionary** 28 | 29 | ```python 30 | [ 31 | { 32 | "name": subject name, 33 | "value": probability, 34 | "children": subtrees. They also have the same structure. 
If this is a leaf node, it won't have this field 35 | } 36 | ] 37 | ``` 38 | 39 | #### An example 40 | 41 | ```python 42 | ai_nsfc = AIClassifier() 43 | words = ['search engine'] 44 | subject = ai_nsfc.get_tree(words) 45 | print(subject) 46 | ``` 47 | 48 | Return value: 49 | 50 | ```python 51 | [ 52 | { 53 | 'name': '人工智能', 54 | 'value': 1.0, 55 | 'children': [ 56 | { 57 | 'name': '自然语言处理', 58 | 'value': 0.6236383458601308, 59 | 'children': [ 60 | {'name': '文本检索、挖掘与信息抽取', 'value': 0.6106190412551927} 61 | ] 62 | }, 63 | { 64 | 'name': '知识表示与处理', 65 | 'value': 0.3763616541398693, 66 | 'children': [ 67 | {'name': '知识发现与数据挖掘', 'value': 0.3893809587448072} 68 | ] 69 | } 70 | ] 71 | } 72 | ] 73 | ``` 74 | 75 | ### classify_level 76 | 77 | ```python 78 | classify_level(words, level=1, lang_zh=False): 79 | ``` 80 | 81 | #### Introduction 82 | 83 | Classify which subjects these keywords belong to. 84 | 85 | #### Parameters 86 | 87 | ##### words 88 | 89 | A **list** of keywords. 90 | 91 | ##### level 92 | 93 | Classification level(1,2,3), for other numbers you will get a `[]`. 94 | 95 | [NSFC](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html) uses a three-level classification. Use graph theory as an example, 96 | 97 | ``` 98 | A01 mathematics 99 | - A0116 combinatorial mathematics 100 | - A011602 graph theory 101 | ``` 102 | 103 | ##### lang_zh 104 | 105 | Whether the return values are Chinese or not. 106 | 107 | #### Return Value 108 | 109 | A **list** of strings contains related subject names at the level 110 | 111 | #### An example 112 | 113 | ```python 114 | ai_nsfc = AIClassifier() 115 | words = ['search engine'] 116 | subject = ai_nsfc.classify_level(words, level=3) 117 | print(subject) 118 | ``` 119 | 120 | Return Value: 121 | 122 | ```python 123 | ['Text Retrieval, Mining And Information Extraction', 'Knowledge Discovery And Data Mining'] 124 | ``` 125 | 126 | ### classify 127 | 128 | Get the classification of the keywords and a subject tree 129 | 130 | #### Parameters 131 | 132 | ##### words 133 | 134 | A **list** of keywords. Accept both Chinese words and English words. 135 | 136 | #### Return value 137 | 138 | A dictionary contains four items: 139 | 140 | ``` 141 | 'level{x}'(x = 1, 2, 3): Related subjects of the words on level x. 142 | 'tree': Subject trees of the given words(a list of dictionary). 
143 | ``` 144 | 145 | #### An example 146 | 147 | ```python 148 | ai_nsfc = AIClassifier() 149 | words = ['search engine'] 150 | subject = ai_nsfc.classify(words) 151 | print(subject) 152 | ``` 153 | 154 | Return Value: 155 | 156 | ```python 157 | { 158 | 'level1': [ 159 | {'p': 1.0, 'name': 'Artificial Intelligence', 'name_zh': '人工智能'} 160 | ], 161 | 'level2': [ 162 | {'p': 0.6236383458601308,'name': 'Natural Language Processing', 'name_zh': '自然语言处理'}, 163 | {'p': 0.3763616541398693, 'name': 'Knowledge Representation And Processing', 'name_zh': '知识表示与处理'} 164 | ], 165 | 'level3': [ 166 | {'p': 0.6106190412551927, 'name': 'Text Retrieval, Mining And Information Extraction', 'name_zh': '文本检索、挖掘与信息抽取'}, 167 | {'p': 0.3893809587448072, 'name': 'Knowledge Discovery And Data Mining', 'name_zh': '知识发现与数据挖掘'} 168 | ], 169 | 'tree': [ 170 | { 171 | 'name': '人工智能', 172 | 'value': 1.0, 173 | 'children': [ 174 | { 175 | 'name': '自然语言处理', 176 | 'value': 0.6236383458601308, 177 | 'children': [ 178 | {'name': '文本检索、挖掘与信息抽取', 'value': 0.6106190412551927} 179 | ] 180 | }, 181 | { 182 | 'name': '知识表示与处理', 183 | 'value': 0.3763616541398693, 184 | 'children': [ 185 | {'name': '知识发现与数据挖掘', 'value': 0.3893809587448072} 186 | ] 187 | } 188 | ] 189 | } 190 | ] 191 | } 192 | ``` 193 | 194 | ## API 195 | 196 | ### https://innovaapi.aminer.cn/tools/v1/predict/nsfc/ai 197 | 198 | ![](https://img.shields.io/badge/http-post-blue.svg) 199 | 200 | An online version of the method "classify" 201 | 202 | ### Request body 203 | 204 | ##### words 205 | 206 | A **list** of key words. Accept both Chinese words and English words. 207 | 208 | #### Return value 209 | 210 | In the `Response` object, there will be three fields. 211 | 212 | ##### status 213 | 214 | `0`: Success 215 | 216 | `1`: There are some errors. 217 | 218 | ##### message 219 | 220 | `success`: Success 221 | 222 | If there are some errors, you will get the error information. 223 | 224 | ##### data 225 | 226 | The return value from the method. 227 | 228 | ### An example 229 | 230 | ```http 231 | POST /tools/v1/predict/nsfc/ai? 
POST /tools/v1/predict/nsfc/ai? HTTP/1.1
Host: innovaapi.aminer.cn
Content-Type: application/json
User-Agent: PostmanRuntime/7.13.0
Accept: */*
Cache-Control: no-cache
Postman-Token: 72d90554-ead1-4606-be9e-ce64a9b38391,354aaeaa-976a-406c-902b-d3d1e52389f7
Host: innovaapi.aminer.cn
accept-encoding: gzip, deflate
content-length: 49
Connection: keep-alive
cache-control: no-cache

{
    "words": [
        "search engine"
    ]
}
```

Return Message:

```json
{
    "status": 0,
    "message": "success",
    "data": {
        "level1": [
            {
                "p": 1.0,
                "name": "Artificial Intelligence",
                "name_zh": "人工智能"
            }
        ],
        "level2": [
            {
                "p": 0.6236383458601308,
                "name": "Natural Language Processing",
                "name_zh": "自然语言处理"
            },
            {
                "p": 0.3763616541398693,
                "name": "Knowledge Representation And Processing",
                "name_zh": "知识表示与处理"
            }
        ],
        "level3": [
            {
                "p": 0.6106190412551927,
                "name": "Text Retrieval, Mining And Information Extraction",
                "name_zh": "文本检索、挖掘与信息抽取"
            },
            {
                "p": 0.3893809587448072,
                "name": "Knowledge Discovery And Data Mining",
                "name_zh": "知识发现与数据挖掘"
            }
        ],
        "tree": [
            {
                "name": "人工智能",
                "value": 1.0,
                "children": [
                    {
                        "name": "自然语言处理",
                        "value": 0.6236383458601308,
                        "children": [
                            {
                                "name": "文本检索、挖掘与信息抽取",
                                "value": 0.6106190412551927
                            }
                        ]
                    },
                    {
                        "name": "知识表示与处理",
                        "value": 0.3763616541398693,
                        "children": [
                            {
                                "name": "知识发现与数据挖掘",
                                "value": 0.3893809587448072
                            }
                        ]
                    }
                ]
            }
        ]
    }
}
```

--------------------------------------------------------------------------------
/doc/NSFC_Subject_Classifier.md:
--------------------------------------------------------------------------------
# NSFC Subject Classifier

## Introduction

Classifier is a class which is used to classify publications according to their subjects. It depends on the classification of the [Natural Science Foundation of China (NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html).

## Method

### classify

```python
classify(pub_titles, level=0, ntop=5, lang_zh=False, translatation_func=youdao_translate)
```

#### Introduction

Use publications' titles to classify which subjects these publications belong to.

#### Parameters

##### pub_titles

A **list** of **strings**. The titles of the publications.

##### level

Classification level (1, 2 or 3); for any other number you will get all three levels.

[NSFC](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html) uses a three-level classification. Using graph theory as an example:

```
A01 mathematics
- A0116 combinatorial mathematics
- A011602 graph theory
```

##### ntop

The number of possible subjects you want to get.

##### lang_zh

Whether the titles are Chinese or not. Pass `True` if you are using Chinese publications.

##### translatation_func

In fact, the classifier can only work on **Chinese** words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.
It should be able to translate a list of **strings** in another language into Chinese.

By default, we provide a translator based on the [Youdao API](http://fanyi.youdao.com/). Do not call this translator too often, because it is a free service.

#### Return value

A **dictionary**:

```python
'level{x}' (x = 1, 2, 3):
    {
        'code': subject code,
        'name': subject name,
        'p': probability
    }
```

If any error occurs in the method, you will get `{}`.

#### An example

```python
nsfc = Classifier()
pub_titles = ['基于多通道卷积神经网络的中文微博情感分析']
subject = nsfc.classify(pub_titles)
```

`subject`:

```python
{
    'level1': [
        {'code': 'F02', 'name': '计算机科学', 'p': 0.9745969772338867},
        {'code': 'F01', 'name': '电子学与信息系统', 'p': 0.02385014481842518},
        {'code': 'B05', 'name': '分析化学', 'p': 0.0005464374553412199},
        {'code': 'F03', 'name': '自动化', 'p': 0.00039022043347358704},
        {'code': 'H18', 'name': '影像医学与生物医学工程', 'p': 0.0001973187318071723}
    ],
    'level2': [
        {'code': 'F0206', 'name': '自然语言理解与机器翻译', 'p': 0.8545559048652649},
        {'code': 'F0205', 'name': '计算机应用技术', 'p': 0.08089018613100052},
        {'code': 'F0305', 'name': '人工智能与知识工程', 'p': 0.023599255830049515},
        {'code': 'B0512', 'name': '化学计量学与化学信息学', 'p': 0.0228357}
    ],
    'level3': [
        {'code': 'F020601', 'name': '计算语言学', 'p': 0.9999170303344727},
        {'code': 'F020504', 'name': '生物信息计算', 'p': 4.625070505426265e-05},
        {'code': 'F020506', 'name': '人机界面技术', 'p': 2.3111495465855114e-05},
        {'code': 'F010403', 'name': '物联网', 'p': 2.2251791961025447e-05},
        {'code': 'F010303', 'name': '协作通信', 'p': 2.0015930203953758e-05}
    ]
}
```

## API

### https://innovaapi.aminer.cn/tools/v1/predict/nsfc

![](https://img.shields.io/badge/http-post-blue.svg)

An online version of the method `classify`

#### Request body

##### titles

A **list** of **strings**. The titles of the publications.

#### Return value

In the `Response` object, there will be three fields.

##### status

`0`: Success

`1`: There are some errors.

##### message

`success`: Success

If there are some errors, you will get the error information.

##### data

The return value from the method.

#### An example

```http
POST /tools/v1/predict/nsfc? 
HTTP/1.1 138 | Host: innovaapi.aminer.cn 139 | Content-Type: application/json 140 | User-Agent: PostmanRuntime/7.13.0 141 | Accept: */* 142 | Cache-Control: no-cache 143 | Postman-Token: 5f0fbe87-e333-40b1-b9c3-23f64c137c15,1927af8e-4a86-4319-8024-684d6b9e46f7 144 | Host: innovaapi.aminer.cn 145 | accept-encoding: gzip, deflate 146 | content-length: 100 147 | Connection: keep-alive 148 | cache-control: no-cache 149 | 150 | { 151 | "titles": [ 152 | "基于多通道卷积神经网络的中文微博情感分析" 153 | ] 154 | } 155 | ``` 156 | 157 | Return Message: 158 | 159 | ```json 160 | { 161 | "status": 0, 162 | "message": "success", 163 | "data": { 164 | "level1": [ 165 | { 166 | "code": "F02", 167 | "name": "计算机科学", 168 | "p": 0.9745969772338867 169 | }, 170 | { 171 | "code": "F01", 172 | "name": "电子学与信息系统", 173 | "p": 0.02385014481842518 174 | }, 175 | { 176 | "code": "B05", 177 | "name": "分析化学", 178 | "p": 0.0005464374553412199 179 | }, 180 | { 181 | "code": "F03", 182 | "name": "自动化", 183 | "p": 0.00039022043347358704 184 | }, 185 | { 186 | "code": "H18", 187 | "name": "影像医学与生物医学工程", 188 | "p": 0.0001973187318071723 189 | } 190 | ], 191 | "level2": [ 192 | { 193 | "code": "F0206", 194 | "name": "自然语言理解与机器翻译", 195 | "p": 0.8545559048652649 196 | }, 197 | { 198 | "code": "F0205", 199 | "name": "计算机应用技术", 200 | "p": 0.08089018613100052 201 | }, 202 | { 203 | "code": "F0305", 204 | "name": "人工智能与知识工程", 205 | "p": 0.023599255830049515 206 | }, 207 | { 208 | "code": "B0512", 209 | "name": "化学计量学与化学信息学", 210 | "p": 0.022835755720734596 211 | }, 212 | { 213 | "code": "F0104", 214 | "name": "通信网络", 215 | "p": 0.01253295037895441 216 | } 217 | ], 218 | "level3": [ 219 | { 220 | "code": "F020601", 221 | "name": "计算语言学", 222 | "p": 0.9999170303344727 223 | }, 224 | { 225 | "code": "F020504", 226 | "name": "生物信息计算", 227 | "p": 0.00004625070505426265 228 | }, 229 | { 230 | "code": "F020506", 231 | "name": "人机界面技术", 232 | "p": 0.000023111495465855114 233 | }, 234 | { 235 | "code": "F010403", 236 | "name": "物联网", 237 | "p": 0.000022251791961025447 238 | }, 239 | { 240 | "code": "F010303", 241 | "name": "协作通信", 242 | "p": 0.000020015930203953758 243 | } 244 | ] 245 | } 246 | } 247 | ``` 248 | 249 | ### https://innovaapi.aminer.cn/tools/v1/predict/nsfc/person 250 | 251 | ![](https://img.shields.io/badge/http-get-brightgreen.svg) 252 | 253 | Get a professor's research interests according to his publications' titles. 254 | 255 | #### Parameters 256 | 257 | ##### pid 258 | 259 | the professor's id in [AMiner](https://aminer.cn). 260 | 261 | For example, you want to know Qiang Yang's research interests. First, you should search Qiang Yang in [AMiner](https://aminer.cn), and get his page url https://www.aminer.cn/profile/qiang-yang/53f48041dabfae963d25910a. His id in [AMiner](https://aminer.cn) is the suffix of the url string `53f48041dabfae963d25910a`. 262 | 263 | #### Return value 264 | 265 | In the `Response` object, there will be three fields. 266 | 267 | ##### status 268 | 269 | `0`: Success 270 | 271 | `1`: There are some errors. 272 | 273 | ##### message 274 | 275 | `success`: Success 276 | 277 | If there are some errors, you will get the error information. 278 | 279 | ##### data 280 | 281 | The return value from the method. 
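A minimal sketch of calling this endpoint from Python with `requests`, using the AMiner id mentioned above:

```python
import requests

pid = '53f48041dabfae963d25910a'  # Qiang Yang's AMiner id, taken from his profile URL
url = 'https://innovaapi.aminer.cn/tools/v1/predict/nsfc/person'
body = requests.get(url, params={'pid': pid}).json()
if body['status'] == 0:
    print(body['data'])  # the predicted research-interest subjects
```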

#### An example

https://innovaapi.aminer.cn/tools/v1/predict/nsfc/person?pid=53f48041dabfae963d25910a

## Accuracy

| level | top1 | top5 |
| :---: | :----: | :----: |
| 1 | 0.5079 | 0.8331 |
| 2 | 0.3629 | 0.6668 |
| 3 | 0.3342 | 0.6317 |

--------------------------------------------------------------------------------
/doc/Paper_Ranker.md:
--------------------------------------------------------------------------------
# Paper Ranker

## Introduction

Some professors share the same name, which makes it difficult to decide whether a publication belongs to a particular professor. PaperRanker is a class which is used to predict how likely it is that a publication belongs to a given professor.

![](paper_ranker.png)

Our idea is to use a set of known-correct publications and their coauthor relationships to solve this problem. If a known coauthor relationship appears in an unsure publication, the probability that this publication belongs to that professor is much higher.

## Definition

```python
def __init__(self, use_clf=False)
```

Based on the idea that a professor's research interests won't change too much, we also try to use the [NSFC Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_Subject_Classifier.md) to help with the prediction. If you want to use it, set `use_clf=True`.

## Method

### label

```python
def label(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate)
```

#### Introduction

Use an iterative algorithm to predict how likely it is that each publication belongs to the professor.

#### Parameters

##### correct_pubs

A list of the professor's known publications. This is a list of dictionaries, and each dictionary should have the following fields.

| name | Introduction |
| :-----: | :-------------------------: |
| title | `string`, publication title |
| authors | A list of `string`, authors |
| year | `integer`, publication year |

For example:

```python
{
    "title": "Study of quantitative elastography with supersonic shear imaging in the diagnosis of breast tumours",
    "year": 2013,
    "authors": [
        "Zhili Wang",
        "Junlai Li",
        "Min Li",
        "Yan Huang",
        "WenBo Wan",
        "Jie Tang"
    ]
}
```

##### unsure_pubs

A list of unsure publications. The format is the same as for `correct_pubs`.

##### threshold

If the probability of a publication is smaller than this threshold, it won't be considered a correct publication.

##### trans

In fact, the classifier can only work on **Chinese** words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function. It should be able to translate a list of **strings** in another language into Chinese.

By default, we provide a translator based on the [Youdao API](http://fanyi.youdao.com/). Do not call this translator too often, because it is a free service.

#### Return value

`(a, b)`: two lists of unsure publications with their probabilities. The first list contains the publications with high probabilities, and the second those with low probabilities.
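Putting this together, a hypothetical call might look like the sketch below. The publication dictionaries follow the format above; the unsure publication is invented for illustration, and `use_clf=True` would additionally enable the subject classifier:

```python
ranker = PaperRanker()

correct_pubs = [{
    "title": "Study of quantitative elastography with supersonic shear imaging in the diagnosis of breast tumours",
    "year": 2013,
    "authors": ["Zhili Wang", "Junlai Li", "Min Li", "Yan Huang", "WenBo Wan", "Jie Tang"]
}]
unsure_pubs = [{
    "title": "Some publication with an ambiguous author name",  # hypothetical
    "year": 2015,
    "authors": ["Jie Tang", "Another Author"]
}]

high, low = ranker.label(correct_pubs, unsure_pubs, threshold=0.5)
print(high)  # unsure publications likely to belong to this professor
print(low)   # unsure publications likely to belong to someone else
```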

### ranking

```python
def ranking(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate)
```

#### Introduction

Predict how likely it is that each publication belongs to the professor. Unlike `label`, the algorithm runs only once instead of iterating.

#### Parameters

Same as `label`.

#### Return value

Similar to `label`.

## Test

The experiment is based on the publications of [Jie Tang (唐杰)](http://www.aminer.cn/profile/jie-tang/53f46a3edabfaee43ed05f08) and [Jie Tang (唐捷)](http://www.aminer.cn/profile/jie-tang/542edff0dabfae498ae3c756).

I use [Baidu translation](http://api.fanyi.baidu.com/api/trans/product/index) as the translator and set `threshold = 0.5`.

| Precision Rate | Recall Rate | F1 Score |
| :------------: | :---------: | :------: |
| 0.960 | 0.705 | 0.813 |
--------------------------------------------------------------------------------
/doc/paper_ranker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/doc/paper_ranker.png
--------------------------------------------------------------------------------
/model/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/model/README.md
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/src/__init__.py
--------------------------------------------------------------------------------
/src/aiclassifier.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''
Introduction:
    AIClassifier is a class which is used to classify AI subjects.
    It depends on the classification of the Natural Science Foundation of China (NSFC).
Usage:
    >>> ai_nsfc = AIClassifier()
    >>> words = ['search engine']
    >>> subject = ai_nsfc.classify(words)
'''
import os
from config import model_path
from sklearn.externals import joblib
from collections.abc import Iterable


data_path = os.path.join(model_path, 'nsfc')


class AIClassifier:

    def __init__(self, path=data_path):
        self._mat = []
        for level in range(3):
            file = os.path.join(path, 'ai_lev{}_w.pkl'.format(level))
            self._mat.append(joblib.load(file))

        file_id2name = os.path.join(path, 'id2name.pkl')
        file_id2father = os.path.join(path, 'id2father.pkl')
        self._id2name = joblib.load(file_id2name)
        self._id2father = joblib.load(file_id2father)

    def classify(self, words):
        '''
        Get the classification of the keywords and their subject tree
        :param words: A list of keywords. Accepts both English and Chinese words
        :return:
            A dictionary containing four items:
            'level{x}' (x = 1, 2, 3): Related subjects of the words on level x.
            'tree': Subject trees of the given words (a list of dictionaries).
41 | ''' 42 | distribution = self._get_all_level_distribution(words) 43 | ret = self._get_all_info(words, distribution) 44 | subject_tree = self.get_tree(words, distribution) 45 | ret['tree'] = subject_tree 46 | return ret 47 | 48 | def get_tree(self, words, _distribution=None): 49 | ''' 50 | Get a related subject tree using the key words 51 | :param words: A key words list. Accept both English words and Chinese words 52 | :param _distribution: A param designed to reuse codes. 53 | :return: Subject trees of the given words(a list of dictionary). 54 | ''' 55 | subject_tree = {} 56 | if _distribution is None: 57 | _distribution = self._get_all_level_distribution(words) 58 | for level in range(0, 3): 59 | for k, p in _distribution[level].items(): 60 | self._insert_subject2tree(k, p, subject_tree) 61 | return self._format_tree(subject_tree) 62 | 63 | def classify_level(self, words, level=1, lang_zh=False): 64 | ''' 65 | Get the most likely subject names at the given level according to some key words 66 | :param words: A key words list. Accept both English words and Chinese words 67 | :param level: classification level(1, 2 or 3) 68 | :param zh: Whether to use Chinese subject names 69 | :return: A list contains related subject names at the level 70 | ''' 71 | if level not in [1, 2, 3]: 72 | return [] 73 | main_subjects = self._get_all_level_distribution(words) 74 | if lang_zh: 75 | ret_iter = map(lambda x: self._get_zh_name(x), main_subjects[level - 1].keys()) 76 | else: 77 | ret_iter = map(lambda x: self._get_name(x), main_subjects[level - 1].keys()) 78 | return list(ret_iter) 79 | 80 | def _get_father_id(self, nsfc_id): 81 | return self._id2father.get(nsfc_id) 82 | 83 | def _get_name(self, nsfc_id): 84 | return self._id2name[nsfc_id][0] 85 | 86 | def _get_zh_name(self, nsfc_id): 87 | return self._id2name[nsfc_id][1] 88 | 89 | def _get_ancestors_list(self, nsfc_id): 90 | # get all of ancestors of a node on the subject tree 91 | ancestors = [] 92 | father_id = self._get_father_id(nsfc_id) 93 | if father_id: 94 | ancestors.append(self._get_zh_name(father_id)) 95 | # from top to bottom 96 | return self._get_ancestors_list(father_id) + ancestors 97 | else: 98 | return ancestors 99 | 100 | def _get_all_info(self, words, distribution=None): 101 | # Get the most likely subject names and their values at three levels 102 | ret = {} 103 | if distribution is None: 104 | distribution = self._get_all_level_distribution(words) 105 | for level in range(0, 3): 106 | level_name = 'level{}'.format(level + 1) 107 | ret[level_name] = [] 108 | for k, p in distribution[level].items(): 109 | ret[level_name].append({ 110 | 'p': p, 111 | 'name': self._get_name(k), 112 | 'name_zh': self._get_zh_name(k) 113 | }) 114 | return ret 115 | 116 | def _insert_subject2tree(self, nsfc_id, prob, tree): 117 | # insert a new node to a subject tree 118 | ancestors = self._get_ancestors_list(nsfc_id) 119 | point = tree 120 | node_name = self._get_zh_name(nsfc_id) 121 | if ancestors: 122 | for ancestor in ancestors: 123 | point = point.setdefault(ancestor, {'value': None}) 124 | point = point.setdefault('child', {}) 125 | point[node_name] = {'name': node_name, 'value': prob} 126 | 127 | def _get_all_level_distribution(self, words): 128 | restrict = None 129 | ret = [] 130 | for i in range(0, 3): 131 | distri = self._get_distribution(words, i, restrict) 132 | main_subjects = self._get_main_subject(distri) 133 | ret.append(main_subjects) 134 | # ensure there is no repetition 135 | restrict = main_subjects.keys() 136 | return ret 137 | 
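    # Note: the three levels are classified in cascade. The subjects selected
    # at level i become the `restrict` set for level i + 1 (see
    # _get_all_level_distribution above), so only children of already-selected
    # subjects are kept and the resulting tree is consistent from root to leaf.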
138 | def _get_distribution(self, words, level, restrict=None, ban=None): 139 | # get the weight distribution at the given level 140 | rs = {} 141 | if words and isinstance(words, Iterable): 142 | for w in words: 143 | data = self._mat[level].get(w.lower(), {}) 144 | for sub_id, v in data.items(): 145 | if ban is None or sub_id not in ban: 146 | if restrict is None or self._id2father.get(sub_id) in restrict: 147 | rs.setdefault(sub_id, 0) 148 | rs[sub_id] += v 149 | self._norm(rs) 150 | return rs 151 | 152 | def _norm(self, dict_data): 153 | # normalize the value of a dictionary 154 | s = sum(dict_data.values()) 155 | for k, v in dict_data.items(): 156 | dict_data[k] = v / s 157 | 158 | def _format_tree(self, tree): 159 | ''' 160 | In order to insert a node to a tree, we use subject names as keys. 161 | This function can format a subject tree(dictionary) by using the nodes itself as keys. 162 | ''' 163 | new_tree = [] 164 | for k, v in tree.items(): 165 | child = v.get('child') 166 | if child: 167 | new_child = self._format_tree(child) 168 | new_tree.append({'name': k, 'value': v['value'], 'children': new_child}) 169 | else: 170 | new_tree.append({'name': k, 'value': v['value']}) 171 | return new_tree 172 | 173 | def _get_main_subject(self, distribution, thresh_prob=0.6, min_prob=0.1, dec_drop=10): 174 | # select the most possible subjects 175 | dis_len = len(distribution) 176 | if dis_len == 0: 177 | return {} 178 | sorted_distribution = sorted(distribution.items(), key=lambda x: -x[1]) 179 | # after sorting, the dict becomes a list of pairs, item[0]: nsfc id, item[1]: its value 180 | ret = {sorted_distribution[0][0]: sorted_distribution[0][1]} 181 | sum_value = sorted_distribution[0][1] 182 | for i in range(1, dis_len): 183 | prev_value = sorted_distribution[i-1][1] 184 | now_value = sorted_distribution[i][1] 185 | if now_value < min_prob or (prev_value - now_value) / now_value > dec_drop: 186 | break 187 | ret[sorted_distribution[i][0]] = sorted_distribution[i][1] 188 | sum_value += now_value 189 | if sum_value > thresh_prob: 190 | break 191 | self._norm(ret) 192 | return ret 193 | 194 | if __name__ == '__main__': 195 | words = [ 196 | 'Controlled Experiment', 197 | 'Fit Tables.', 198 | 'Executable Test Case', 199 | 'Source Code', 200 | 'Static Analysis', 201 | 'Comprehension Task', 202 | 'Legacy System', 203 | 'Web Applications', 204 | 'Genetic Algorithm', 205 | 'Test Case', 206 | 'Security Testing', 207 | 'Empirical Study', 208 | 'Acceptance Testing', 209 | 'Data Model', 210 | 'Fit Table', 211 | 'Case Study', 212 | 'Crosscutting Concern', 213 | 'Web Application', 214 | 'Empirical Studies', 215 | 'Aspect Oriented Programming' 216 | ] 217 | aic = AIClassifier() 218 | print(aic.classify(words)) -------------------------------------------------------------------------------- /src/classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Introduction: 4 | Classifier is a class which is used to classify publications according to their subjects. It depends on the classification of [Natural Science Foundation of China(NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html). 
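    Non-Chinese titles are first translated into Chinese before
    classification (see the translatation_func parameter of classify).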
5 | Usage: 6 | >>> nsfc = Classifier() 7 | >>> pub_titles = ['Annotating gene sets by mining large literature collections with protein networks.'] 8 | >>> subject = nsfc.classify(pub_titles) 9 | ''' 10 | 11 | import os 12 | import fastText 13 | import re 14 | import jieba 15 | from config import model_path 16 | from utils.translator import youdao_translate 17 | 18 | data_path = os.path.join(model_path, 'nsfc') 19 | 20 | 21 | class Classifier: 22 | 23 | def __init__(self, path=data_path): 24 | self._clf = [ 25 | fastText.load_model(os.path.join(path, 'clf0.bin')), 26 | fastText.load_model(os.path.join(path, 'clf1.bin')), 27 | fastText.load_model(os.path.join(path, 'clf2.bin')), 28 | ] 29 | self._zh_chars = re.compile(r'[^\u4e00-\u9fff]+') 30 | self._id2name = dict() 31 | with open(os.path.join(data_path, 'nsfc_subject.csv'), encoding='utf-8') as f: 32 | for line in f: 33 | _id, _name = line[:-1].split(',') # encode csv 34 | self._id2name[_id] = _name 35 | 36 | def _get_name(self, code): 37 | # Get the name from the given subject code 38 | return self._id2name[code] 39 | 40 | def _get_code(self, label): 41 | ''' 42 | In the model, we use '__label__' + NSFC subject code as labels. 43 | This function can extract the subject code from a label. 44 | ''' 45 | return label[9:] 46 | 47 | def _tokenize(self, pubs, lang_zh=False, translatation_func=youdao_translate): 48 | # Convert a sequence of characters into a sequence of tokens 49 | if not lang_zh: 50 | text_zh = translatation_func(pubs) 51 | else: 52 | text_zh = pubs 53 | words = [] 54 | for s in text_zh: 55 | # delete all characters which are not Chinese 56 | all_zh = self._zh_chars.sub('', s) 57 | words.extend(jieba.lcut(all_zh)) 58 | return words 59 | 60 | def classify(self, pub_titles, level=0, ntop=5, lang_zh=False, translatation_func=youdao_translate): 61 | ''' 62 | Use publications' titles to classify which subjects these publications belong to. 63 | :param pub_titles: A list of publication titles 64 | :param level: Classification level(1,2,3), for other numbers you will get all of levels 65 | :param ntop: How many subjects in each level does the classifier select 66 | :param lang_zh: Whether the titles are Chinese or not. For True, it means you are using Chinese publications. 67 | :param translation_func: In fact, the classifier can only work on Chinese words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function. It should be able to translate a list of strings in another language to Chinese. 
68 | :return: A dictionary: 69 | 'level{x}'(x = 1, 2, 3)': 70 | { 71 | 'code': subject code 72 | 'name': subject name 73 | 'p': probability 74 | } 75 | ''' 76 | ret = {} 77 | words = self._tokenize(pub_titles, lang_zh=lang_zh, translatation_func=translatation_func) 78 | if words == []: 79 | return ret 80 | text = ' '.join(words) 81 | for i in range(0, 3): 82 | if i + 1 == level or level not in [1, 2, 3]: 83 | # level number equals its index plus one 84 | level_name = 'level{}'.format(i+1) 85 | ret[level_name] = self._clf[i].predict(text, ntop) 86 | # format 87 | for key, value in ret.items(): 88 | new_value = [] 89 | for label, prob in zip(value[0],value[1]): # combine each label and its prob into a pair 90 | subject_code = self._get_code(label) 91 | new_value.append({ 92 | 'code': subject_code, 93 | 'name': self._get_name(subject_code), 94 | 'p': prob 95 | }) 96 | ret[key] = new_value 97 | return ret -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | # The directory of models 3 | src_path = os.path.dirname(os.path.abspath(__file__)) 4 | base_path = os.path.dirname(src_path) 5 | model_path = os.path.join(base_path, 'model') 6 | ''' 7 | Please put your api key here 8 | You can know how to get it in https://console.faceplusplus.com/documents/7079083. 9 | api_key = { 10 | 'api_key': '', 11 | 'api_secret': '' 12 | } 13 | ''' 14 | api_key = None 15 | -------------------------------------------------------------------------------- /src/expertrec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Introduction: 3 | ExpertRec is a class which is used to recommend some experts in the given text's field. 4 | Usage: 5 | >>> e = ExpertRec() 6 | >>> print(e.search('natural language processing')) 7 | ''' 8 | import os 9 | import fastText 10 | import joblib 11 | import json 12 | import numpy as np 13 | import heapq 14 | from utils.acautomaton import ACAutomaton 15 | from config import model_path 16 | 17 | data_path = os.path.join(model_path, 'expert') 18 | 19 | 20 | class ExpertRec: 21 | 22 | def __init__(self): 23 | self._model = fastText.load_model(os.path.join(data_path, 'model_aminer')) 24 | self._words = self._model.get_labels() 25 | self._index_mat = joblib.load(os.path.join(data_path, 'index_mat.pkl')) 26 | self._id2person = json.load(open(os.path.join(data_path, 'pid_list.json'), encoding='utf-8')) 27 | self.ac = ACAutomaton(self._words) 28 | self.base_url = 'http://www.aminer.cn/profile/{}' 29 | 30 | def doc2vec(self, text): 31 | # Convert text to vector. 32 | words = self.ac.search(text.lower().replace(' ', '_')) 33 | s = ' '.join([w.replace(' ', '_') for w in words]) 34 | vec = self._model.get_sentence_vector(s) 35 | return vec 36 | 37 | def search(self, text, num=20): 38 | ''' 39 | Recommend some experts in the given text's field. 40 | :param text: The text. 41 | :param num: The number of the recommended experts. 42 | :return: A list of dictionaries: 43 | { 44 | 'id': The expert's ID in AMiner(http://www.aminer.cn/), 45 | 'url': The expert's AMiner homepage. 46 | 'L2 distance': Similarity. The smaller the L2 distance is , the more likely the expert is interested in the given text's field. 
47 | } 48 | ''' 49 | vec = self.doc2vec(text) 50 | dist_mat = self._index_mat - vec.T 51 | dist = np.linalg.norm(dist_mat, axis=1) 52 | ret = [{ 53 | 'id': self._id2person[i], 54 | 'url': self.base_url.format(self._id2person[i]), 55 | 'L2 distance': d 56 | } for i, d in enumerate(dist)] 57 | return heapq.nsmallest(num, ret, lambda x: x['L2 distance']) 58 | 59 | 60 | if __name__ == '__main__': 61 | e = ExpertRec() 62 | print(e.search('natural language processing')) -------------------------------------------------------------------------------- /src/gender.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Introduction: 3 | Gender is a class which is used to predict a person's gender. 4 | Usage: 5 | >>> g = Gender() 6 | >>> print(g.predict(name='Jie Tang', org='Tsinghua University', image_url='http://www.cs.tsinghua.edu.cn/publish/cs/4616/20110330101939787483549/20190321114128398502759.jpg')) 7 | ''' 8 | import os 9 | import json 10 | import re 11 | import pickle 12 | import requests 13 | from urllib.parse import quote_plus 14 | from utils.crawler import baidu_parse, google_parse, getHTMLText 15 | from config import model_path, api_key 16 | 17 | data_path = os.path.join(model_path, 'gender') 18 | 19 | 20 | class Gender: 21 | 22 | _face_url = 'https://api-us.faceplusplus.com/facepp/v3/detect' 23 | 24 | def __init__(self): 25 | self._name_model = json.load(open(os.path.join(data_path, 'model_name.json'))) 26 | self._search_model = pickle.load(open(os.path.join(data_path, 'model_page.pk'), 'rb'), encoding='latin1') 27 | 28 | @staticmethod 29 | def get_firstname(name): 30 | # get first name from a full name 31 | try: 32 | name = name.lower() 33 | return name.split(' ')[0] 34 | except Exception: 35 | return '' 36 | 37 | @staticmethod 38 | def get_words(content): 39 | # get words from an article 40 | r = re.compile(r'[a-zA-Z]+|\.\.\.') 41 | words = re.findall(r, content) 42 | return [str(word.lower()) for word in words] 43 | 44 | def name_score(self, name): 45 | ''' 46 | Predict a person's gender according to his or her name. 47 | :param name: The person's name 48 | :return: A dictionary: 49 | { 50 | 'male': probability that the person is male 51 | 'female': probability that the person is female 52 | } 53 | ''' 54 | firstname = self.get_firstname(name) 55 | if firstname in self._name_model.keys(): 56 | name_gender = self._name_model[firstname] 57 | return { 58 | 'male': 1 if name_gender == 'male' else 0, 59 | 'female': 1 if name_gender == 'female' else 0 60 | } 61 | else: 62 | return { 63 | 'male': 0.5, 64 | 'female': 0.5 65 | } 66 | 67 | def search_score(self, name, org, source='google'): 68 | ''' 69 | Predict a person's gender using search engine. 
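        The search engine is queried with '<name> <org> his OR her', and the
        frequencies of gender-indicative words in the result snippets are used
        as features (see _get_feature below).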
67 |     def search_score(self, name, org, source='google'):
68 |         '''
69 |         Predict a person's gender using a search engine.
70 |         :param name: The person's name
71 |         :param org: The person's organization
72 |         :param source: Search engine, baidu or google
73 |         :return: A dictionary:
74 |             {
75 |                 'male': Probability that the person is male,
76 |                 'female': Probability that the person is female
77 |             }
78 |         '''
79 |         query = quote_plus('{} {} his OR her'.format(name, org))
80 |         if source == 'baidu':
81 |             url = 'https://www.baidu.com/s?wd={}&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en&rn=100'.format(query)
82 |         elif source == 'google':
83 |             url = 'https://www.google.com.hk/search?q={}&hl=en'.format(query)
84 |         else:
85 |             return {
86 |                 'male': 0.5,
87 |                 'female': 0.5
88 |             }
89 |         html = getHTMLText(url)
90 |         if source == 'baidu':
91 |             page_info = baidu_parse(html)
92 |         else:
93 |             page_info = google_parse(html)
94 |         if not page_info:
95 |             return {
96 |                 'male': 0.5,
97 |                 'female': 0.5
98 |             }
99 |         featureHis = self._get_feature('his', page_info, name)
100 |         featureHer = self._get_feature('her', page_info, name)
101 |         numSnippets = max(len(page_info), 1)
102 |         tottf = max(float(featureHis['tf'] + featureHer['tf']), 1.0)
103 |         feature = [
104 |             featureHis['tf']/tottf, featureHer['tf']/tottf,  # term-frequency shares of 'his' vs. 'her'
105 |             featureHis['df']/numSnippets, featureHer['df']/numSnippets,  # document frequencies of 'his'/'her' over all snippets
106 |             int(featureHis['isNameInTitle']), int(featureHer['isNameInTitle']),
107 |             int(featureHis['isInFirstSnippt']), int(featureHer['isInFirstSnippt'])
108 |         ]
109 |         # print(feature)
110 |         mproba = self._search_model.predict_proba([feature])[0][1]
111 |         return {
112 |             'male': mproba,
113 |             'female': 1 - mproba
114 |         }
115 | 
116 |     def face_score(self, image_url=None, image_file=None):
117 |         '''
118 |         Predict a person's gender from his or her photo.
119 |         :param image_url: The photo's url
120 |         :param image_file: The photo's local path
121 |         :return: A dictionary:
122 |             {
123 |                 'male': Probability that the person is male,
124 |                 'female': Probability that the person is female
125 |             }
126 |         '''
127 |         try:
128 |             data = {
129 |                 'api_key': api_key['api_key'],
130 |                 'api_secret': api_key['api_secret'],
131 |                 'return_landmark': '0',
132 |                 'return_attributes': 'gender'
133 |             }
134 |             if image_url is not None:
135 |                 data['image_url'] = image_url
136 |                 r = requests.post(Gender._face_url, data=data)
137 |             elif image_file is not None:
138 |                 files = {'image_file': open(image_file, 'rb')}
139 |                 r = requests.post(Gender._face_url, data=data, files=files)
140 |             else:
141 |                 return {
142 |                     'male': 0.5,
143 |                     'female': 0.5
144 |                 }
145 |             # print(r.json())
146 |             rdict = r.json()
147 |             f = rdict.get('faces', [])[0]
148 |             gender = f.get('attributes', {}).get('gender', {}).get('value')
149 |             return {
150 |                 'male': 1 if gender == 'Male' else 0,
151 |                 'female': 1 if gender == 'Female' else 0
152 |             }
153 |         except Exception as ex:
154 |             print(ex)
155 |             return {
156 |                 'male': 0.5,
157 |                 'female': 0.5
158 |             }
159 | 
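    # How predict() below combines the three scores (a worked example): sources
    # are averaged with weights {'name': 1, 'search': 1, 'face': 1.1}, and any
    # source that returns the neutral 0.5 is skipped. If the name model says
    # male=1.0, the search model says male=0.8 and no photo is given (face
    # stays at 0.5), the combined estimate is
    # (1.0 * 1 + 0.8 * 1) / (1 + 1) = 0.9 male.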
160 |     def predict(self, name, org, source='google', image_url=None, image_file=None):
161 |         '''
162 |         Predict a person's gender.
163 |         :param name: The person's name
164 |         :param org: The person's organization
165 |         :param source: Search engine, baidu or google
166 |         :param image_url: The photo's url
167 |         :param image_file: The photo's local path
168 |         :return: A dictionary:
169 |             {
170 |                 'male': Probability that the person is male,
171 |                 'female': Probability that the person is female,
172 |                 'name': Probabilities from the person's name,
173 |                 'search': Probabilities from the search engine,
174 |                 'face': Probabilities from the person's photo
175 |             }
176 |         '''
177 |         ret = {}
178 |         weight = {
179 |             'name': 1,
180 |             'search': 1,
181 |             'face': 1.1
182 |         }
183 |         ret['name'] = self.name_score(name)
184 |         ret['search'] = self.search_score(name, org, source=source)
185 |         ret['face'] = self.face_score(image_url, image_file)
186 |         sum_p = 0
187 |         male_v = 0
188 |         for src, data in ret.items():  # 'src' avoids shadowing the 'name' parameter
189 |             if data['male'] != 0.5:
190 |                 male_v += data['male'] * weight[src]
191 |                 sum_p += weight[src]
192 |         if sum_p > 0:
193 |             male_p = male_v / sum_p
194 |         else:
195 |             male_p = 0.5
196 |         ret['male'] = round(male_p, 2)
197 |         ret['female'] = round(1 - male_p, 2)
198 |         return ret
199 | 
200 |     def _get_feature(self, feature_name, page_info, name):
201 |         # Extract pronoun-based gender features from the search result page.
202 |         feature = {
203 |             'tf': 0,
204 |             'df': 0,
205 |             'isNameInTitle': False,
206 |             'isInFirstSnippt': False
207 |         }
208 |         words_name = self.get_words(name)
209 |         top3Snippets = []
210 |         for pos, snippet in enumerate(page_info):
211 |             words_title = self.get_words(snippet['title'])
212 |             words_content = self.get_words(snippet['content'])
213 |             if pos < 3:
214 |                 top3Snippets.extend(words_content)
215 |             num = words_content.count(feature_name)
216 |             if feature['isNameInTitle'] is False:
217 |                 if num > 0 and words_name[0] in words_title:
218 |                     feature['isNameInTitle'] = True
219 |             feature['tf'] += num
220 |             if num > 0:
221 |                 feature['df'] += 1
222 |         if feature_name in top3Snippets:
223 |             feature['isInFirstSnippt'] = True
224 |         return feature
--------------------------------------------------------------------------------
/src/jobhopping.py:
--------------------------------------------------------------------------------
1 | '''
2 | Introduction:
3 |     JobHopping is a class which is used to predict where a scholar may hop to.
4 | Usage:
5 |     >>> j = JobHopping()
6 |     >>> print(j.predict(['tsinghua university']))
7 | '''
8 | import pickle
9 | import os
10 | import torch
11 | import torch.nn as nn
12 | import heapq
13 | import numpy as np
14 | from config import model_path
15 | import torch.nn.functional as F
16 | 
17 | os.environ["CUDA_VISIBLE_DEVICES"] = '0'
18 | data_path = os.path.join(model_path, 'jobhopping')
19 | 
20 | 
21 | class GRUfn(nn.Module):
22 | 
23 |     def __init__(self, input_size, hidden_size, output_size):
24 |         super(GRUfn, self).__init__()
25 |         self.hidden_size = hidden_size
26 |         self.input_size = input_size
27 |         self.sig = nn.Sigmoid()
28 |         self.cr = nn.GRU(input_size=input_size, hidden_size=hidden_size)
29 |         self.fn = nn.Linear(hidden_size, output_size)
30 |         self.fn2 = nn.Linear(hidden_size, output_size)
31 | 
32 |     def forward(self, x, y=None, batch=256):
33 |         if y is not None:
34 |             x, y = self.cr(x, y)
35 |         else:
36 |             x, y = self.cr(x)
37 |         x = torch.nn.utils.rnn.pad_packed_sequence(x)
38 |         r = torch.transpose(x[0], 0, 1)
39 |         y = y.view(batch, self.hidden_size)
40 |         ind = x[1].view(batch, 1, 1)
41 |         ind = ind - 1
42 |         ind = ind.expand(-1, -1, self.hidden_size)
43 |         t = torch.gather(r, 1, ind)
44 |         t = t.view(batch, self.hidden_size)
45 |         t = self.fn(t)
46 |         y = self.fn2(y)
47 |         t = t + y
48 |         t = self.sig(t)
49 |         return t
50 | 
51 | 
52 | class JobHopping:
53 | 
54 |     def __init__(self):
55 |         self._id2name = {}
56 |         self._name2id = {}
57 |         self._model_data = torch.load(os.path.join(data_path, 'model'))
58 |         self._affi = self._model_data['affi_tensor']
59 | 
60 |         with open(os.path.join(data_path, 'orgID2orgname'), 'rb') as file:
61 |             _data = pickle.load(file)
62 |             for i, v in enumerate(_data):
63 |                 self._id2name[i] = v.split('+')[0]
64 |                 self._name2id.setdefault(v.split('+')[0], i)
65 | 
66 |         self._INPUT_DIM = 128
67 |         self._OUTPUT_DIM = len(self._id2name.keys())
68 |         self._model = GRUfn(self._INPUT_DIM, 512, self._OUTPUT_DIM)
69 |         self._model.load_state_dict(self._model_data['state_dict'])
70 | 
71 |     def predict(self, name_squence, ntop=3):
72 |         '''
73 |         Get a scholar's possible future affiliations according to
74 |         the names of his or her past affiliations.
75 |         :param name_squence: A list of the scholar's affiliation names, in chronological order
76 |         :param ntop: How many possible affiliations the method returns
77 |         :return: A list of dictionaries:
78 |             {
79 |                 'name': the most likely future affiliation's name,
80 |                 'p': the probability
81 |             }
82 |         '''
83 | 
84 |         name_squence = [x.lower() for x in name_squence]
85 |         name2id_squence = [self._name2id[name] for name in name_squence if name in self._name2id.keys()]
86 |         # if len(name_squence) != len(name2id_squence):
87 |         #     return None
88 |         temp_squence = name2id_squence
89 |         name2id_squence = []
90 |         if len(temp_squence) != 0:
91 |             name2id_squence.append(temp_squence[0])
92 |             [name2id_squence.append(term) for index, term in enumerate(temp_squence) if index != 0 and term != temp_squence[index - 1]]  # drop consecutive duplicates
93 |         else:
94 |             return None
95 |         # remove repeated loops (back-and-forth moves) from the id sequence
96 |         name2id_squence = self._delete_ring(name2id_squence)
97 |         zb = self._id2PackedSequence(name2id_squence)
98 |         fout = self._model(zb, batch=1)
99 |         # softmax_fout = F.softmax(fout, 1)
100 |         # ans = heapq.nlargest(ntop, enumerate(softmax_fout.data.numpy()[0]), key=lambda x: x[1])
101 |         ans = heapq.nlargest(ntop, enumerate(fout.data.numpy()[0]), key=lambda x: x[1])
102 |         ret = []
103 |         for idx, p in ans:
104 |             ret.append({
105 |                 'name': self._id2name[idx],
106 |                 'p': p,
107 |             })
108 |         self._softmax(ret)
109 |         return ret
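    # A worked example of the loop removal used above (illustrative ids): the
    # sequence [3, 7, 3, 7, 5] contains the repeated substring [3, 7], so
    # _delete_ring collapses it to [3, 7, 5]; one occurrence of the repeat is
    # kept, and back-and-forth hops no longer dominate the GRU input.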
110 |     def _delete_ring(self, id_squence):
111 |         clear_squence = id_squence
112 |         # repeatedly collapse the longest repeated substring of ids
113 |         while True:
114 |             res = self._getNumofCommonSubstr(clear_squence, clear_squence)
115 |             if res[1] < 2:
116 |                 break
117 |             a = "_".join([str(ss) for ss in res[0]])
118 |             b = "_".join([str(ss) for ss in clear_squence])
119 |             temp = b
120 |             times = 1000
121 |             while times > 1:
122 |                 if b.rfind(a) != -1:
123 |                     temp = b
124 |                     b = self._rreplace(b, a, "_", 1)
125 |                     times -= 1
126 |                 else:
127 |                     break
128 |             clear_squence = [int(term) for term in temp.split("_") if term != ""]
129 |             # id_squence = [int(s) for s in clear_squence]
130 |             # clear_squence = id_squence
131 |         return clear_squence
132 | 
133 |     def _getNumofCommonSubstr(self, str1, str2):  # longest repeated substring, via dynamic programming
134 |         lstr1 = len(str1)
135 |         lstr2 = len(str2)
136 |         record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # one extra row and column
137 |         maxNum = 0  # length of the longest match
138 |         p = 0  # end position (exclusive) of the longest match
139 | 
140 |         for i in range(lstr1):
141 |             for j in range(lstr2):
142 |                 if str1[i] == str2[j] and abs(i - j) > maxNum:
143 |                     # equal elements extend the current match
144 |                     record[i + 1][j + 1] = record[i][j] + 1
145 |                     if record[i + 1][j + 1] > maxNum:
146 |                         # a new longest match
147 |                         maxNum = record[i + 1][j + 1]
148 |                         # remember where it ends
149 |                         p = i + 1
150 |         # return p - maxNum, p, maxNum
151 |         return str1[p - maxNum:p], maxNum
152 | 
153 |     def _rreplace(self, st, old, new, *max):  # replace occurrences of `old` from the right, at most max[0] times
154 |         count = len(st)
155 |         if max and str(max[0]).isdigit():
156 |             count = max[0]
157 |         return new.join(st.rsplit(old, count))
158 | 
159 |     def _id2PackedSequence(self, affi_id):
160 |         # The input shape can be (T, B, *): T is the longest sequence length, B is the batch size, and * is any number of trailing dimensions (possibly none). With batch_first=True, the expected input shape is (B, T, *) instead.
161 |         ret = torch.zeros(1, len(affi_id), self._INPUT_DIM)
162 |         indices = torch.tensor(affi_id, device='cpu', dtype=torch.long)
163 |         ret[0] = torch.index_select(self._affi, 0, indices)
164 |         return torch.nn.utils.rnn.pack_padded_sequence(ret, [len(affi_id)], batch_first=True)
165 | 
166 |     def _softmax(self, affis):
167 |         # Softmax is a generalization of the logistic function that "squashes" (maps) a vector of arbitrary real values to a vector of real values in the range (0, 1) that add up to 1.
168 |         s = sum(map(lambda x: np.exp(x['p']), affis))
169 |         for item in affis:
170 |             item['p'] = round(np.exp(item['p'])/s, 2)
--------------------------------------------------------------------------------
/src/paperranker.py:
--------------------------------------------------------------------------------
1 | '''
2 | Introduction:
3 |     PaperRanker is a class which is used to estimate the probability that a publication belongs to a given professor.
4 | Usage:
5 |     >>> ret, res = PaperRanker().label(a, b, threshold=0.5)
6 | '''
7 | from classifier import Classifier
8 | from Levenshtein import jaro_winkler
9 | from utils.translator import youdao_translate
10 | import copy
11 | import time
12 | 
13 | 
14 | class PaperRanker:
15 | 
16 |     def __init__(self, use_clf=False):
17 |         self.clf = Classifier()
18 |         self.use_clf = use_clf
19 |         self.weight = {
20 |             'coauthor_score': 0.7 if use_clf else 0.9,
21 |             'pubyear_score': 0.1,
22 |         }
23 |         if use_clf:
24 |             self.weight['field_score'] = 0.2
25 |         print(self.weight)
26 | 
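    # The final score computed in ranking() below is a weighted sum of the
    # component scores configured above. For example, with use_clf=True:
    #     score = 0.7 * coauthor_score + 0.1 * pubyear_score + 0.2 * field_score
    # and with use_clf=False:
    #     score = 0.9 * coauthor_score + 0.1 * pubyear_score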
27 |     def ranking(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate):
28 |         '''
29 |         Estimate the probability that each unsure publication belongs to the professor.
30 |         :param correct_pubs: A list of this professor's confirmed publications.
31 |         :param unsure_pubs: A list of unsure publications.
32 |         :param threshold: If a publication's probability is smaller than this threshold, it won't be considered a correct publication.
33 |         :param trans: The underlying classifier only works on Chinese words, because of the classification standard and the training data. To handle publications in other languages, provide a translation function that can translate a list of strings into Chinese.
34 |         :return: (a, b), two lists of unsure publications with their probabilities. The first holds the high-probability publications, the second the low-probability ones.
35 |         '''
36 |         ret = copy.deepcopy(unsure_pubs)
37 |         ret = self.coauthor_score(correct_pubs, ret)
38 |         if self.use_clf:
39 |             ret = self.field_score(correct_pubs, ret, trans=trans)
40 |         ret = self.pubyear_score(ret)
41 |         for pub in ret:
42 |             pub['score'] = 0
43 |             for name, weight in self.weight.items():
44 |                 pub['score'] += pub[name] * weight
45 |         res = [pub for pub in ret if pub['score'] < threshold]
46 |         ret = [pub for pub in ret if pub['score'] >= threshold]
47 |         ret.sort(key=lambda x: x['score'], reverse=True)
48 |         res.sort(key=lambda x: x['score'], reverse=True)
49 |         return ret, res
50 | 
51 |     def label(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate):
52 |         '''
53 |         Use an iterative algorithm to estimate the probability that each unsure publication belongs to the professor.
54 |         :param correct_pubs: A list of this professor's confirmed publications.
55 |         :param unsure_pubs: A list of unsure publications.
56 |         :param threshold: If a publication's probability is smaller than this threshold, it won't be considered a correct publication.
57 |         :param trans: The underlying classifier only works on Chinese words, because of the classification standard and the training data. To handle publications in other languages, provide a translation function that can translate a list of strings into Chinese.
58 |         :return: (a, b), two lists of unsure publications with their probabilities. The first holds the high-probability publications, the second the low-probability ones.
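        Example (hypothetical data; each publication is a dict with at least
        'title', 'authors' and 'year' keys, as consumed by the scorers below):
        >>> pr = PaperRanker(use_clf=False)
        >>> confirmed, rejected = pr.label(known_pubs, candidate_pubs, threshold=0.5)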
59 |         '''
60 |         co = []
61 |         uns = copy.deepcopy(unsure_pubs)
62 |         cnt = 1
63 |         while True:
64 |             print('round {}'.format(cnt))
65 |             cnt += 1
66 |             ret, res = self.ranking(co + correct_pubs, uns, threshold, trans=trans)
67 |             co = co + ret
68 |             uns = res
69 |             if len(ret) == 0:
70 |                 break
71 |             time.sleep(4)
72 |         return co, uns
73 | 
74 |     def coauthor_score(self, correct_pubs, unsure_pubs):
75 |         authors = set()
76 |         for pub in correct_pubs:
77 |             authors = authors.union(set(pub['authors']))
78 |         for pub in unsure_pubs:
79 |             num = 0
80 |             for name_b in pub['authors']:
81 |                 score = 0
82 |                 for name_a in authors:
83 |                     score = max(score, self.name_match(name_a, name_b))
84 |                 num += score
85 |             if len(pub['authors']) == 0:
86 |                 pub['coauthor_score'] = 0
87 |             else:
88 |                 pub['coauthor_score'] = min(1.0, max(num/len(pub['authors']), num*0.12))  # num*0.12 rewards several strong co-author matches even on long author lists
89 |         return unsure_pubs
90 | 
91 |     def field_score(self, correct_pubs, unsure_pubs, trans=youdao_translate):
92 |         titles = list(map(lambda x: x['title'], correct_pubs))
93 |         distribution = self.clf.classify(pub_titles=titles, level=1, ntop=5, translatation_func=trans)
94 |         codes = set(map(lambda x: x['code'], distribution['level1']))
95 |         cnt = 0
96 |         for pub in unsure_pubs:
97 |             cnt += 1
98 |             now_dist = self.clf.classify(pub_titles=[pub['title']], level=1, ntop=5, translatation_func=trans)
99 |             if now_dist:
100 |                 intersect = set(map(lambda x: x['code'], now_dist['level1'])) & codes
101 |             else:
102 |                 intersect = []
103 |             pub['field_score'] = len(intersect)/5
104 |         return unsure_pubs
105 | 
106 |     def pubyear_score(self, unsure_pubs):
107 |         for pub in unsure_pubs:
108 |             if pub['year']:
109 |                 year = int(pub['year'])
110 |                 pub['pubyear_score'] = min(1.0, (year - 1950) / (2019 - 1950))
111 |             else:
112 |                 pub['pubyear_score'] = 0.5
113 |         return unsure_pubs
114 | 
115 |     def name_match(self, name_a, name_b):
116 |         '''
117 |         For example, we cannot simply assert that Professor 'J. Tang' and Professor 'Jie Tang' are the same person.
118 |         We use this function to estimate the probability that two professors' names refer to the same person.
119 |         '''
120 |         name_a = name_a.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
121 |         name_b = name_b.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
122 |         if name_a == name_b:
123 |             return 1
124 |         elif name_a[0] != name_b[0]:
125 |             return 0
126 |         lastname_a = name_a.split(' ')[-1]
127 |         lastname_b = name_b.split(' ')[-1]
128 |         if lastname_a != lastname_b:
129 |             return 0
130 |         firstname_a = name_a.split(' ')[0]
131 |         firstname_b = name_b.split(' ')[0]
132 |         if len(firstname_a) != 1 and len(firstname_b) != 1:
133 |             return 0
134 |         return jaro_winkler(name_a, name_b)
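    # Illustrative name_match outcomes: identical normalized names return 1;
    # names whose first letters differ ('Jie Tang' vs. 'Wei Tang') return 0;
    # two different full first names with the same last name ('Jie Tang' vs.
    # 'Jay Tang') return 0; an abbreviated first name ('J. Tang' vs. 'Jie Tang')
    # falls through to the Jaro-Winkler similarity of the normalized strings.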
--------------------------------------------------------------------------------
/src/tors.py:
--------------------------------------------------------------------------------
1 | '''
2 | Introduction:
3 |     Predict a scholar's identity (teacher or student) and his or her degree.
4 | Usage:
5 |     >>> identity = TorS()
6 |     >>> print(identity.predict(pc=10, cn=10000, hi=40, gi=0, year_range=14))
7 | '''
8 | import os
9 | import math
10 | import json
11 | import numpy as np
12 | import tensorflow as tf
13 | import pandas as pd
14 | from config import model_path
15 | 
16 | data_path = os.path.join(model_path, 'student')
17 | 
18 | 
19 | class TorS:
20 | 
21 |     def __init__(self):
22 |         self.feature_cols = [tf.feature_column.numeric_column(f) for f in ['pc', 'cn', 'hi', 'gi', 'year_range']]
23 |         self._md = tf.estimator.DNNClassifier(
24 |             hidden_units=[10, 10],
25 |             feature_columns=self.feature_cols,
26 |             model_dir=data_path,
27 |         )
28 | 
29 |     def predict(self, pc=0, cn=0, hi=0, gi=0, year_range=0):
30 |         '''
31 |         Predict whether a scholar is a teacher or a student, and then predict his or her degree.
32 |         :param pc: Number of papers
33 |         :param cn: Citation number
34 |         :param hi: H-index. E.g., an h-index of 25 means the researcher has 25 papers, each of which has been cited at least 25 times.
35 |         :param gi: G-index. Given a set of articles ranked in decreasing order of the number of citations that they received,
36 |             the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.
37 |         :param year_range: Time range of papers
38 |         :return: A dictionary:
39 |             {
40 |                 'label': 'student' or 'teacher',
41 |                 'degree': 'undergraduate', 'master' or 'doctor',
42 |                 'p': probability
43 |             }
44 |         '''
45 |         features = dict(pc=pc, cn=cn, hi=hi, gi=gi, year_range=year_range)
46 |         frame = pd.read_json(json.dumps([features]))  # 'frame' avoids shadowing the built-in input()
47 |         output = self._md.predict(input_fn=lambda: self._pre_progress(frame))
48 |         ans = [(int(item['class_ids'][0]), item['probabilities'][item['class_ids'][0]]) for item in output][0]
49 |         label = 'student' if ans[0] == 1 else 'teacher'
50 |         if label == 'teacher':
51 |             degree = 'doctor'
52 |         else:
53 |             degree = 'master' if pc >= 2 else 'undergraduate'
54 |         ret = {
55 |             'label': label,
56 |             'degree': degree,
57 |             'p': round(float(ans[1]), 4)
58 |         }
59 |         return ret
60 | 
61 |     def _pre_progress(self, features):
62 |         # Normalize the features and pass them to the NN classifier for prediction
63 |         max_year_range = 53  # 53 is the max year_range in the training set.
64 |         normalized_features = pd.DataFrame()
65 |         for feature in ['pc', 'cn', 'hi', 'gi']:
66 |             normalized_features[feature] = features[feature].apply(lambda x: math.log(x + 1.0))
67 |         normalized_features['year_range'] = features['year_range'].apply(lambda x: x / max_year_range)
68 |         ret = {key: np.array(value) for key, value in dict(normalized_features).items()}
69 |         ds = tf.data.Dataset.from_tensor_slices(ret)
70 |         ds = ds.batch(1).repeat(1)
71 |         ret = ds.make_one_shot_iterator().get_next()
72 |         return ret
73 | 
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/src/utils/__init__.py
--------------------------------------------------------------------------------
/src/utils/acautomaton.py:
--------------------------------------------------------------------------------
1 | '''
2 | Introduction:
3 |     The Aho-Corasick automaton is a data structure that can quickly do a multiple-keyword search across text. It’s described in the classic paper ‘Efficient string matching: an aid to bibliographic search’: http://portal.acm.org/citation.cfm?id=360855&dl=ACM&coll=GUIDE.
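Usage (overlapping matches are resolved greedily toward the longest keyword):
    >>> ac = ACAutomaton(['data mining', 'data'])
    >>> ac.search('research on data mining')
    ['data mining']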
4 | '''
5 | 
6 | import queue
7 | 
8 | 
9 | class node:
10 |     def __init__(self, ch):
11 |         self.ch = ch
12 |         self.fail = None
13 |         self.tail = -1  # index of the pattern ending at this node, -1 if none
14 |         self.len = 0
15 |         self.children = {}
16 | 
17 | 
18 | class ACAutomaton:
19 | 
20 |     def __init__(self, patterns=()):
21 |         self.root = node('')
22 |         self.count = 0
23 |         self.patterns = patterns
24 |         if patterns:
25 |             for pattern in patterns:
26 |                 self.insert(pattern)
27 |             self.getfail()
28 | 
29 |     def insert(self, pattern):
30 |         # Insert a new pattern into the trie.
31 |         p = self.root
32 |         for i in pattern:
33 |             if i not in p.children.keys():
34 |                 child = node(i)
35 |                 p.children[i] = child
36 |                 p = child
37 |             else:
38 |                 p = p.children[i]
39 |         p.tail = self.count
40 |         p.len = len(pattern)
41 |         self.count += 1
42 |         return self.count
43 | 
44 |     def getfail(self):
45 |         # Use BFS to initialize the 'fail' pointers.
46 |         q = queue.Queue()
47 |         q.put(self.root)
48 |         while not q.empty():
49 |             top = q.get()
50 |             for i in top.children.values():
51 |                 if top == self.root:
52 |                     i.fail = self.root
53 |                 else:
54 |                     p = top.fail
55 |                     while p:
56 |                         if i.ch in p.children.keys():
57 |                             i.fail = p.children[i.ch]
58 |                             break
59 |                         p = p.fail
60 |                     if not p:
61 |                         i.fail = self.root
62 |                 q.put(i)
63 | 
64 |     def search(self, text):
65 |         # Do a multiple-keyword search across text
66 |         p = self.root
67 |         ret = []
68 |         for i, ch in enumerate(text):
69 |             while ch not in p.children.keys() and p is not self.root:
70 |                 p = p.fail
71 |             if ch in p.children.keys():
72 |                 p = p.children[ch]
73 |             else:
74 |                 p = self.root
75 |             tmp = p
76 |             while tmp is not self.root:
77 |                 if tmp.tail >= 0:
78 |                     ret.append((i-tmp.len+1, -tmp.len))  # (start, -length): sorting puts longer matches first on ties
79 |                     break
80 |                 else:
81 |                     tmp = tmp.fail
82 |         '''
83 |         In this project, we need to extract some patterns from the given text, and these patterns should not intersect with each other.
84 |         For example, 'ac' and 'a' are both substrings of 'acb'. In this case, we just need 'ac'.
85 |         Here we use a greedy algorithm to maximize the keywords' length.
86 |         '''
87 |         ret.sort()
88 |         ans = set()
89 |         end = -1
90 |         for pos, l in ret:
91 |             length = -l
92 |             if pos > end:
93 |                 ans.add(text[pos: pos + length])
94 |                 end = pos + length - 1
95 |         return list(ans)
96 | 
--------------------------------------------------------------------------------
/src/utils/crawler.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | '''
3 | Utilities that help us run searches on Baidu or Google and parse the result pages.
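Example (requires network access; both parsers depend on the engines' current
HTML layout, so they may return an empty list if that layout changes):
    >>> html = getHTMLText('https://www.google.com.hk/search?q=jie+tang&hl=en')
    >>> for hit in google_parse(html):
    ...     print(hit['title'], hit['url'])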
4 | '''
5 | import requests
6 | from bs4 import BeautifulSoup
7 | import json
8 | from scrapy.selector import Selector
9 | import re
10 | 
11 | headers = {
12 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
13 |     'Accept-Encoding': 'gzip, deflate, compress',
14 |     'Accept-Language': 'en-us;q=0.5,en;q=0.3',
15 |     'Cache-Control': 'max-age=0',
16 |     'Connection': 'keep-alive',
17 |     'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
18 | }
19 | 
20 | 
21 | def getHTMLText(url):
22 |     try:
23 |         r = requests.get(url, headers=headers)
24 |         r.raise_for_status()
25 |         r.encoding = r.apparent_encoding
26 |         return r.text
27 |     except Exception:
28 |         return ''
29 | 
30 | 
31 | def baidu_parse(html):
32 |     ulist = []
33 |     soup = BeautifulSoup(html, 'lxml')
34 |     items = soup.find_all('div', {'class': 'result c-container'})
35 |     if not items:
36 |         items = soup.find_all('div', {'class': 'result c-container '})
37 |     for node in items:
38 |         try:
39 |             abstract_node = node.find('div', {'class': 'c-abstract c-abstract-en'})
40 |             if not abstract_node:
41 |                 abstract_node = node.find('div', {'class': 'c-abstract'})
42 |             ctools = node.find('div', {'class': 'c-tools'})
43 |             abstract = abstract_node.text
44 |             title = json.loads(ctools['data-tools'].replace('\\', ''))['title']
45 |             ulist.append({
46 |                 'title': title,
47 |                 'content': abstract
48 |             })
49 |         except Exception as ex:
50 |             print(str(ex))
51 |     return ulist
52 | 
53 | 
54 | def google_parse(html):
55 |     page = Selector(text=html)
56 |     rs = []
57 |     for ans in page.css('div.g'):
58 |         title = ''.join(ans.css('h3').css('*::text').extract())
59 |         content = ''.join(ans.css('span.st').css('*::text').extract())
60 |         url = ans.css('*.r a::attr(href)').extract()
61 |         try:
62 |             url = re.findall('(http.*)', url[0])
63 |             url = re.sub('&.*', '', url[0])
64 |             rs.append({
65 |                 'url': url,
66 |                 'content': content,
67 |                 'title': title,
68 |             })
69 |         except Exception:
70 |             pass
71 |     return rs
72 | 
73 | 
74 | # url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
75 | # html = getHTMLText(url)
76 | # print(baidu_parse(html))
--------------------------------------------------------------------------------
/src/utils/translator.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | 
4 | def youdao_translate(text):
5 |     url = 'http://fanyi.youdao.com/translate'
6 |     if isinstance(text, list):
7 |         src = ','.join(text)
8 |     else:
9 |         src = text
10 |     data = {
11 |         'doctype': 'json',
12 |         'type': 'EN2ZH_CN',
13 |         'i': src,
14 |     }
15 |     rs = requests.get(url=url, params=data)
16 |     try:
17 |         trans_data = rs.json()['translateResult']
18 |         tgt = [t['tgt'] for t in trans_data[0]]
19 |         return tgt
20 |     except Exception:
21 |         # print('There is an error in translation')
22 |         return []
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     print(youdao_translate(['test', 'apple']))
--------------------------------------------------------------------------------
/test/nsfc_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | sys.path.append('../src')
4 | from classifier import Classifier
5 | 
6 | clf = Classifier()
7 | f = open('nsfc_test.json', 'r', encoding='utf-8')
8 | s = f.read()
9 | j = json.loads(s)
10 | data = j['nsfc']
11 | for level in [1, 2, 3]:
12 |     cnt = 0
13 |     top1 = 0
14 |     top5 = 0
15 |     length = level * 2 + 1
16 |     for item in data:
17 |         if len(item['sid']) < length:
18 |             continue
19 |         subject = clf.classify([item['title']], level=level, lang_zh=True)
20 |         if subject == {}:
21 |             continue
22 |         cnt += 1
23 |         if subject['level{}'.format(level)][0]['code'] == item['sid'][0:length]:
24 |             top1 += 1
25 |         for ret in subject['level{}'.format(level)]:
26 |             if ret['code'] == item['sid'][0:length]:
27 |                 top5 += 1
28 |                 break
29 |     print('level', level, ':', top1/cnt, ' ', top5/cnt, ' ', cnt)  # top-1 accuracy, top-5 accuracy, sample count
30 | 
--------------------------------------------------------------------------------
/test/pageranker_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | import random
4 | sys.path.append('../src')
5 | from paperranker import PaperRanker
6 | from baidu_translator import baidu_translate  # local helper, not committed (see .gitignore)
7 | 
8 | 
9 | pr = PaperRanker(use_clf=True)
10 | with open('pageranker_test.json', 'r', encoding='utf-8') as f:
11 |     s = f.read()
12 | correct = json.loads(s)[0]['confirmed']
13 | with open('542edff0dabfae498ae3c756.json', 'r', encoding='utf-8') as f:
14 |     s = f.read()
15 | wrong = json.loads(s)
16 | random.shuffle(correct)
17 | random.shuffle(wrong)
18 | a = correct[0:40]
19 | b = correct[40:] + wrong
20 | ret, res = pr.label(a, b, threshold=0.5, trans=baidu_translate)
21 | tp = 0
22 | fp = 0
23 | fn = 0
24 | tn = 0
25 | for item in ret:
26 |     if item['flag'] == '1':
27 |         tp += 1
28 |     else:
29 |         fp += 1
30 | print(str(tp)+' '+str(fp))  # true positives / false positives among accepted publications
31 | for item in res:
32 |     if item['flag'] == '1':
33 |         fn += 1
34 |     else:
35 |         tn += 1
36 | print(str(fn)+' '+str(tn))  # false negatives / true negatives among rejected publications
37 | 
--------------------------------------------------------------------------------