├── .DS_Store
├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── doc
    ├── API list.xlsx
    ├── Expert_Recommendation.md
    ├── Gender_Prediction.md
    ├── Identity_Prediction.md
    ├── Jobhopping_Prediction.md
    ├── NSFC_AI_Subject_Classifier.md
    ├── NSFC_Subject_Classifier.md
    ├── Paper_Ranker.md
    └── paper_ranker.png
├── model
    └── README.md
├── src
    ├── __init__.py
    ├── aiclassifier.py
    ├── classifier.py
    ├── config.py
    ├── expertrec.py
    ├── gender.py
    ├── jobhopping.py
    ├── paperranker.py
    ├── tors.py
    └── utils
    │   ├── __init__.py
    │   ├── acautomaton.py
    │   ├── crawler.py
    │   └── translator.py
└── test
    ├── nsfc_test.py
    └── pageranker_test.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/*
 2 | .vscode
 3 | .idea/*
 4 | .idea
 5 | *.pkl
 6 | *.csv
 7 | *.bin
 8 | *.json
 9 | *.html
10 | *.pk
11 | jobhopping/*
12 | __pycache__
13 | __pycache__/*
14 | model/student/model.ckpt-1000.meta
15 | model/student/model.ckpt-1000.index
16 | model/student/model.ckpt-1000.data-00000-of-00001
17 | model/student/graph.pbtxt
18 | model/student/eval/events.out.tfevents.1539419480.Juliuss-MacBook-Pro.local
19 | model/student/eval/events.out.tfevents.1539416954.Juliuss-MacBook-Pro.local
20 | model/student/checkpoint
21 | model/jobhopping/orgID2orgname
22 | model/jobhopping/model
23 | model/student/orgID2orgname
24 | model/student/model
25 | doc/~$API list.xlsx
26 | model/expert/model_aminer
27 | test/baidu_translator.py
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 AMiner Open Team
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Prediction API
 2 | ![](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg)
 3 | 
 4 | ## Introduction
 5 | 
 6 | _AMiner Prediction API_ is a toolkit for science data prediction, such as scholar portrait property prediction. The toolkit aims to utilize science data and machine learning algorithms to provide more intelligent functionality for global researchers. All algorithms and models which the toolkit uses are derived from [AMiner](https://aminer.cn).
 7 | 
 8 | ## Pre-requirement
 9 | 
10 | [`Anaconda`](https://www.anaconda.com/) is strongly recommended for environment configuration. Additionally, some libraries are requied. Use following commands to install these libraries:
11 | 
12 | |                           Library                            |                      Command                      |
13 | | :----------------------------------------------------------: | :-----------------------------------------------: |
14 | |              [`fastText`](https://fasttext.cc/)              |   `conda install -c mbednarski fasttext` *****    |
15 | |         [`Scikit-learn`](https://scikit-learn.org/)          |     `conda install -c anaconda scikit-learn`      |
16 | |          [`Jieba`](https://github.com/fxsjy/jieba)           |       `conda install -c conda-forge jieba`        |
17 | |         [`Requests`](https://2.python-requests.org/)         |      `conda install -c conda-forge requests`      |
18 | |            [`Tensorflow`](http://tensorflow.org/)            |     `conda install -c conda-forge tensorflow`     |
19 | |               [`Pytorch`](http://pytorch.org/)               |        `conda install -c pytorch pytorch`         |
20 | |              [`Numpy`](http://numpy.scipy.org/)              |       `conda install -c conda-forge numpy`        |
21 | |            [`Pandas`](http://pandas.pydata.org/)             |       `conda install -c conda-forge pandas`       |
22 | | [`BeautifulSoup4`](http://www.crummy.com/software/BeautifulSoup/) |   `conda install -c conda-forge beautifulsoup4`   |
23 | |               [`Scrapy`](https://scrapy.org/)                |       `conda install -c conda-forge scrapy`       |
24 | |                        `Levenshtein`                         | `conda install -c conda-forge python-levenshtein` |
25 | 
26 | > ***** If you are using **OSX**, you should use following command to intall `fastText`
27 | >
28 | > ```bash
29 | > conda install -c conda-forge fasttext
30 | > ```
31 | 
32 | ## Models Download
33 | 
34 | Toolkit depends on some pre-trained model files which can be downloaded at following address:
35 | 
36 | [Download](https://lfs.aminer.cn/misc/model.zip)
37 | 
38 | Extract it and move all files into `model` directory before testing code.
39 | 
40 | ## Document
41 | 
42 | [NSFC Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_Subject_Classifier.md)
43 | 
44 | [NSFC AI Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_AI_Subject_Classifier.md)
45 | 
46 | [Gender Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Gender_Prediction.md)
47 | 
48 | [Identity Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Identity_Prediction.md)
49 | 
50 | [Jobhopping Prediction](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Jobhopping_Prediction.md)
51 | 
52 | [Expert Recommendation](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Expert_Recommendation.md)
53 | 
54 | [Paper Ranker](https://github.com/AMinerOpen/prediction_api/blob/master/doc/Paper_Ranker.md)
55 | 
56 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/__init__.py


--------------------------------------------------------------------------------
/doc/API list.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/doc/API list.xlsx


--------------------------------------------------------------------------------
/doc/Expert_Recommendation.md:
--------------------------------------------------------------------------------
  1 | # Expert recommendation
  2 | 
  3 | ## Introduction
  4 | 
  5 | ExpertRec is a class which is used to recommend some experts in the given text's field.
  6 | 
  7 | ## Method
  8 | 
  9 | ### search
 10 | 
 11 | ```python
 12 | search(text, num=20)
 13 | ```
 14 | 
 15 | #### Introduction
 16 | 
 17 | Recommend some experts in the given text's field.
 18 | 
 19 | #### Parameters
 20 | 
 21 | ##### text
 22 | 
 23 | The text.
 24 | 
 25 | ##### num
 26 | 
 27 | The number of the recommended experts.
 28 | 
 29 | #### Return value
 30 | 
 31 |  A list of dictionaries:
 32 | 
 33 | ```python
 34 | {
 35 |     'id': The expert's ID in AMiner(http://www.aminer.cn/),
 36 |     'url': The expert's AMiner homepage.
 37 |     'L2 distance': Similarity. The smaller the L2 distance is , the more likely the expert is interested in the given text's field.
 38 | }
 39 | ```
 40 | 
 41 | #### An example
 42 | 
 43 | ```python
 44 | e = ExpertRec()
 45 | rt = e.search('natural language processing')
 46 | ```
 47 | 
 48 | `rt`:
 49 | 
 50 | ```python
 51 | [
 52 |     {
 53 |         'id': '544572eddabfae862da1d4e0', 
 54 |         'url': 'http://www.aminer.cn/profile/544572eddabfae862da1d4e0', 
 55 |         'L2 distance': 0.0
 56 |     }, 
 57 |     {
 58 |         'id': '53f438eadabfaee0d9b7cce4', 
 59 |         'url': 'http://www.aminer.cn/profile/53f438eadabfaee0d9b7cce4', 
 60 |         'L2 distance': 0.26824072
 61 |     }, 
 62 |     {
 63 |         'id': '53f432cbdabfaeb1a7bcfd9a', 
 64 |         'url': 'http://www.aminer.cn/profile/53f432cbdabfaeb1a7bcfd9a', 
 65 |         'L2 distance': 0.31506824
 66 |     }, 
 67 |     {
 68 |         'id': '53f432b7dabfaeb2ac02dc61',
 69 |         'url': 'http://www.aminer.cn/profile/53f432b7dabfaeb2ac02dc61',
 70 |         'L2 distance': 0.3284118
 71 |     }, 
 72 |     {
 73 |         'id': '53f43757dabfaeecd696742f', 
 74 |         'url': 'http://www.aminer.cn/profile/53f43757dabfaeecd696742f',
 75 | 		'L2 distance': 0.34276736
 76 |     }
 77 | ]
 78 | ```
 79 | 
 80 | ## API
 81 | 
 82 | ### https://innovaapi.aminer.cn/tools/v1/predict/experts
 83 | 
 84 | ![](https://img.shields.io/badge/http-post-blue.svg)
 85 | 
 86 | An online version of method `search`
 87 | 
 88 | #### Request body
 89 | 
 90 | ##### text
 91 | 
 92 | The text
 93 | 
 94 | ##### num
 95 | 
 96 | The number of the recommended experts.
 97 | 
 98 | #### Return value
 99 | 
100 | In the `Response` object, there will be three fields.
101 | 
102 | ##### status
103 | 
104 | `0`: Success
105 | 
106 | `1`: There are some errors.
107 | 
108 | ##### message
109 | 
110 | `success`: Success
111 | 
112 | If there are some errors, you will get the error information.
113 | 
114 | ##### data
115 | 
116 | The return value from the method.
117 | 
118 | #### An example
119 | 
120 | ```http
121 | POST /tools/v1/predict/experts? HTTP/1.1
122 | Host: innovaapi.aminer.cn
123 | Content-Type: application/json
124 | User-Agent: PostmanRuntime/7.13.0
125 | Accept: */*
126 | Cache-Control: no-cache
127 | Postman-Token: 05b4af12-9cf0-4cf9-a45c-6f8fd2a9d0a9,867a4a93-f753-4799-9aa9-96b8208f5067
128 | Host: innovaapi.aminer.cn
129 | accept-encoding: gzip, deflate
130 | content-length: 60
131 | Connection: keep-alive
132 | cache-control: no-cache
133 | 
134 | {
135 |     "text": "natural language processing",
136 |     "num": 20
137 | }
138 | ```
139 | 
140 | `Response`：
141 | 
142 | ```json
143 | {
144 |     "status": 0,
145 |     "message": "success",
146 |     "data": [
147 |         {
148 |             "id": "544572eddabfae862da1d4e0",
149 |             "url": "http://www.aminer.cn/profile/544572eddabfae862da1d4e0",
150 |             "L2 distance": 0
151 |         },
152 |         {
153 |             "id": "53f438eadabfaee0d9b7cce4",
154 |             "url": "http://www.aminer.cn/profile/53f438eadabfaee0d9b7cce4",
155 |             "L2 distance": 0.27
156 |         },
157 |         {
158 |             "id": "53f432cbdabfaeb1a7bcfd9a",
159 |             "url": "http://www.aminer.cn/profile/53f432cbdabfaeb1a7bcfd9a",
160 |             "L2 distance": 0.32
161 |         },
162 |         {
163 |             "id": "53f432b7dabfaeb2ac02dc61",
164 |             "url": "http://www.aminer.cn/profile/53f432b7dabfaeb2ac02dc61",
165 |             "L2 distance": 0.33
166 |         },
167 |         {
168 |             "id": "53f43757dabfaeecd696742f",
169 |             "url": "http://www.aminer.cn/profile/53f43757dabfaeecd696742f",
170 |             "L2 distance": 0.34
171 |         },
172 |         {
173 |             "id": "53f556d8dabfae963d25e88d",
174 |             "url": "http://www.aminer.cn/profile/53f556d8dabfae963d25e88d",
175 |             "L2 distance": 0.35
176 |         },
177 |         {
178 |             "id": "53f4dc08dabfaef7e077b586",
179 |             "url": "http://www.aminer.cn/profile/53f4dc08dabfaef7e077b586",
180 |             "L2 distance": 0.35
181 |         },
182 |         {
183 |             "id": "5448db69dabfae87b7e87eb5",
184 |             "url": "http://www.aminer.cn/profile/5448db69dabfae87b7e87eb5",
185 |             "L2 distance": 0.38
186 |         },
187 |         {
188 |             "id": "53f430c6dabfaeb2ac014a3a",
189 |             "url": "http://www.aminer.cn/profile/53f430c6dabfaeb2ac014a3a",
190 |             "L2 distance": 0.39
191 |         },
192 |         {
193 |             "id": "53f42cebdabfaee02ac5a471",
194 |             "url": "http://www.aminer.cn/profile/53f42cebdabfaee02ac5a471",
195 |             "L2 distance": 0.4
196 |         },
197 |         {
198 |             "id": "53f43940dabfaefedbae3ddb",
199 |             "url": "http://www.aminer.cn/profile/53f43940dabfaefedbae3ddb",
200 |             "L2 distance": 0.4
201 |         },
202 |         {
203 |             "id": "53f44514dabfaee43ec789c9",
204 |             "url": "http://www.aminer.cn/profile/53f44514dabfaee43ec789c9",
205 |             "L2 distance": 0.41
206 |         },
207 |         {
208 |             "id": "53f4616fdabfaee4dc839eba",
209 |             "url": "http://www.aminer.cn/profile/53f4616fdabfaee4dc839eba",
210 |             "L2 distance": 0.41
211 |         },
212 |         {
213 |             "id": "53f7c250dabfae938c6d865c",
214 |             "url": "http://www.aminer.cn/profile/53f7c250dabfae938c6d865c",
215 |             "L2 distance": 0.41
216 |         },
217 |         {
218 |             "id": "53f43403dabfaee1c0a86645",
219 |             "url": "http://www.aminer.cn/profile/53f43403dabfaee1c0a86645",
220 |             "L2 distance": 0.41
221 |         },
222 |         {
223 |             "id": "53f44194dabfaee2a1d254c8",
224 |             "url": "http://www.aminer.cn/profile/53f44194dabfaee2a1d254c8",
225 |             "L2 distance": 0.41
226 |         },
227 |         {
228 |             "id": "53f430dddabfaeb1a7bb7664",
229 |             "url": "http://www.aminer.cn/profile/53f430dddabfaeb1a7bb7664",
230 |             "L2 distance": 0.41
231 |         },
232 |         {
233 |             "id": "542c4a3bdabfae2b4e1fe347",
234 |             "url": "http://www.aminer.cn/profile/542c4a3bdabfae2b4e1fe347",
235 |             "L2 distance": 0.41
236 |         },
237 |         {
238 |             "id": "53f482cedabfaec09f2a3dfb",
239 |             "url": "http://www.aminer.cn/profile/53f482cedabfaec09f2a3dfb",
240 |             "L2 distance": 0.42
241 |         },
242 |         {
243 |             "id": "53f42945dabfaeb22f3d3d86",
244 |             "url": "http://www.aminer.cn/profile/53f42945dabfaeb22f3d3d86",
245 |             "L2 distance": 0.42
246 |         }
247 |     ]
248 | }
249 | ```
250 | 
251 | 


--------------------------------------------------------------------------------
/doc/Gender_Prediction.md:
--------------------------------------------------------------------------------
  1 | # Gender Prediction
  2 | 
  3 | ## Introduction
  4 | 
  5 | Gender is a class which is used to predict a person's gender.
  6 | 
  7 | If you want to use face detection to help you predict a person's gender, you can get an api-key from [Face++]( https://console.faceplusplus.com/documents/7079083), and then put the api-key into the `config.py`
  8 | 
  9 | ## Method
 10 | 
 11 | ### predict
 12 | 
 13 | ```python
 14 | predict(self, name, org, source='google', image_url=None, image_file=None)
 15 | ```
 16 | 
 17 | #### Introduction
 18 | 
 19 | Predict a person's gender.  We use name, results from search engine and the person's photo as features to predict his or her gender. About the photos, you can choose either online photos or local photos.
 20 | 
 21 | #### Parameters
 22 | 
 23 | ##### name
 24 | 
 25 | **string**, the person's name
 26 | 
 27 | ##### org
 28 | 
 29 | **string**, the person's organization
 30 | 
 31 | ##### source
 32 | 
 33 | Use `google` or `baidu` as the search engine.
 34 | 
 35 | It is strongly recommended to use Google because the model is trained accoring to the results from Google.
 36 | 
 37 | ##### image_url
 38 | 
 39 | The photo's online url.
 40 | 
 41 | ##### image_file
 42 | 
 43 | The photo's local path.
 44 | 
 45 | #### An example
 46 | 
 47 | ```python
 48 | g = Gender()
 49 | gen = g.predict(name='Jie Tang', org='Tsinghua University', image_url='http://www.cs.tsinghua.edu.cn/publish/cs/4616/20110330101939787483549/20190321114128398502759.jpg')
 50 | ```
 51 | 
 52 | gen:
 53 | 
 54 | ```python
 55 | {   
 56 |     'name': {
 57 |         'male': 0.5, 
 58 |         'female': 0.5
 59 |     }, 
 60 |     'search': {
 61 |         'male': 0.9173952287088033, 
 62 |         'female': 0.0826047712911967
 63 |     }, 
 64 |     'face': {
 65 |         'male': 1, 
 66 |         'female': 0
 67 |     }, 
 68 |     'male': 0.96, 
 69 |     'female': 0.04
 70 | }
 71 | ```
 72 | 
 73 | ## API
 74 | 
 75 | ### https://innovaapi.aminer.cn/tools/v1/predict
 76 | 
 77 | ![](https://img.shields.io/badge/http-get-brightgreen.svg)
 78 | 
 79 | An online version of the method `predict`
 80 | 
 81 | #### Parameters
 82 | 
 83 | ##### name
 84 | 
 85 | **string**, the person's name
 86 | 
 87 | ##### org
 88 | 
 89 | **string**, the person's organization
 90 | 
 91 | ##### image_url
 92 | 
 93 | The photo's online url.
 94 | 
 95 | #### Return value
 96 | 
 97 | In the `Response` object, there will be three fields.
 98 | 
 99 | ##### status
100 | 
101 | `0`: Success
102 | 
103 | `1`: There are some errors.
104 | 
105 | ##### message
106 | 
107 | `success`: Success
108 | 
109 | If there are some errors, you will get the error information.
110 | 
111 | ##### data
112 | 
113 | The return value from the method.
114 | 
115 | #### An example
116 | 
117 | https://innovaapi.aminer.cn/tools/v1/predict/gender?name=Feifei%20Li&org=Stanford%20University
118 | 
119 | `Response`:
120 | 
121 | ```json
122 | {
123 |     "status": 0,
124 |     "message": "success",
125 |     "data": {
126 |         "male": 0.07,
127 |         "female": 0.93,
128 |         "name": {
129 |             "male": 0,
130 |             "female": 1
131 |         },
132 |         "search": {
133 |             "male": 0.13,
134 |             "female": 0.87
135 |         },
136 |         "face": {
137 |             "male": 0.5,
138 |             "female": 0.5
139 |         }
140 |     }
141 | }
142 | ```
143 | 
144 | 


--------------------------------------------------------------------------------
/doc/Identity_Prediction.md:
--------------------------------------------------------------------------------
  1 | # Identity Prediction
  2 | 
  3 | ## Introduction
  4 | 
  5 | Predict a scholar's identity (teacher or student) and his or her degree.
  6 | 
  7 | ## Method
  8 | 
  9 | ### predict
 10 | 
 11 | ```python
 12 | predict(pc=0, cn=0, hi=0, gi=0, year_range=0)
 13 | ```
 14 | 
 15 | #### Introduction
 16 | 
 17 | Predict whether a scholar is a teacher or a student, and then predict his degree.
 18 | 
 19 | #### Parameters
 20 | 
 21 | ##### pc
 22 | 
 23 | Number of papers
 24 | 
 25 | ##### cn
 26 | 
 27 | Citation number
 28 | 
 29 | ##### hi
 30 | 
 31 | H-index. Eg, an h-index of 25 means the researcher has 25 papers, each of which has been cited 25+ times.
 32 | 
 33 | ##### gi
 34 | 
 35 | G-index. Given a set of articles ranked in decreasing order of the number of citations that they received, the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.
 36 | 
 37 | ##### year_range
 38 | 
 39 | Time range of papers.
 40 | 
 41 | #### Return value
 42 | 
 43 | A dictionary:
 44 | 
 45 | ```python
 46 | {
 47 |     'label': 'student' or 'teacher',
 48 |     'degree': 'undergraduate', 'master' or 'doctor'
 49 |     'p': probability
 50 | }
 51 | ```
 52 | 
 53 | #### An example
 54 | 
 55 | ```python
 56 | identity = TorS()
 57 | i = identity.predict(pc=10, cn=10000, hi=40, gi=0, year_range=14)
 58 | ```
 59 | 
 60 | `i`:
 61 | 
 62 | ```python
 63 | {'label': 'teacher', 'degree': 'doctor', 'p': 0.9993}
 64 | ```
 65 | 
 66 | ## API
 67 | 
 68 | ### https://innovaapi.aminer.cn/tools/v1/predict/identity
 69 | 
 70 | ![](https://img.shields.io/badge/http-get-brightgreen.svg)
 71 | 
 72 | An online version of the method `predict`
 73 | 
 74 | #### Parameters
 75 | 
 76 | ##### pc
 77 | 
 78 | Number of papers
 79 | 
 80 | ##### cn
 81 | 
 82 | Citation number
 83 | 
 84 | ##### hi
 85 | 
 86 | H-index. Eg, an h-index of 25 means the researcher has 25 papers, each of which has been cited 25+ times.
 87 | 
 88 | ##### gi
 89 | 
 90 | G-index. Given a set of articles ranked in decreasing order of the number of citations that they received, the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.
 91 | 
 92 | ##### year_range
 93 | 
 94 | Time range of papers.
 95 | 
 96 | #### Return value
 97 | 
 98 | In the `Response` object, there will be three fields.
 99 | 
100 | ##### status
101 | 
102 | `0`: Success
103 | 
104 | `1`: There are some errors.
105 | 
106 | ##### message
107 | 
108 | `success`: Success
109 | 
110 | If there are some errors, you will get the error information.
111 | 
112 | ##### data
113 | 
114 | The return value from the method.
115 | 
116 | #### An example
117 | 
118 | https://innovaapi.aminer.cn/tools/v1/predict/identity?pc=10&cn=10000&hi=40&gi=0&year_range=14
119 | 
120 | `Response`:
121 | 
122 | ```json
123 | {
124 |     "status": 0,
125 |     "message": "success",
126 |     "data": {
127 |         "label": "teacher",
128 |         "degree": "doctor",
129 |         "p": 0.9993
130 |     }
131 | }
132 | ```
133 | 
134 | 


--------------------------------------------------------------------------------
/doc/Jobhopping_Prediction.md:
--------------------------------------------------------------------------------
  1 | # Jobhopping Prediction
  2 | 
  3 | ## Introduction
  4 | 
  5 | JobHopping is a class which is used to predict where a scholar may hop to.
  6 | 
  7 | ## Method
  8 | 
  9 | ### predict
 10 | 
 11 | ```python
 12 | predict(name_squence, ntop=3)
 13 | ```
 14 | 
 15 | #### Introduction
 16 | 
 17 | Get a scholar's possible future affiliation according to a list(squence) of affiliation's name  where he had worked.
 18 | 
 19 | #### Parameters
 20 | 
 21 | ##### name_squence
 22 | 
 23 | a list of the scholar's institution he had worded
 24 | 
 25 | ##### ntop
 26 | 
 27 | How many possible affiliations will the method return.
 28 | 
 29 | #### Return value
 30 | 
 31 | A list of dictionaries
 32 | 
 33 | ```python
 34 |  {
 35 |      'name': the most likely future affiliation's name
 36 |      'p': the probability
 37 |  }
 38 | ```
 39 | 
 40 | #### An example
 41 | 
 42 | ```python
 43 | j = JobHopping()
 44 | aff = j.predict(['tsinghua university','mazandaran university','birsa agricultural university'])
 45 | ```
 46 | 
 47 | `aff`:
 48 | 
 49 | ```python
 50 | [
 51 |     {
 52 |         'name': 'university of michigan',
 53 |         'p': 0.33
 54 |     }, 
 55 |     {
 56 |         'name': 'university of cambridge',
 57 |         'p': 0.33
 58 |     }, 
 59 |     {
 60 |         'name': 'university of california berkeley',
 61 |         'p': 0.33
 62 |     }
 63 | ]
 64 | ```
 65 | 
 66 | ## API
 67 | 
 68 | ### https://innovaapi.aminer.cn/tools/v1/predict/career
 69 | 
 70 | ![](https://img.shields.io/badge/http-get-brightgreen.svg)
 71 | 
 72 | An online version of method `predict`
 73 | 
 74 | #### Parameters
 75 | ##### per_name
 76 | 
 77 | The scholar's name
 78 | 
 79 | ##### org_name
 80 | 
 81 | The scholar's affiliation name
 82 | 
 83 | #### Return value
 84 | 
 85 | In the `Response` object, there will be three fields.
 86 | 
 87 | ##### status
 88 | 
 89 | `0`: Success
 90 | 
 91 | `1`: There are some errors.
 92 | 
 93 | ##### message
 94 | 
 95 | `success`: Success
 96 | 
 97 | If there are some errors, you will get the error infomation.
 98 | 
 99 | ##### data
100 | 
101 | The return value from the method.
102 | 
103 | #### An example
104 | 
105 | https://innovaapi.aminer.cn/tools/v1/predict/career?per_name=XXX&org_name=XXX
106 | 
107 | Return Value:
108 | 
109 | ```json
110 | {
111 |     "status": 0,
112 |     "message": "success",
113 |     "data": [
114 |         {
115 |             "name": "university of michigan",
116 |             "p": 0.33
117 |         },
118 |         {
119 |             "name": "university of california berkeley",
120 |             "p": 0.33
121 |         },
122 |         {
123 |             "name": "stanford university",
124 |             "p": 0.33
125 |         }
126 |     ]
127 | }
128 | ```
129 | 
130 | 


--------------------------------------------------------------------------------
/doc/NSFC_AI_Subject_Classifier.md:
--------------------------------------------------------------------------------
  1 | # NSFC AI Subject Classifier
  2 | 
  3 | ## Introduction
  4 | 
  5 | AIClassifier is a class which is used to classify AI subjects according to some keywords. It depends on the classification of [Natural Science Foundation of China(NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html). 
  6 | 
  7 | ## Method
  8 | 
  9 | ### get_tree
 10 | 
 11 | ```python
 12 | get_tree(words):
 13 | ```
 14 | 
 15 | #### Introduction
 16 | 
 17 | Get a subject tree according to some keywords.
 18 | 
 19 | #### Parameters
 20 | 
 21 | ##### words
 22 | 
 23 | A **list** of keywords.
 24 | 
 25 | #### Return value
 26 | 
 27 | A list of **dictionary**
 28 | 
 29 | ```python
 30 | [
 31 |     {
 32 |         "name": subject name,
 33 |         "value": probability,
 34 |         "children": subtrees. They also have the same structure. If this is a leaf node, it won't have this field
 35 |     }
 36 | ]
 37 | ```
 38 | 
 39 | #### An example
 40 | 
 41 | ```python
 42 | ai_nsfc = AIClassifier()
 43 | words = ['search engine']
 44 | subject = ai_nsfc.get_tree(words)
 45 | print(subject)
 46 | ```
 47 | 
 48 | Return value:
 49 | 
 50 | ```python
 51 | [
 52 |     {
 53 |         'name': '人工智能', 
 54 |         'value': 1.0, 
 55 |         'children': [
 56 |             {
 57 |                 'name': '自然语言处理', 
 58 |                 'value': 0.6236383458601308, 
 59 |                 'children': [
 60 |                     {'name': '文本检索、挖掘与信息抽取', 'value': 0.6106190412551927}
 61 |                 ]
 62 |             },
 63 |             {
 64 |                 'name': '知识表示与处理', 
 65 |                 'value': 0.3763616541398693, 
 66 |                 'children': [
 67 |                     {'name': '知识发现与数据挖掘', 'value': 0.3893809587448072}
 68 |                 ]
 69 |             }
 70 |         ]
 71 |     }
 72 | ]
 73 | ```
 74 | 
 75 | ### classify_level
 76 | 
 77 | ```python
 78 | classify_level(words, level=1, lang_zh=False):
 79 | ```
 80 | 
 81 | #### Introduction
 82 | 
 83 |  Classify which subjects these keywords belong to.
 84 | 
 85 | #### Parameters
 86 | 
 87 | ##### words
 88 | 
 89 | A **list** of keywords.
 90 | 
 91 | ##### level
 92 | 
 93 | Classification level(1,2,3), for other numbers you will get a  `[]`.
 94 | 
 95 | [NSFC](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html) uses a three-level classification. Use graph theory as an example, 
 96 | 
 97 | ```
 98 | A01 mathematics
 99 | - A0116 combinatorial mathematics
100 |   - A011602 graph theory
101 | ```
102 | 
103 | ##### lang_zh
104 | 
105 | Whether the return values are Chinese or not.
106 | 
107 | #### Return Value
108 | 
109 | A **list** of strings contains related subject names at the level
110 | 
111 | #### An example
112 | 
113 | ```python
114 | ai_nsfc = AIClassifier()
115 | words = ['search engine']
116 | subject = ai_nsfc.classify_level(words, level=3)
117 | print(subject)
118 | ```
119 | 
120 | Return Value:
121 | 
122 | ```python
123 | ['Text Retrieval, Mining And Information Extraction', 'Knowledge Discovery And Data Mining']
124 | ```
125 | 
126 | ### classify
127 | 
128 |  Get the classification of the keywords and a subject tree
129 | 
130 | #### Parameters
131 | 
132 | ##### words
133 | 
134 | A **list** of keywords. Accept both Chinese words and English words.
135 | 
136 | #### Return value
137 | 
138 | A dictionary contains four items:
139 | 
140 | ```
141 | 'level{x}'(x = 1, 2, 3): Related subjects of the words on level x.
142 | 'tree': Subject trees of the given words(a list of dictionary).
143 | ```
144 | 
145 | #### An example
146 | 
147 | ```python
148 | ai_nsfc = AIClassifier()
149 | words = ['search engine']
150 | subject = ai_nsfc.classify(words)
151 | print(subject)
152 | ```
153 | 
154 | Return Value:
155 | 
156 | ```python
157 | {
158 |     'level1': [
159 |         {'p': 1.0, 'name': 'Artificial Intelligence', 'name_zh': '人工智能'}
160 |     ], 
161 |     'level2': [
162 |         {'p': 0.6236383458601308,'name': 'Natural Language Processing', 'name_zh': '自然语言处理'},
163 |         {'p': 0.3763616541398693, 'name': 'Knowledge Representation And Processing', 'name_zh': '知识表示与处理'}
164 |     ], 
165 |     'level3': [
166 |         {'p': 0.6106190412551927, 'name': 'Text Retrieval, Mining And Information Extraction', 'name_zh': '文本检索、挖掘与信息抽取'},
167 |         {'p': 0.3893809587448072, 'name': 'Knowledge Discovery And Data Mining', 'name_zh': '知识发现与数据挖掘'}
168 |     ], 
169 |     'tree': [
170 |         {
171 |             'name': '人工智能', 
172 |             'value': 1.0, 
173 |          	'children': [
174 |                 {
175 |                     'name': '自然语言处理', 
176 |                     'value': 0.6236383458601308, 
177 |                     'children': [
178 |                         {'name': '文本检索、挖掘与信息抽取', 'value': 0.6106190412551927}
179 |                     ]
180 |                 },
181 |                 {
182 |                     'name': '知识表示与处理', 
183 |                     'value': 0.3763616541398693, 
184 |                     'children': [
185 |                         {'name': '知识发现与数据挖掘', 'value': 0.3893809587448072}
186 |                     ]
187 |                 }
188 |             ]
189 |         }
190 |     ]
191 | }
192 | ```
193 | 
194 | ## API
195 | 
196 | ### https://innovaapi.aminer.cn/tools/v1/predict/nsfc/ai
197 | 
198 | ![](https://img.shields.io/badge/http-post-blue.svg)
199 | 
200 | An online version of the method "classify"
201 | 
202 | ### Request body
203 | 
204 | ##### words
205 | 
206 | A **list** of key words. Accept both Chinese words and English words.
207 | 
208 | #### Return value
209 | 
210 | In the `Response` object, there will be three fields.
211 | 
212 | ##### status
213 | 
214 | `0`: Success
215 | 
216 | `1`: There are some errors.
217 | 
218 | ##### message
219 | 
220 | `success`: Success
221 | 
222 | If there are some errors, you will get the error information.
223 | 
224 | ##### data
225 | 
226 | The return value from the method.
227 | 
228 | ### An example
229 | 
230 | ```http
231 | POST /tools/v1/predict/nsfc/ai? HTTP/1.1
232 | Host: innovaapi.aminer.cn
233 | Content-Type: application/json
234 | User-Agent: PostmanRuntime/7.13.0
235 | Accept: */*
236 | Cache-Control: no-cache
237 | Postman-Token: 72d90554-ead1-4606-be9e-ce64a9b38391,354aaeaa-976a-406c-902b-d3d1e52389f7
238 | Host: innovaapi.aminer.cn
239 | accept-encoding: gzip, deflate
240 | content-length: 49
241 | Connection: keep-alive
242 | cache-control: no-cache
243 | 
244 | {
245 |     "words": [
246 |             "search engine"
247 |     ]
248 | }
249 | ```
250 | 
251 | Return Message:
252 | 
253 | ```json
254 | {
255 |     "status": 0,
256 |     "message": "success",
257 |     "data": {
258 |         "level1": [
259 |             {
260 |                 "p": 1.0,
261 |                 "name": "Artificial Intelligence",
262 |                 "name_zh": "人工智能"
263 |             }
264 |         ],
265 |         "level2": [
266 |             {
267 |                 "p": 0.6236383458601308,
268 |                 "name": "Natural Language Processing",
269 |                 "name_zh": "自然语言处理"
270 |             },
271 |             {
272 |                 "p": 0.3763616541398693,
273 |                 "name": "Knowledge Representation And Processing",
274 |                 "name_zh": "知识表示与处理"
275 |             }
276 |         ],
277 |         "level3": [
278 |             {
279 |                 "p": 0.6106190412551927,
280 |                 "name": "Text Retrieval, Mining And Information Extraction",
281 |                 "name_zh": "文本检索、挖掘与信息抽取"
282 |             },
283 |             {
284 |                 "p": 0.3893809587448072,
285 |                 "name": "Knowledge Discovery And Data Mining",
286 |                 "name_zh": "知识发现与数据挖掘"
287 |             }
288 |         ],
289 |         "tree": [
290 |             {
291 |                 "name": "人工智能",
292 |                 "value": 1.0,
293 |                 "children": [
294 |                     {
295 |                         "name": "自然语言处理",
296 |                         "value": 0.6236383458601308,
297 |                         "children": [
298 |                             {
299 |                                 "name": "文本检索、挖掘与信息抽取",
300 |                                 "value": 0.6106190412551927
301 |                             }
302 |                         ]
303 |                     },
304 |                     {
305 |                         "name": "知识表示与处理",
306 |                         "value": 0.3763616541398693,
307 |                         "children": [
308 |                             {
309 |                                 "name": "知识发现与数据挖掘",
310 |                                 "value": 0.3893809587448072
311 |                             }
312 |                         ]
313 |                     }
314 |                 ]
315 |             }
316 |         ]
317 |     }
318 | }
319 | ```
320 | 
321 | 


--------------------------------------------------------------------------------
/doc/NSFC_Subject_Classifier.md:
--------------------------------------------------------------------------------
  1 | # NSFC Subject Classifier
  2 | 
  3 | ## Introduction
  4 | 
  5 | Classifier is a class which is used to classify publications according to their subjects. It depends on the classification of [Natural Science Foundation of China(NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html). 
  6 | 
  7 | ## Method
  8 | 
  9 | ### classify
 10 | 
 11 | ```python
 12 | classify(pub_titles, level=0, ntop=5, lang_zh=False, translatation_func=youdao_translate)
 13 | ```
 14 | #### Introduction
 15 | 
 16 | Use publications' titles to classify which subjects these publications belong to.
 17 | 
 18 | #### Parameters
 19 | 
 20 | ##### pub_titles
 21 | 
 22 | A **list** of **strings**. The titles of publications.
 23 | 
 24 | ##### level
 25 | 
 26 | Classification level(1,2,3), for other numbers you will get all three levels.
 27 | 
 28 | [NSFC](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html) uses a three-level classification. Use graph theory as an example, 
 29 | 
 30 | ```
 31 | A01 mathematics
 32 | - A0116 combinatorial mathematics
 33 |   - A011602 graph theory
 34 | ```
 35 | 
 36 | ##### ntop
 37 | 
 38 | The number of possible subjects you want to get.
 39 | 
 40 | ##### lang_zh
 41 | 
 42 | Whether the titles are Chinese or not. For `True`, it means you are using Chinese publications.
 43 | 
 44 | ##### translation_func
 45 | 
 46 | In fact, the classifier can only work on **Chinese** words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.  It should be able to translate a list of **strings** in another language to Chinese.
 47 | 
 48 | In default, we provide a translator based on [youdao api](http://fanyi.youdao.com/).  But you cannot use this translator too often because it is only a free version. 
 49 | 
 50 | #### Return value
 51 | 
 52 | A **dictionary**
 53 | 
 54 | ```python
 55 | 'level{x}'(x = 1, 2, 3)':
 56 | {
 57 |     'code': subject code
 58 |     'name': subject name
 59 |     'p': probability
 60 | }
 61 | ```
 62 | 
 63 | If there are some errors in the method, you will get a `{}`
 64 | 
 65 | #### An example
 66 | 
 67 | ```python
 68 | nsfc = Classifier()
 69 | pub_titles = ['基于多通道卷积神经网络的中文微博情感分析']
 70 | subject = nsfc.classify(pub_titles)
 71 | ```
 72 | 
 73 | `subject`:
 74 | 
 75 | ```python
 76 | {
 77 |     'level1': [
 78 |         {'code': 'F02', 'name': '计算机科学', 'p': 0.9745969772338867},
 79 |         {'code': 'F01', 'name': '电子学与信息系统', 'p': 0.02385014481842518},
 80 |         {'code': 'B05', 'name': '分析化学', 'p': 0.0005464374553412199},
 81 |         {'code': 'F03', 'name': '自动化', 'p': 0.00039022043347358704},
 82 |         {'code': 'H18', 'name': '影像医学与生物医学工程', 'p': 0.0001973187318071723}
 83 |     ], 
 84 |     'level2': [
 85 |         {'code': 'F0206', 'name': '自然语言理解与机器翻译', 'p': 0.8545559048652649},
 86 |         {'code': 'F0205', 'name': '计算机应用技术', 'p': 0.08089018613100052},
 87 |         {'code': 'F0305', 'name': '人工智能与知识工程', 'p': 0.023599255830049515},
 88 |         {'code': 'B0512', 'name': '化学计量学与化学信息学', 'p': 0.0228357}
 89 |     ], 
 90 |     'level3': [
 91 |         {'code': 'F020601', 'name': '计算语言学', 'p': 0.9999170303344727},
 92 |         {'code': 'F020504', 'name': '生物信息计算', 'p': 4.625070505426265e-05},
 93 |         {'code': 'F020506', 'name': '人机界面技术', 'p': 2.3111495465855114e-05},
 94 |         {'code': 'F010403', 'name': '物联网', 'p': 2.2251791961025447e-05},
 95 |         {'code': 'F010303', 'name': '协作通信', 'p': 2.0015930203953758e-05}
 96 |     ]
 97 | }
 98 | ```
 99 | 
100 | ## API
101 | 
102 | ### https://innovaapi.aminer.cn/tools/v1/predict/nsfc
103 | 
104 | ![](https://img.shields.io/badge/http-post-blue.svg)
105 | 
106 | An online version of the method `classify`
107 | 
108 | #### Request body
109 | 
110 | ##### titles
111 | 
112 | A **list** of **strings**. The titles of publications.
113 | 
114 | #### Return value
115 | 
116 | In the `Response` object, there will be three fields.
117 | 
118 | ##### status
119 | 
120 | `0`: Success
121 | 
122 | `1`: There are some errors.
123 | 
124 | ##### message
125 | 
126 | `success`: Success
127 | 
128 | If there are some errors, you will get the error information.
129 | 
130 | ##### data
131 | 
132 | The return value from the method.
133 | 
134 | #### An example
135 | 
136 | ```http
137 | POST /tools/v1/predict/nsfc? HTTP/1.1
138 | Host: innovaapi.aminer.cn
139 | Content-Type: application/json
140 | User-Agent: PostmanRuntime/7.13.0
141 | Accept: */*
142 | Cache-Control: no-cache
143 | Postman-Token: 5f0fbe87-e333-40b1-b9c3-23f64c137c15,1927af8e-4a86-4319-8024-684d6b9e46f7
144 | Host: innovaapi.aminer.cn
145 | accept-encoding: gzip, deflate
146 | content-length: 100
147 | Connection: keep-alive
148 | cache-control: no-cache
149 | 
150 | {
151 |     "titles": [
152 |         "基于多通道卷积神经网络的中文微博情感分析"
153 |     ]
154 | }
155 | ```
156 | 
157 | Return Message:
158 | 
159 | ```json
160 | {
161 |     "status": 0,
162 |     "message": "success",
163 |     "data": {
164 |         "level1": [
165 |             {
166 |                 "code": "F02",
167 |                 "name": "计算机科学",
168 |                 "p": 0.9745969772338867
169 |             },
170 |             {
171 |                 "code": "F01",
172 |                 "name": "电子学与信息系统",
173 |                 "p": 0.02385014481842518
174 |             },
175 |             {
176 |                 "code": "B05",
177 |                 "name": "分析化学",
178 |                 "p": 0.0005464374553412199
179 |             },
180 |             {
181 |                 "code": "F03",
182 |                 "name": "自动化",
183 |                 "p": 0.00039022043347358704
184 |             },
185 |             {
186 |                 "code": "H18",
187 |                 "name": "影像医学与生物医学工程",
188 |                 "p": 0.0001973187318071723
189 |             }
190 |         ],
191 |         "level2": [
192 |             {
193 |                 "code": "F0206",
194 |                 "name": "自然语言理解与机器翻译",
195 |                 "p": 0.8545559048652649
196 |             },
197 |             {
198 |                 "code": "F0205",
199 |                 "name": "计算机应用技术",
200 |                 "p": 0.08089018613100052
201 |             },
202 |             {
203 |                 "code": "F0305",
204 |                 "name": "人工智能与知识工程",
205 |                 "p": 0.023599255830049515
206 |             },
207 |             {
208 |                 "code": "B0512",
209 |                 "name": "化学计量学与化学信息学",
210 |                 "p": 0.022835755720734596
211 |             },
212 |             {
213 |                 "code": "F0104",
214 |                 "name": "通信网络",
215 |                 "p": 0.01253295037895441
216 |             }
217 |         ],
218 |         "level3": [
219 |             {
220 |                 "code": "F020601",
221 |                 "name": "计算语言学",
222 |                 "p": 0.9999170303344727
223 |             },
224 |             {
225 |                 "code": "F020504",
226 |                 "name": "生物信息计算",
227 |                 "p": 0.00004625070505426265
228 |             },
229 |             {
230 |                 "code": "F020506",
231 |                 "name": "人机界面技术",
232 |                 "p": 0.000023111495465855114
233 |             },
234 |             {
235 |                 "code": "F010403",
236 |                 "name": "物联网",
237 |                 "p": 0.000022251791961025447
238 |             },
239 |             {
240 |                 "code": "F010303",
241 |                 "name": "协作通信",
242 |                 "p": 0.000020015930203953758
243 |             }
244 |         ]
245 |     }
246 | }
247 | ```
248 | 
249 | ### https://innovaapi.aminer.cn/tools/v1/predict/nsfc/person
250 | 
251 | ![](https://img.shields.io/badge/http-get-brightgreen.svg)
252 | 
253 | Get a professor's research interests according to his publications' titles.
254 | 
255 | #### Parameters
256 | 
257 | ##### pid
258 | 
259 | the professor's id in [AMiner](https://aminer.cn).
260 | 
261 | For example, you want to know Qiang Yang's research interests. First, you should search Qiang Yang in  [AMiner](https://aminer.cn), and get his page url https://www.aminer.cn/profile/qiang-yang/53f48041dabfae963d25910a. His id in [AMiner](https://aminer.cn) is the suffix of the url string `53f48041dabfae963d25910a`.
262 | 
263 | #### Return value
264 | 
265 | In the `Response` object, there will be three fields.
266 | 
267 | ##### status
268 | 
269 | `0`: Success
270 | 
271 | `1`: There are some errors.
272 | 
273 | ##### message
274 | 
275 | `success`: Success
276 | 
277 | If there are some errors, you will get the error information.
278 | 
279 | ##### data
280 | 
281 | The return value from the method.
282 | 
283 | #### An example
284 | 
285 | https://innovaapi.aminer.cn/tools/v1/predict/nsfc/person?pid=53f48041dabfae963d25910a
286 | 
287 | ## Accuracy
288 | 
289 | | level |  top1  |  top5  |
290 | | :---: | :----: | :----: |
291 | |   1   | 0.5079 | 0.8331 |
292 | |   2   | 0.3629 | 0.6668 |
293 | |   3   | 0.3342 | 0.6317 |
294 | 
295 | 


--------------------------------------------------------------------------------
/doc/Paper_Ranker.md:
--------------------------------------------------------------------------------
  1 | # Paper Ranker
  2 | 
  3 | ## Introduction
  4 | 
  5 | There are some professors that have the same name. In this case, we have difficulties in distinguishing whether a publication is long to a professor. PaperRanker is a class which is used to predict how much possibility there is that a publication is belong to a professor.
  6 | 
  7 | ![](paper_ranker.png)
  8 | 
  9 | Our idea is that we can use some correct publications and coauthor relationship to solve this problem. If a correct coauthor relationship exists in an unsure publication, the possibility that this publication is belong to that professor is much higher.
 10 | 
 11 | ## Definition
 12 | 
 13 | ```python
 14 | def __init__(self, use_clf=False)
 15 | ```
 16 | 
 17 | Based on the idea that a professor's interest won't change too much, we also try to use [NSFC Subject Classifier](https://github.com/AMinerOpen/prediction_api/blob/master/doc/NSFC_Subject_Classifier.md) to help us to predict. If you want to use it, you should set `use_clf=True`. 
 18 | 
 19 | ## Method
 20 | 
 21 | ### label
 22 | 
 23 | ```python
 24 | def label(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate)
 25 | ```
 26 | 
 27 | #### Introduction
 28 | 
 29 | Use iterative algorithm to predict how much possibility there is that a publication is belong to a professor.
 30 | 
 31 | #### Paramters
 32 | 
 33 | ##### correct_pubs
 34 | 
 35 |  A list of this professor's publications. This is a list of dictionaries. And the dictionary should have following fields.
 36 | 
 37 | |  name   |        Introdcution         |
 38 | | :-----: | :-------------------------: |
 39 | |  title  | `string`, publication title |
 40 | | authors | A list of `string`, authors |
 41 | |  year   | `integer`, publication year |
 42 | 
 43 | For example:
 44 | 
 45 | ```python
 46 | {
 47 |     "title": "Study of quantitative elastography with supersonic shear imaging in the diagnosis of breast tumours",
 48 |     "year": 2013,
 49 |     "authors": [
 50 |         "Zhili Wang",
 51 |         "Junlai Li",
 52 |         "Min Li",
 53 |         "Yan Huang",
 54 |         "WenBo Wan",
 55 |         "Jie Tang"
 56 |     ]
 57 | }
 58 | ```
 59 | 
 60 | ##### unsure_pubs
 61 | 
 62 |  A list of unsure publications. The format is similar to `correct_pubs`
 63 | 
 64 | ##### threshold
 65 | 
 66 | If the possibility of a publication is smaller than this threshold, it won't consider as a correct publication.
 67 | 
 68 | ##### trans
 69 | 
 70 | In fact, the classifier can only work on **Chinese** words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.  It should be able to translate a list of **strings** in another language to Chinese.
 71 | 
 72 | In default, we provide a translator based on [youdao api](http://fanyi.youdao.com/).  But you cannot use this translator too often because it is only a free version. 
 73 | 
 74 | #### Return value
 75 | 
 76 | `(a,b)`, two list of unsure publications and their possibilities. The first one has high possibilities, and the second one has low possibilities.
 77 | 
 78 | ### ranking
 79 | 
 80 | ```python
 81 | def ranking(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate)
 82 | ```
 83 | 
 84 | #### Introduction
 85 | 
 86 | Predict how much possibility there is that a publication is belong to a professor. The algorithm will be used just for one time.
 87 | 
 88 | #### Parameters
 89 | 
 90 | #### Return value
 91 | 
 92 | Similar to `label`
 93 | 
 94 | ## Test
 95 | 
 96 | The experiment is based on the publications from [Jie Tang  (唐杰)](http://www.aminer.cn/profile/jie-tang/53f46a3edabfaee43ed05f08) and [Jie Tang  (唐捷)](http://www.aminer.cn/profile/jie-tang/542edff0dabfae498ae3c756) .
 97 | 
 98 | I use [baidu translation](http://api.fanyi.baidu.com/api/trans/product/index) as the translator and set `threshold = 0.5`
 99 | 
100 | | Precision Rate | Recall Rate | F1 Score |
101 | | :------------: | :---------: | :------: |
102 | |     0.960      |    0.705    |  0.813   |


--------------------------------------------------------------------------------
/doc/paper_ranker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/doc/paper_ranker.png


--------------------------------------------------------------------------------
/model/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/model/README.md


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/src/__init__.py


--------------------------------------------------------------------------------
/src/aiclassifier.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | Introduction:
  4 |     AIClassifier is a class which is used to classify AI subjects.
  5 |     It depends on the classification of Natural Science Foundation of China(NSFC).
  6 | Usage:
  7 |     >>> ai_nsfc = AIClassifier()
  8 |     >>> words = ['search engine']
  9 |     >>> subject = ai_nsfc.classify(words)
 10 | '''
 11 | import os
 12 | from config import model_path
 13 | from sklearn.externals import joblib
 14 | from collections.abc import Iterable
 15 | 
 16 | 
 17 | data_path = os.path.join(model_path, 'nsfc')
 18 | 
 19 | 
 20 | class AIClassifier:
 21 | 
 22 |     def __init__(self, path=data_path):
 23 |         self._mat = []
 24 |         for level in range(3):
 25 |             file = os.path.join(path, 'ai_lev{}_w.pkl'.format(level))
 26 |             self._mat.append(joblib.load(file))
 27 | 
 28 |         file_id2name = os.path.join(path, 'id2name.pkl')
 29 |         file_id2father = os.path.join(path, 'id2father.pkl')
 30 |         self._id2name = joblib.load(file_id2name)
 31 |         self._id2father = joblib.load(file_id2father)
 32 | 
 33 |     def classify(self, words):
 34 |         '''
 35 |         Get the classification of the key words and its subject tree
 36 |         :param words: A key words list. Accept both English words and Chinese words
 37 |         :return:
 38 |                 A dictionary contains four items:
 39 |                 'level{x}'(x = 1, 2, 3): Related subjects of the words on level x.
 40 |                 'tree': Subject trees of the given words(a list of dictionary).
 41 |         '''
 42 |         distribution = self._get_all_level_distribution(words)
 43 |         ret = self._get_all_info(words, distribution)
 44 |         subject_tree = self.get_tree(words, distribution)
 45 |         ret['tree'] = subject_tree
 46 |         return ret
 47 | 
 48 |     def get_tree(self, words, _distribution=None):
 49 |         '''
 50 |         Get a related subject tree using the key words
 51 |         :param words: A key words list. Accept both English words and Chinese words
 52 |         :param _distribution: A param designed to reuse codes.
 53 |         :return: Subject trees of the given words(a list of dictionary).
 54 |         '''
 55 |         subject_tree = {}
 56 |         if _distribution is None:
 57 |             _distribution = self._get_all_level_distribution(words)
 58 |         for level in range(0, 3):
 59 |             for k, p in _distribution[level].items():
 60 |                 self._insert_subject2tree(k, p, subject_tree)
 61 |         return self._format_tree(subject_tree)
 62 | 
 63 |     def classify_level(self, words, level=1, lang_zh=False):
 64 |         '''
 65 |         Get the most likely subject names at the given level according to some key words
 66 |         :param words: A key words list. Accept both English words and Chinese words
 67 |         :param level: classification level(1, 2 or 3)
 68 |         :param zh: Whether to use Chinese subject names 
 69 |         :return: A list contains related subject names at the level
 70 |         '''
 71 |         if level not in [1, 2, 3]:
 72 |             return []
 73 |         main_subjects = self._get_all_level_distribution(words)
 74 |         if lang_zh:
 75 |             ret_iter = map(lambda x: self._get_zh_name(x), main_subjects[level - 1].keys())
 76 |         else:
 77 |             ret_iter = map(lambda x: self._get_name(x), main_subjects[level - 1].keys())
 78 |         return list(ret_iter)
 79 | 
 80 |     def _get_father_id(self, nsfc_id):
 81 |         return self._id2father.get(nsfc_id)
 82 | 
 83 |     def _get_name(self, nsfc_id):
 84 |         return self._id2name[nsfc_id][0]
 85 | 
 86 |     def _get_zh_name(self, nsfc_id):
 87 |         return self._id2name[nsfc_id][1]
 88 | 
 89 |     def _get_ancestors_list(self, nsfc_id):
 90 |         # get all of ancestors of a node on the subject tree
 91 |         ancestors = []
 92 |         father_id = self._get_father_id(nsfc_id)
 93 |         if father_id:
 94 |             ancestors.append(self._get_zh_name(father_id))
 95 |             # from top to bottom
 96 |             return self._get_ancestors_list(father_id) + ancestors
 97 |         else:
 98 |             return ancestors
 99 | 
100 |     def _get_all_info(self, words, distribution=None):
101 |         # Get the most likely subject names and their values at three levels
102 |         ret = {}
103 |         if distribution is None:
104 |             distribution = self._get_all_level_distribution(words)
105 |         for level in range(0, 3):
106 |             level_name = 'level{}'.format(level + 1)
107 |             ret[level_name] = []
108 |             for k, p in distribution[level].items():
109 |                 ret[level_name].append({
110 |                     'p': p,
111 |                     'name': self._get_name(k),
112 |                     'name_zh': self._get_zh_name(k)
113 |                 })
114 |         return ret
115 | 
116 |     def _insert_subject2tree(self, nsfc_id, prob, tree):
117 |         # insert a new node to a subject tree
118 |         ancestors = self._get_ancestors_list(nsfc_id)
119 |         point = tree
120 |         node_name = self._get_zh_name(nsfc_id)
121 |         if ancestors:
122 |             for ancestor in ancestors:
123 |                 point = point.setdefault(ancestor, {'value': None})
124 |                 point = point.setdefault('child', {})
125 |         point[node_name] = {'name': node_name, 'value': prob}
126 | 
127 |     def _get_all_level_distribution(self, words):
128 |         restrict = None
129 |         ret = []
130 |         for i in range(0, 3):
131 |             distri = self._get_distribution(words, i, restrict)
132 |             main_subjects = self._get_main_subject(distri)
133 |             ret.append(main_subjects)
134 |             # ensure there is no repetition
135 |             restrict = main_subjects.keys()
136 |         return ret
137 | 
138 |     def _get_distribution(self, words, level, restrict=None, ban=None):
139 |         # get the weight distribution at the given level
140 |         rs = {}
141 |         if words and isinstance(words, Iterable):
142 |             for w in words:
143 |                 data = self._mat[level].get(w.lower(), {})
144 |                 for sub_id, v in data.items():
145 |                     if ban is None or sub_id not in ban:
146 |                         if restrict is None or self._id2father.get(sub_id) in restrict:
147 |                             rs.setdefault(sub_id, 0)
148 |                             rs[sub_id] += v
149 |             self._norm(rs)
150 |         return rs
151 | 
152 |     def _norm(self, dict_data):
153 |         # normalize the value of a dictionary
154 |         s = sum(dict_data.values())
155 |         for k, v in dict_data.items():
156 |             dict_data[k] = v / s
157 | 
158 |     def _format_tree(self, tree):
159 |         '''
160 |         In order to insert a node to a tree, we use subject names as keys.
161 |         This function can format a subject tree(dictionary) by using the nodes itself as keys.
162 |         '''
163 |         new_tree = []
164 |         for k, v in tree.items():
165 |             child = v.get('child')
166 |             if child:
167 |                 new_child = self._format_tree(child)
168 |                 new_tree.append({'name': k, 'value': v['value'], 'children': new_child})
169 |             else:
170 |                 new_tree.append({'name': k, 'value': v['value']})
171 |         return new_tree
172 | 
173 |     def _get_main_subject(self, distribution, thresh_prob=0.6, min_prob=0.1, dec_drop=10):
174 |         # select the most possible subjects
175 |         dis_len = len(distribution)
176 |         if dis_len == 0:
177 |             return {}
178 |         sorted_distribution = sorted(distribution.items(), key=lambda x: -x[1])
179 |         # after sorting, the dict becomes a list of pairs, item[0]: nsfc id, item[1]: its value
180 |         ret = {sorted_distribution[0][0]: sorted_distribution[0][1]}
181 |         sum_value = sorted_distribution[0][1]
182 |         for i in range(1, dis_len):
183 |             prev_value = sorted_distribution[i-1][1]
184 |             now_value = sorted_distribution[i][1]
185 |             if now_value < min_prob or (prev_value - now_value) / now_value > dec_drop:
186 |                 break
187 |             ret[sorted_distribution[i][0]] = sorted_distribution[i][1]
188 |             sum_value += now_value
189 |             if sum_value > thresh_prob:
190 |                 break
191 |         self._norm(ret)
192 |         return ret
193 | 
194 | if __name__ == '__main__':
195 |     words = [
196 |         'Controlled Experiment',
197 |         'Fit Tables.',
198 |         'Executable Test Case',
199 |         'Source Code',
200 |         'Static Analysis',
201 |         'Comprehension Task',
202 |         'Legacy System',
203 |         'Web Applications',
204 |         'Genetic Algorithm',
205 |         'Test Case',
206 |         'Security Testing',
207 |         'Empirical Study',
208 |         'Acceptance Testing',
209 |         'Data Model',
210 |         'Fit Table',
211 |         'Case Study',
212 |         'Crosscutting Concern',
213 |         'Web Application',
214 |         'Empirical Studies',
215 |         'Aspect Oriented Programming'
216 |     ]
217 |     aic = AIClassifier()
218 |     print(aic.classify(words))


--------------------------------------------------------------------------------
/src/classifier.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | '''
 3 | Introduction:
 4 |     Classifier is a class which is used to classify publications according to their subjects. It depends on the classification of [Natural Science Foundation of China(NSFC)](http://www.nsfc.gov.cn/nsfc/cen/xmzn/2019xmzn/15/index.html). 
 5 | Usage:
 6 |     >>> nsfc = Classifier()
 7 |     >>> pub_titles = ['Annotating gene sets by mining large literature collections with protein networks.']
 8 |     >>> subject = nsfc.classify(pub_titles)
 9 | '''
10 | 
11 | import os
12 | import fastText
13 | import re
14 | import jieba
15 | from config import model_path
16 | from utils.translator import youdao_translate
17 | 
18 | data_path = os.path.join(model_path, 'nsfc')
19 | 
20 | 
21 | class Classifier:
22 | 
23 |     def __init__(self, path=data_path):
24 |         self._clf = [
25 |             fastText.load_model(os.path.join(path, 'clf0.bin')),
26 |             fastText.load_model(os.path.join(path, 'clf1.bin')),
27 |             fastText.load_model(os.path.join(path, 'clf2.bin')),
28 |         ]
29 |         self._zh_chars = re.compile(r'[^\u4e00-\u9fff]+')
30 |         self._id2name = dict()
31 |         with open(os.path.join(data_path, 'nsfc_subject.csv'), encoding='utf-8') as f:
32 |             for line in f:
33 |                 _id, _name = line[:-1].split(',') # encode csv
34 |                 self._id2name[_id] = _name
35 | 
36 |     def _get_name(self, code):
37 |         # Get the name from the given subject code
38 |         return self._id2name[code]
39 | 
40 |     def _get_code(self, label):
41 |         '''
42 |         In the model, we use '__label__' + NSFC subject code as labels.
43 |         This function can extract the subject code from a label.
44 |         '''
45 |         return label[9:]
46 | 
47 |     def _tokenize(self, pubs, lang_zh=False, translatation_func=youdao_translate):
48 |         # Convert a sequence of characters into a sequence of tokens
49 |         if not lang_zh:
50 |             text_zh = translatation_func(pubs)
51 |         else:
52 |             text_zh = pubs
53 |         words = []
54 |         for s in text_zh:
55 |             # delete all characters which are not Chinese
56 |             all_zh = self._zh_chars.sub('', s)
57 |             words.extend(jieba.lcut(all_zh))
58 |         return words
59 | 
60 |     def classify(self, pub_titles, level=0, ntop=5, lang_zh=False, translatation_func=youdao_translate):
61 |         '''
62 |         Use publications' titles to classify which subjects these publications belong to.
63 |         :param pub_titles: A list of publication titles
64 |         :param level: Classification level(1,2,3), for other numbers you will get all of levels
65 |         :param ntop: How many subjects in each level does the classifier select
66 |         :param lang_zh: Whether the titles are Chinese or not. For True, it means you are using Chinese publications.
67 |         :param translation_func: In fact, the classifier can only work on Chinese words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.  It should be able to translate a list of strings in another language to Chinese.
68 |         :return: A dictionary:
69 |                  'level{x}'(x = 1, 2, 3)':
70 |                      {
71 |                         'code': subject code
72 |                         'name': subject name
73 |                         'p': probability
74 |                      }
75 |         '''
76 |         ret = {}
77 |         words = self._tokenize(pub_titles, lang_zh=lang_zh, translatation_func=translatation_func)
78 |         if words == []:
79 |             return ret
80 |         text = ' '.join(words)
81 |         for i in range(0, 3):
82 |             if i + 1 == level or level not in [1, 2, 3]:
83 |             #  level number equals its index plus one
84 |                 level_name = 'level{}'.format(i+1)
85 |                 ret[level_name] = self._clf[i].predict(text, ntop)
86 |         # format
87 |         for key, value in ret.items():
88 |             new_value = []
89 |             for label, prob in zip(value[0],value[1]):  # combine each label and its prob into a pair
90 |                 subject_code = self._get_code(label)
91 |                 new_value.append({
92 |                     'code': subject_code,
93 |                     'name': self._get_name(subject_code),
94 |                     'p': prob
95 |                 })
96 |             ret[key] = new_value
97 |         return ret


--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | # The directory of models
 3 | src_path = os.path.dirname(os.path.abspath(__file__))
 4 | base_path = os.path.dirname(src_path)
 5 | model_path = os.path.join(base_path, 'model')
 6 | '''
 7 | Please put your api key here
 8 | You can know how to get it in https://console.faceplusplus.com/documents/7079083.
 9 | api_key = {
10 |     'api_key': '',
11 |     'api_secret': ''
12 | }
13 | '''
14 | api_key = None
15 | 


--------------------------------------------------------------------------------
/src/expertrec.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Introduction:
 3 |     ExpertRec is a class which is used to recommend some experts in the given text's field.
 4 | Usage:
 5 | >>> e = ExpertRec()
 6 | >>> print(e.search('natural language processing'))
 7 | '''
 8 | import os
 9 | import fastText
10 | import joblib
11 | import json
12 | import numpy as np
13 | import heapq
14 | from utils.acautomaton import ACAutomaton
15 | from config import model_path
16 | 
17 | data_path = os.path.join(model_path, 'expert')
18 | 
19 | 
20 | class ExpertRec:
21 | 
22 |     def __init__(self):
23 |         self._model = fastText.load_model(os.path.join(data_path, 'model_aminer'))
24 |         self._words = self._model.get_labels()
25 |         self._index_mat = joblib.load(os.path.join(data_path, 'index_mat.pkl'))
26 |         self._id2person = json.load(open(os.path.join(data_path, 'pid_list.json'), encoding='utf-8'))
27 |         self.ac = ACAutomaton(self._words)
28 |         self.base_url = 'http://www.aminer.cn/profile/{}'
29 | 
30 |     def doc2vec(self, text):
31 |         # Convert text to vector.
32 |         words = self.ac.search(text.lower().replace(' ', '_'))
33 |         s = ' '.join([w.replace(' ', '_') for w in words])
34 |         vec = self._model.get_sentence_vector(s)
35 |         return vec
36 | 
37 |     def search(self, text, num=20):
38 |         '''
39 |         Recommend some experts in the given text's field.
40 |         :param text: The text.
41 |         :param num: The number of the recommended experts.
42 |         :return: A list of dictionaries:
43 |                 {
44 |                     'id': The expert's ID in AMiner(http://www.aminer.cn/),
45 |                     'url': The expert's AMiner homepage.
46 |                     'L2 distance': Similarity. The smaller the L2 distance is , the more likely the expert is interested in the given text's field.
47 |                 }
48 |         '''
49 |         vec = self.doc2vec(text)
50 |         dist_mat = self._index_mat - vec.T
51 |         dist = np.linalg.norm(dist_mat, axis=1)
52 |         ret = [{
53 |           'id': self._id2person[i],
54 |           'url': self.base_url.format(self._id2person[i]),
55 |           'L2 distance': d
56 |         } for i, d in enumerate(dist)]
57 |         return heapq.nsmallest(num, ret, lambda x: x['L2 distance'])
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     e = ExpertRec()
62 |     print(e.search('natural language processing'))


--------------------------------------------------------------------------------
/src/gender.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Introduction:
  3 |     Gender is a class which is used to predict a person's gender.
  4 | Usage:
  5 | >>> g = Gender()
  6 | >>> print(g.predict(name='Jie Tang', org='Tsinghua University', image_url='http://www.cs.tsinghua.edu.cn/publish/cs/4616/20110330101939787483549/20190321114128398502759.jpg'))
  7 | '''
  8 | import os
  9 | import json
 10 | import re
 11 | import pickle
 12 | import requests
 13 | from urllib.parse import quote_plus
 14 | from utils.crawler import baidu_parse, google_parse, getHTMLText
 15 | from config import model_path, api_key
 16 | 
 17 | data_path = os.path.join(model_path, 'gender')
 18 | 
 19 | 
 20 | class Gender:
 21 | 
 22 |     _face_url = 'https://api-us.faceplusplus.com/facepp/v3/detect'
 23 | 
 24 |     def __init__(self):
 25 |         self._name_model = json.load(open(os.path.join(data_path, 'model_name.json')))
 26 |         self._search_model = pickle.load(open(os.path.join(data_path, 'model_page.pk'), 'rb'), encoding='latin1')
 27 | 
 28 |     @staticmethod
 29 |     def get_firstname(name):
 30 |         # get first name from a full name
 31 |         try:
 32 |             name = name.lower()
 33 |             return name.split(' ')[0]
 34 |         except Exception:
 35 |             return ''
 36 | 
 37 |     @staticmethod
 38 |     def get_words(content):
 39 |         # get words from an article
 40 |         r = re.compile(r'[a-zA-Z]+|\.\.\.')
 41 |         words = re.findall(r, content)
 42 |         return [str(word.lower()) for word in words]
 43 | 
 44 |     def name_score(self, name):
 45 |         '''
 46 |         Predict a person's gender according to his or her name.
 47 |         :param name: The person's name
 48 |         :return: A dictionary:
 49 |                 {
 50 |                     'male': probability that the person is male
 51 |                     'female': probability that the person is female
 52 |                 }
 53 |         '''
 54 |         firstname = self.get_firstname(name)
 55 |         if firstname in self._name_model.keys():
 56 |             name_gender = self._name_model[firstname]
 57 |             return {
 58 |                 'male': 1 if name_gender == 'male' else 0,
 59 |                 'female': 1 if name_gender == 'female' else 0
 60 |             }
 61 |         else:
 62 |             return {
 63 |                 'male': 0.5,
 64 |                 'female': 0.5
 65 |             }
 66 | 
 67 |     def search_score(self, name, org, source='google'):
 68 |         '''
 69 |         Predict a person's gender using search engine.
 70 |         :param name: The person's name
 71 |         :param org: The person's organization
 72 |         :param source: Search engine, baidu or google
 73 |         :return: A dictionary:
 74 |                 {
 75 |                     'male': Probability that the person is male,
 76 |                     'female': probability that the person is female
 77 |                 }
 78 |         '''
 79 |         query = quote_plus('{} {} his OR her'.format(name, org))
 80 |         if source == 'baidu':
 81 |             url = 'https://www.baidu.com/s?wd={}&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en&rn=100'.format(query)
 82 |         elif source == 'google':
 83 |             url = 'https://www.google.com.hk/search?q={}&hl=en'.format(query)
 84 |         else:
 85 |             return {
 86 |                 'male': 0.5,
 87 |                 'female': 0.5
 88 |             }
 89 |         html = getHTMLText(url)
 90 |         if source == 'baidu':
 91 |             page_info = baidu_parse(html)
 92 |         else:
 93 |             page_info = google_parse(html)
 94 |         if not page_info:
 95 |             return {
 96 |                 'male': 0.5,
 97 |                 'female': 0.5
 98 |             }
 99 |         featureHis = self._get_feature('his', page_info, name)
100 |         featureHer = self._get_feature('her', page_info, name)
101 |         numSnippets = max(len(page_info), 1)
102 |         tottf = max(float(featureHis['tf']+featureHer['tf']), 1.0)
103 |         feature = [
104 |             featureHis['tf']/tottf, featureHer['tf']/tottf,
105 |             featureHis['df']/numSnippets, featureHer['tf']/numSnippets,
106 |             int(featureHis['isNameInTitle']), int(featureHer['isNameInTitle']),
107 |             int(featureHis['isInFirstSnippt']), int(featureHer['isInFirstSnippt'])
108 |         ]
109 |         # print(feature)
110 |         mproba = self._search_model.predict_proba([feature])[0][1]
111 |         return {
112 |             'male': mproba,
113 |             'female': 1 - mproba
114 |         }
115 | 
116 |     def face_score(self, image_url=None, image_file=None):
117 |         '''
118 |         Predict a person's gender his or her photo.
119 |         :param image_url: The photo's url
120 |         :param image_file: The photo's local path
121 |         :return: A dictionary:
122 |                 {
123 |                     'male': Probability that the person is male,
124 |                     'female': Probability that the person is female
125 |                 }
126 |         '''
127 |         try:
128 |             data = {
129 |                 'api_key': api_key['api_key'],
130 |                 'api_secret': api_key['api_secret'],
131 |                 'return_landmark': '0',
132 |                 'return_attributes': 'gender'
133 |             }
134 |             if image_url is not None:
135 |                 data['image_url'] = image_url
136 |                 r = requests.post(Gender._face_url, data=data)
137 |             elif image_file is not None:
138 |                 files = {'image_file': open(image_file, 'rb')}
139 |                 r = requests.post(Gender._face_url, data=data, files=files)
140 |             else:
141 |                 return {
142 |                     'male': 0.5,
143 |                     'female': 0.5
144 |                 }
145 |             # print(r.json())
146 |             rdict = r.json()
147 |             f = rdict.get('faces', [])[0]
148 |             gender = f.get('attributes', {}).get('gender', {}).get('value')
149 |             return {
150 |                 'male': 1 if gender == 'Male' else 0,
151 |                 'female': 1 if gender == 'Female' else 0
152 |             }
153 |         except Exception as ex:
154 |             print(ex)
155 |             return {
156 |                 'male': 0.5,
157 |                 'female': 0.5
158 |             }
159 |                 
160 |     def predict(self, name, org, source='google', image_url=None, image_file=None):
161 |         '''
162 |         Predict a person's gender.
163 |         :param name: The person's name
164 |         :param org: The person's organization
165 |         :param source: Search engine, baidu or google
166 |         :param image_url: The photo's url
167 |         :param image_file: The photo's local path
168 |         :return: A dictionary:
169 |                 {
170 |                     'male': Probability that the person is male,
171 |                     'female': Probability that the person is female,
172 |                     'name': Probabilities from the person's name,
173 |                     'search': Probabilities from search engine,
174 |                     'face':  Probabilities from the person's photo
175 |                 }
176 |         '''
177 |         ret = {}
178 |         weight = {
179 |             'name': 1,
180 |             'search': 1,
181 |             'face': 1.1
182 |         }
183 |         ret['name'] = self.name_score(name)
184 |         ret['search'] = self.search_score(name, org, source=source)
185 |         ret['face'] = self.face_score(image_url, image_file)
186 |         sum_p = 0
187 |         male_v = 0
188 |         for name, data in ret.items():
189 |             if data['male'] != 0.5:
190 |                 male_v += data['male'] * weight[name]
191 |                 sum_p += weight[name]
192 |         if sum_p > 0:
193 |             male_p = male_v / sum_p
194 |         else:
195 |             male_p = 0.5
196 |         ret['male'] = round(male_p, 2)
197 |         ret['female'] = round(1 - male_p, 2)
198 |         return ret
199 | 
200 |     def _get_feature(self, feature_name, page_info, name):
201 |         # Extract some features about gender from the web page.
202 |         feature = {
203 |             'tf': 0,
204 |             'df': 0,
205 |             'isNameInTitle': False,
206 |             'isInFirstSnippt': False
207 |         }
208 |         words_name = self.get_words(name)
209 |         top3Snippets = []
210 |         for pos, snippet in enumerate(page_info):
211 |             words_title = self.get_words(snippet['title'])
212 |             words_content = self.get_words(snippet['content'])
213 |             if pos < 3:
214 |                 top3Snippets.extend(words_content)
215 |             num = words_content.count(feature_name)
216 |             if feature['isNameInTitle'] is False:
217 |                 if num > 0 and words_name[0] in words_title:
218 |                     feature['isNameInTitle'] = True
219 |             feature['tf'] += num
220 |             if num > 0:
221 |                 feature['df'] += 1
222 |         if feature_name in top3Snippets:
223 |             feature['isInFirstSnippt'] = True
224 |         return feature


--------------------------------------------------------------------------------
/src/jobhopping.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Introduction:
  3 |     JobHopping is a class which is used to predict where a scholar may hop to.
  4 | Usage:
  5 | >>> j = JobHopping()
  6 | >>> print(j.predict('tsinghua university'))
  7 | '''
  8 | import pickle
  9 | import os
 10 | import torch
 11 | import torch.nn as nn
 12 | import heapq
 13 | import numpy as np
 14 | from prediction_api.src.config import model_path
 15 | import torch.nn.functional as F
 16 | 
 17 | os.environ["CUDA_VISIBLE_DEVICES"] = '0'
 18 | data_path = os.path.join(model_path, 'jobhopping')
 19 | 
 20 | 
 21 | class GRUfn(nn.Sequential):
 22 | 
 23 |     def __init__(self, input_size, hidden_size, output_size):
 24 |         super(GRUfn, self).__init__()
 25 |         self.hidden_size = hidden_size
 26 |         self.input_size = input_size
 27 |         self.sig    = nn.Sigmoid()
 28 |         self.cr = nn.GRU(input_size=input_size, hidden_size=hidden_size)
 29 |         self.fn = nn.Linear(hidden_size, output_size)
 30 |         self.fn2 = nn.Linear(hidden_size, output_size)
 31 | 
 32 |     def forward(self, x, y=None, batch=256):
 33 |         if y is not None:
 34 |             x, y = self.cr(x, y)
 35 |         else:
 36 |             x, y = self.cr(x)
 37 |         x = torch.nn.utils.rnn.pad_packed_sequence(x)
 38 |         r = torch.transpose(x[0], 0, 1)
 39 |         y = y.view(batch, self.hidden_size)
 40 |         ind = x[1].view(batch, 1, 1)
 41 |         ind = ind - 1
 42 |         ind = ind.expand(-1, -1, self.hidden_size)
 43 |         t = torch.gather(r, 1, ind)
 44 |         t = t.view(batch, self.hidden_size)
 45 |         t = self.fn(t)
 46 |         y = self.fn2(y)
 47 |         t = t + y
 48 |         t = self.sig(t)
 49 |         return t
 50 | 
 51 | 
 52 | class JobHopping:
 53 | 
 54 |     def __init__(self):
 55 |         self._id2name = {}
 56 |         self._name2id = {}
 57 |         self._model_data = torch.load(os.path.join(data_path, 'model'))
 58 |         self._affi = self._model_data['affi_tensor']
 59 | 
 60 |         with open(os.path.join(data_path, 'orgID2orgname'), 'rb') as file:
 61 |             _data = pickle.load(file)
 62 |             for i, v in enumerate(_data):
 63 |                 self._id2name[i] = v.split('+')[0]
 64 |                 self._name2id.setdefault(v.split('+')[0], i)
 65 | 
 66 |         self._INPUT_DIM = 128
 67 |         self._OUTPUT_DIM = len(self._id2name.keys())
 68 |         self._model = GRUfn(self._INPUT_DIM, 512, self._OUTPUT_DIM)
 69 |         self._model.load_state_dict(self._model_data['state_dict'])
 70 | 
 71 |     def predict(self, name_squence, ntop=3):
 72 |         '''
 73 |         get a scholar's possible future affiliation according to
 74 |         his current affiliation's name
 75 |         :param name: the scholar's affiliation name
 76 |         :param ntop: How many possible affiliations will the method return
 77 |         :return: A list of dictionaries:
 78 |                 {
 79 |                     'name': the most likely future affiliation's name
 80 |                     'p': the probability
 81 |                 }
 82 |         '''
 83 | 
 84 |         name_squence = [x.lower() for x in name_squence]
 85 |         name2id_squence = [self._name2id[name] for name in name_squence if name in self._name2id.keys()]
 86 |         # if len(name_squence) != len(name2id_squence):
 87 |         #     return None
 88 |         temp_squence = name2id_squence
 89 |         name2id_squence = []
 90 |         if len(temp_squence) != 0:
 91 |             name2id_squence.append(temp_squence[0])
 92 |             [name2id_squence.append(term) for index, term in enumerate(temp_squence) if index != 0 and term != temp_squence[index - 1]]
 93 |         else:
 94 |             return None
 95 |         # 去掉重复环路
 96 |         name2id_squence = self._delete_ring(name2id_squence)
 97 |         zb = self._id2PackedSequence(name2id_squence)
 98 |         fout = self._model(zb, batch=1)
 99 |         # softmax_fout = F.softmax(fout,1)
100 |         # ans = heapq.nlargest(ntop, enumerate(softmax_fout.data.numpy()[0]), key=lambda x:x[1])
101 |         ans = heapq.nlargest(ntop, enumerate(fout.data.numpy()[0]), key=lambda x:x[1])
102 |         ret = []
103 |         for id, p in ans:
104 |             ret.append({
105 |                 'name': self._id2name[id],
106 |                 'p': p,
107 |             })
108 |         self._softmax(ret)
109 |         return ret
110 |     def _delete_ring(self,id_squence):
111 |         clear_squence = id_squence
112 |         itmes = 1000
113 |         while True:
114 |             res = self._getNumofCommonSubstr(clear_squence, clear_squence)
115 |             if res[1] < 2:
116 |                 break
117 |             a = "_".join([str(ss) for ss in res[0]])
118 |             b = "_".join([str(ss) for ss in clear_squence])
119 |             temp = b
120 |             times = 1000
121 |             while times > 1:
122 |                 if b.rfind(a) != -1:
123 |                     temp = b
124 |                     b = self._rreplace(b, a, "_", 1)
125 |                     times -= 1
126 |                 else:
127 |                     break
128 |             clear_squence = [int(term) for term in temp.split("_") if term != ""]
129 |             # id_squence = [int(s) for s in clear_squence]
130 |         # clear_squence = id_squence
131 |         return clear_squence
132 | 
133 |     def _getNumofCommonSubstr(self,str1, str2):
134 |         lstr1 = len(str1)
135 |         lstr2 = len(str2)
136 |         record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # 多一位
137 |         maxNum = 0  # 最长匹配长度
138 |         p = 0  # 匹配的起始位
139 | 
140 |         for i in range(lstr1):
141 |             for j in range(lstr2):
142 |                 if str1[i] == str2[j] and abs(i - j) > maxNum:
143 |                     # 相同则累加
144 |                     record[i + 1][j + 1] = record[i][j] + 1
145 |                     if record[i + 1][j + 1] > maxNum:
146 |                         # 获取最大匹配长度
147 |                         maxNum = record[i + 1][j + 1]
148 |                         # 记录最大匹配长度的终止位置
149 |                         p = i + 1
150 |         # return p - maxNum,p, maxNum
151 |         return str1[p - maxNum:p], maxNum
152 | 
153 |     def _rreplace(self,st, old, new, *max):
154 |         count = len(st)
155 |         if max and str(max[0]).isdigit():
156 |             count = max[0]
157 |         return new.join(st.rsplit(old, count))
158 | 
159 |     def _id2PackedSequence(self, affi_id):
160 |         # 输入的形状可以是 (T×B×*)。T 是最长序列长度，B 是 batch size，* 代表任意维度 (可以是 0)。如果 batch_first=True 的话，那么相应的 input size 就是 (B×T×*)。
161 |         ret = torch.zeros(1, len(affi_id), self._INPUT_DIM)
162 |         indices = torch.tensor(affi_id, device='cpu', dtype=torch.long)
163 |         ret[0] = torch.index_select(self._affi, 0, indices)
164 |         return torch.nn.utils.rnn.pack_padded_sequence(ret, [len(affi_id)],batch_first=True)
165 | 
166 |     def _softmax(self, affis):
167 |         # Softmax is a generalization of logistic function that "squashes"(maps) a vector of arbitrary real values to a vector of real values in the range (0, 1) that add up to 1.
168 |         s = sum(map(lambda x: np.exp(x['p']), affis))
169 |         for dict in affis:
170 |             dict['p'] = round(np.exp(dict['p'])/s, 2)


--------------------------------------------------------------------------------
/src/paperranker.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Introduction:
  3 | PaperRanker is a class which is used to predict how much possibility there is that a publication is belong to a professor.
  4 | Usage:
  5 | >>> ret, res = pr.label(a, b, threshold=0.5)
  6 | '''
  7 | from classifier import Classifier
  8 | from Levenshtein import jaro_winkler
  9 | from utils.translator import youdao_translate
 10 | import copy
 11 | import time
 12 | 
 13 | 
 14 | class PaperRanker:
 15 | 
 16 |     def __init__(self, use_clf=False):
 17 |         self.clf = Classifier()
 18 |         self.use_clf = use_clf
 19 |         self.weight = {
 20 |            'coauthor_score': 0.7 if use_clf else 0.9,
 21 |            'pubyear_score': 0.1,
 22 |         }
 23 |         if use_clf:
 24 |             self.weight['field_score'] = 0.2
 25 |         print(self.weight)
 26 | 
 27 |     def ranking(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate):
 28 |         '''
 29 |         Predict how much possibility there is that a publication is belong to a professor.
 30 |         :param correct_pubs: A list of this professor's publications.
 31 |         :param unsure_pubs: A list of unsure publications
 32 |         :param threshold: If the possibility of a publication is smaller than this threshold, it won't consider as a correct publication.
 33 |         :param trans: In fact, the classifier can only work on Chinese words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.  It should be able to translate a list of strings in another language to Chinese.
 34 |         :return: (a,b), two list of unsure publications and their possibilities. The first one has high possibilities, and the second one has low possibilities. 
 35 |         '''
 36 |         ret = copy.deepcopy(unsure_pubs)
 37 |         ret = self.coauthor_score(correct_pubs, ret)
 38 |         if self.use_clf:
 39 |             ret = self.field_score(correct_pubs, ret, trans=trans)
 40 |         ret = self.pubyear_score(ret)
 41 |         for pub in ret:
 42 |             pub['score'] = 0
 43 |             for name, weight in self.weight.items():
 44 |                 pub['score'] += pub[name] * weight
 45 |         res = [pub for pub in ret if pub['score'] < threshold]
 46 |         ret = [pub for pub in ret if pub['score'] >= threshold]
 47 |         sorted(ret, key=lambda x: x['score'], reverse=True)
 48 |         sorted(res, key=lambda x: x['score'], reverse=True)
 49 |         return ret, res
 50 | 
 51 |     def label(self, correct_pubs, unsure_pubs, threshold=0.5, trans=youdao_translate):
 52 |         '''
 53 |         Use iterative algorithm to predict how much possibility there is that a publication is belong to a professor.
 54 |         :param correct_pubs: A list of this professor's publications.
 55 |         :param unsure_pubs: A list of unsure publications
 56 |         :param threshold: If the possibility of a publication is smaller than this threshold, it won't consider as a correct publication.
 57 |         :param trans: In fact, the classifier can only work on Chinese words because of the classification standard and the training data. In order to handle publications in other languages, you need to provide a translation function.  It should be able to translate a list of strings in another language to Chinese.
 58 |         :return: (a,b), two list of unsure publications and their possibilities. The first one has high possibilities, and the second one has low possibilities. 
 59 |         '''
 60 |         co = []
 61 |         uns = copy.deepcopy(unsure_pubs)
 62 |         cnt = 1
 63 |         while True:
 64 |             print('round {}'.format(cnt))
 65 |             cnt += 1
 66 |             ret, res = self.ranking(co + correct_pubs, uns, threshold, trans=trans)
 67 |             co = co + ret
 68 |             uns = res
 69 |             if len(ret) == 0:
 70 |                 break
 71 |             time.sleep(4)
 72 |         return co, uns
 73 | 
 74 |     def coauthor_score(self, correct_pubs, unsure_pubs):
 75 |         authors = set()
 76 |         for pub in correct_pubs:
 77 |             authors = authors.union(set(pub['authors']))
 78 |         for pub in unsure_pubs:
 79 |             num = 0
 80 |             for name_b in pub['authors']:
 81 |                 score = 0
 82 |                 for name_a in authors:
 83 |                     score = max(score, self.name_match(name_a, name_b))
 84 |                 num += score
 85 |             if len(pub['authors']) == 0:
 86 |                 pub['coauthor_score'] = 0
 87 |             else:
 88 |                 pub['coauthor_score'] = min(1.0, max(num/len(pub['authors']), num*0.12))
 89 |         return unsure_pubs
 90 | 
 91 |     def field_score(self, correct_pubs, unsure_pubs, trans=youdao_translate):
 92 |         titles = list(map(lambda x: x['title'], correct_pubs))
 93 |         distribution = self.clf.classify(pub_titles=titles, level=1, ntop=5, translatation_func=trans)
 94 |         codes = set(map(lambda x: x['code'], distribution['level1']))
 95 |         cnt = 0
 96 |         for pub in unsure_pubs:
 97 |             cnt += 1
 98 |             now_dist = self.clf.classify(pub_titles=[pub['title']], level=1, ntop=5, translatation_func=trans)
 99 |             if now_dist:
100 |                 intersect = set(map(lambda x: x['code'], now_dist['level1']))&codes
101 |             else:
102 |                 intersect = []
103 |             pub['field_score'] = len(intersect)/5
104 |         return unsure_pubs
105 | 
106 |     def pubyear_score(self, unsure_pubs):
107 |         for pub in unsure_pubs:
108 |             if pub['year']:
109 |                 year = int(pub['year'])
110 |                 pub['pubyear_score'] = min(1.0, (year - 1950) / (2019 - 1950))
111 |             else:
112 |                 pub['pubyear_score'] = 0.5
113 |         return unsure_pubs
114 | 
115 |     def name_match(self, name_a, name_b):
116 |         '''
117 |         For example, we cannot totally assert Professor 'J. Tang' and Professor 'Jie Tang' are the same person.
118 |         We use this function to estimate how much possibility that two professors' name belong to one person.
119 |         '''
120 |         name_a = name_a.lower().strip().replace('.', '').replace('-', '').replace(u'\xa0', '')
121 |         name_b = name_b.lower().strip().replace('.', '').replace('-', '')
122 |         if name_a == name_b:
123 |             return 1
124 |         elif name_a[0] != name_b[0]:
125 |             return 0
126 |         lastname_a = name_a.split(' ')[-1]
127 |         lastname_b = name_b.split(' ')[-1]
128 |         if lastname_a != lastname_b:
129 |             return 0
130 |         firstname_a = name_a.split(' ')[0]
131 |         firstname_b = name_b.split(' ')[0]
132 |         if len(firstname_a) != 1 and len(firstname_b) != 1:
133 |             return 0
134 |         return jaro_winkler(name_a, name_b)


--------------------------------------------------------------------------------
/src/tors.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Introduction:
 3 |     Predict a scholar's identity (teacher or student) and his or her degree.
 4 | usage:
 5 | >>> identity = TorS()
 6 | >>> print(identity.predict(pc=10, cn=10000, hi=40, gi=0, year_range=14))
 7 | '''
 8 | import os
 9 | import math
10 | import json
11 | import numpy as np
12 | import tensorflow as tf
13 | import pandas as pd
14 | from config import model_path
15 | 
16 | data_path = os.path.join(model_path, 'student')
17 | 
18 | 
19 | class TorS:
20 | 
21 |     def __init__(self):
22 |         self.feature_cols = [tf.feature_column.numeric_column(f) for f in ['pc', 'cn', 'hi', 'gi', 'year_range']]
23 |         self._md = tf.estimator.DNNClassifier(
24 |             hidden_units=[10, 10],
25 |             feature_columns=self.feature_cols,
26 |             model_dir=data_path,
27 |         )
28 | 
29 |     def predict(self, pc=0, cn=0, hi=0, gi=0, year_range=0):
30 |         '''
31 |         Predict whether a scholar is a teacher or a student, and then predict his degree.
32 |         :param pc: Number of papers
33 |         :param cn: Citation number
34 |         :param hi: H-index. Eg, an h-index of 25 means the researcher has 25 papers, each of which has been cited 25+ times.
35 |         :param gi: G-index. Given a set of articles ranked in decreasing order of the number of citations that they received,
36 |                    the g-index is the (unique) largest number such that the top g articles received (together) at least g^2 citations.
37 |         :param year_range: Time range of papers
38 |         :return: A dictionary:
39 |                 {
40 |                     'label': 'student' or 'teacher',
41 |                     'degree': 'undergraduate', 'master' or 'doctor'
42 |                     'p': probability
43 |                 }
44 |         '''
45 |         features = dict(pc=pc, cn=cn, hi=hi, gi=gi, year_range=year_range)
46 |         input = pd.read_json(json.dumps([features]))
47 |         output = self._md.predict(input_fn=lambda: self._pre_progress(input))
48 |         ans = [(int(item['class_ids'][0]), item['probabilities'][item['class_ids'][0]]) for item in output][0]
49 |         label = 'student' if ans[0] == 1 else 'teacher'
50 |         if label == 'teacher':
51 |             degree = 'doctor'
52 |         else:
53 |             degree = 'master' if pc >= 2 else 'undergraduate'
54 |         ret = {
55 |             'label': label,
56 |             'degree': degree,
57 |             'p': round(float(ans[1]), 4)
58 |         }
59 |         return ret
60 | 
61 |     def _pre_progress(self, features):
62 |         # Normalize and pass features to nn classifier for prediction
63 |         max_year_range = 53  # 53 is the max year_range in training set.
64 |         normalized_features = pd.DataFrame()
65 |         for feature in ['pc', 'cn', 'hi', 'gi']:
66 |             normalized_features[feature] = features[feature].apply(lambda x: math.log(x + 1.0))
67 |         normalized_features['year_range'] = features['year_range'].apply(lambda x: x / max_year_range)
68 |         ret = {key: np.array(value) for key, value in dict(normalized_features).items()}
69 |         ds = tf.data.Dataset.from_tensor_slices(ret)
70 |         ds = ds.batch(1).repeat(1)
71 |         ret = ds.make_one_shot_iterator().get_next()
72 |         return ret
73 | 


--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMinerOpen/prediction_api/344b9ef6cc6ea315d3e65a374d382d1a62f51ca6/src/utils/__init__.py


--------------------------------------------------------------------------------
/src/utils/acautomaton.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Introduction:
 3 | The Aho-Corasick automaton is a data structure that can quickly do a multiple-keyword search across text. It’s described in the classic paper ‘Efficient string matching: an aid to bibliographic search’: http://portal.acm.org/citation.cfm?id=360855&dl=ACM&coll=GUIDE. 
 4 | '''
 5 | 
 6 | import queue
 7 | 
 8 | 
 9 | class node:
10 |     def __init__(self, ch):
11 |         self.ch = ch
12 |         self.fail = None
13 |         self.tail = -1
14 |         self.len = 0
15 |         self.children = {}
16 | 
17 | 
18 | class ACAutomaton:
19 | 
20 |     def __init__(self, patterns=[]):
21 |         self.root = node('')
22 |         self.count = 0
23 |         self.patterns = patterns
24 |         if patterns:
25 |             for pattern in patterns:
26 |                 self.insert(pattern)
27 |             self.getfail()
28 | 
29 |     def insert(self, pattern):
30 |         # Insert a new pattern to the trie.
31 |         p = self.root
32 |         for i in pattern:
33 |             if i not in p.children.keys():
34 |                 child = node(i)
35 |                 p.children[i] = child
36 |                 p = child
37 |             else:
38 |                 p = p.children[i]
39 |         p.tail = self.count
40 |         p.len = len(pattern)
41 |         self.count += 1
42 |         return self.count
43 | 
44 |     def getfail(self):
45 |         # Use BFS algorithm to initialize 'fail' points.
46 |         q = queue.Queue()
47 |         q.put(self.root)
48 |         while not q.empty():
49 |             top = q.get()
50 |             for i in top.children.values():
51 |                 if top == self.root:
52 |                     i.fail = self.root
53 |                 else:
54 |                     p = top.fail
55 |                     while p:
56 |                         if i.ch in p.children.keys():
57 |                             i.fail = p.children[i.ch]
58 |                             break
59 |                         p = p.fail
60 |                     if not p:
61 |                         i.fail = self.root
62 |                 q.put(i)
63 | 
64 |     def search(self, text):
65 |         # Do a multiple-keyword search across text
66 |         p = self.root
67 |         ret = []
68 |         for i, ch in enumerate(text):
69 |             while ch not in p.children.keys() and p is not self.root:
70 |                 p = p.fail
71 |             if ch in p.children.keys():
72 |                 p = p.children[ch]
73 |             else:
74 |                 p = self.root
75 |             tmp = p
76 |             while tmp is not self.root:
77 |                 if tmp.tail >= 0:
78 |                     ret.append((i-tmp.len+1, -tmp.len))
79 |                     break
80 |                 else:
81 |                     tmp = tmp.fail
82 |         '''
83 |         In this project, we need to extract some patterns from the given text and these patterns should not intersect with each other.
84 |         For example, 'ac' and 'a' are both substrings of 'acb'. In this case, we just need 'ac'.
85 |         Here we use greedy algorithm to maximize the keywords' length.
86 |         '''
87 |         ret.sort()
88 |         ans = set()
89 |         end = -1
90 |         for pos, l in ret:
91 |             length = -l
92 |             if pos > end:
93 |                 ans.add(text[pos: pos + length])
94 |                 end = pos + length - 1
95 |         return list(ans)
96 | 


--------------------------------------------------------------------------------
/src/utils/crawler.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | '''
 3 | Here are some utilities which can help us search something from Baidu or Google.
 4 | '''
 5 | import requests
 6 | from bs4 import BeautifulSoup
 7 | import json
 8 | from scrapy.selector import Selector
 9 | import re
10 | 
11 | headers = {
12 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
13 |     'Accept-Encoding': 'gzip, deflate, compress',
14 |     'Accept-Language': 'en-us;q=0.5,en;q=0.3',
15 |     'Cache-Control': 'max-age=0',
16 |     'Connection': 'keep-alive',
17 |     'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
18 | }
19 | 
20 | 
21 | def getHTMLText(url):
22 |     try:
23 |         r = requests.get(url, headers=headers)
24 |         r.raise_for_status()
25 |         r.encoding = r.apparent_encoding
26 |         return r.text
27 |     except Exception:
28 |         return ''
29 | 
30 | 
31 | def baidu_parse(html):
32 |     ulist = []
33 |     soup = BeautifulSoup(html, 'lxml')
34 |     items = soup.find_all('div', {'class': 'result c-container'})
35 |     if not items:
36 |         items = soup.find_all('div', {'class': 'result c-container '})
37 |     for node in items:
38 |         try:
39 |             abstract_node = node.find('div', {'class': 'c-abstract c-abstract-en'})
40 |             if not abstract_node:
41 |                 abstract_node = node.find('div', {'class': 'c-abstract'})
42 |             ctools = node.find('div', {'class': 'c-tools'})
43 |             abstract = abstract_node.text
44 |             title = json.loads(ctools['data-tools'].replace('\\', ''))['title']
45 |             ulist.append({
46 |                 'title': title,
47 |                 'content': abstract
48 |             })
49 |         except Exception as ex:
50 |             print(str(ex))
51 |     return ulist
52 | 
53 | 
54 | def google_parse(html):
55 |     page = Selector(text=html)
56 |     rs = []
57 |     for ans in page.css('div.g'):
58 |         title = ''.join(ans.css('h3').css('*::text').extract())
59 |         content = ''.join(ans.css('span.st').css('*::text').extract())
60 |         url = ans.css('*.r a::attr(href)').extract()
61 |         try:
62 |             url = re.findall('(http.*)', url[0])
63 |             url = re.sub('&.*', '', url[0])
64 |             rs.append({
65 |                 'url': url,
66 |                 'content': content,
67 |                 'title': title,
68 |             })
69 |         except Exception:
70 |             pass
71 |     return rs
72 | 
73 | 
74 | # url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
75 | # html = getHTMLText(url)
76 | # print(baidu_parse(html))


--------------------------------------------------------------------------------
/src/utils/translator.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | 
 4 | def youdao_translate(text):
 5 |     url = 'http://fanyi.youdao.com/translate'
 6 |     if type(text) == list:
 7 |         src = ','.join(text)
 8 |     else:
 9 |         src = text
10 |     data = {
11 |         'doctype': 'json',
12 |         'type': 'EN2ZH_CN',
13 |         'i': src,
14 |     }
15 |     rs = requests.get(url=url, params=data)
16 |     try:
17 |         trans_data = rs.json()['translateResult']
18 |         tgt = [t['tgt'] for t in trans_data[0]]
19 |         return tgt
20 |     except Exception:
21 |         # print('There is an error in translation')
22 |         return []
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     print(youdao_translate(['test','apple']))


--------------------------------------------------------------------------------
/test/nsfc_test.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | sys.path.append('../src')
 4 | from classifier import Classifier
 5 | 
 6 | clf = Classifier()
 7 | f = open('nsfc_test.json', 'r', encoding='utf-8')
 8 | s = f.read()
 9 | j = json.loads(s, encoding='utf-8')
10 | data = j['nsfc']
11 | for level in [1, 2, 3]:
12 |     cnt = 0
13 |     top1 = 0
14 |     top5 = 0
15 |     length = level * 2 + 1
16 |     for item in data:
17 |         if len(item['sid']) < length:
18 |             continue
19 |         subject = clf.classify([item['title']], level=level, lang_zh=True)
20 |         if subject == {}:
21 |             continue
22 |         cnt += 1
23 |         if subject['level{}'.format(level)][0]['code'] == item['sid'][0:length]:
24 |             top1 += 1
25 |         for ret in subject['level{}'.format(level)]:
26 |             if ret['code'] == item['sid'][0:length]:
27 |                 top5 += 1
28 |                 break
29 |     print('level', level, ':', top1/cnt, ' ', top5/cnt, ' ', cnt)
30 |         


--------------------------------------------------------------------------------
/test/pageranker_test.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import json
 3 | import random
 4 | sys.path.append('../src')
 5 | from paperranker import PaperRanker
 6 | from baidu_translator import baidu_translate
 7 | 
 8 | 
 9 | pr = PaperRanker(use_clf=True)
10 | with open('pageranker_test.json', 'r', encoding='utf-8') as f:
11 |     s = f.read()
12 |     correct = json.loads(s, encoding='utf-8')[0]['confirmed']
13 | with open('542edff0dabfae498ae3c756.json', 'r', encoding='utf-8') as f:
14 |     s = f.read()
15 |     wrong = json.loads(s, encoding='utf-8')
16 | random.shuffle(correct)
17 | random.shuffle(wrong)
18 | a = correct[0:40]
19 | b = correct[40:] + wrong
20 | ret, res = pr.label(a, b, threshold=0.5, trans=baidu_translate)
21 | tp = 0
22 | fp = 0
23 | fn = 0
24 | tn = 0
25 | for item in ret:
26 |     if item['flag'] == '1':
27 |         tp += 1
28 |     else:
29 |         fp += 1
30 | print(str(tp)+' '+str(fp))
31 | for item in res:
32 |     if item['flag'] == '1':
33 |         fn += 1
34 |     else:
35 |         tn += 1
36 | print(str(fn)+' '+str(tn))
37 | 


--------------------------------------------------------------------------------