# We reuse a fraction of code in http://bitbucket.org/omerlevy/hyperwords.
# Using the numpy and similarity matrix largely speed up the evaluation process,
# compared with evaluation scripts in word2vec and GloVe

import numpy as np
import argparse
import random


def read_vectors(path, topn):
    """Read the first `topn` word vectors from a text embedding file.

    The file's first line is a header "<word_count> <dim>"; every following
    line is "<word> <v1> ... <vdim>".  When `topn` is 0, all vectors are read.

    Args:
        path: path to the embedding file (UTF-8; undecodable bytes ignored).
        topn: number of vectors to read, or 0 for all of them.

    Returns:
        (vectors, iw, wi, dim):
            vectors: dict mapping word -> np.ndarray vector.
            iw: list of words in file order (index -> word).
            wi: dict mapping word -> index (inverse of `iw`).
            dim: dimensionality parsed from the header line.
    """
    lines_num, dim = 0, 0
    vectors = {}
    iw = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        first_line = True
        for line in f:
            if first_line:
                first_line = False
                dim = int(line.rstrip().split()[1])
                continue
            lines_num += 1
            tokens = line.rstrip().split(' ')
            vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
            iw.append(tokens[0])
            if topn != 0 and lines_num >= topn:
                break
    wi = {w: i for i, w in enumerate(iw)}
    return vectors, iw, wi, dim


def read_analogy(path, iw):
    """Read an analogy benchmark file grouped by relation type.

    Lines of the form ": <type>" start a new relation type; other lines hold
    a four-word analogy question "a b c d".  A question whose first three
    words are not all in the vocabulary `iw` counts toward "total" but not
    "seen" (it cannot be answered).

    Returns:
        dict: type -> {"questions", "total", "seen", "iw", "wi"}, where
        "iw"/"wi" index only the words appearing in that type's questions.
    """
    analogy = {}
    analogy_type = ""
    vocab = set(iw)  # set membership: avoids an O(|iw|) list scan per word
    with open(path) as f:
        for line in f:
            if not line.strip():  # tolerate blank lines (original crashed on [0])
                continue
            oov = 0
            if line.strip().split()[0] == ':':
                analogy_type = line.strip().split()[1]
                analogy[analogy_type] = {}
                analogy[analogy_type]["questions"] = []
                analogy[analogy_type]["total"] = 0
                analogy[analogy_type]["seen"] = 0
                continue
            analogy_question = line.strip().split()
            for w in analogy_question[:3]:
                if w not in vocab:
                    oov = 1
            analogy[analogy_type]["total"] += 1
            if oov == 1:
                continue
            analogy[analogy_type]["seen"] += 1
            analogy[analogy_type]["questions"].append(analogy_question)

    for t in analogy:
        analogy[t]['iw'] = []
        analogy[t]['wi'] = {}
        added = set()  # mirrors the ordered 'iw' list for O(1) dedup checks
        for question in analogy[t]["questions"]:
            for w in question:
                if w not in added:
                    added.add(w)
                    analogy[t]['iw'].append(w)
        for i, w in enumerate(analogy[t]['iw']):
            analogy[t]['wi'][w] = i
    return analogy


def normalize(matrix):
    """L2-normalize each row of `matrix` (a zero row becomes NaN, which the
    downstream `np.nanargmax` calls tolerate)."""
    norm = np.sqrt(np.sum(matrix * matrix, axis=1))
    matrix = matrix / norm[:, np.newaxis]
    return matrix


def guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c):
    """Answer "a : b :: c : ?" with both additive and multiplicative objectives.

    `sims` holds (shifted) similarities between this relation type's words and
    the whole vocabulary.  The three question words are zeroed out so they
    cannot be returned as the answer.

    Returns:
        (guess_add, guess_mul): words predicted by 3CosAdd and 3CosMul.
    """
    sim_a = sims[analogy[analogy_type]["wi"][word_a]]
    sim_b = sims[analogy[analogy_type]["wi"][word_b]]
    sim_c = sims[analogy[analogy_type]["wi"][word_c]]

    add_sim = -sim_a + sim_b + sim_c
    add_sim[wi[word_a]] = 0
    add_sim[wi[word_b]] = 0
    add_sim[wi[word_c]] = 0
    guess_add = iw[np.nanargmax(add_sim)]

    # the 0.01 epsilon avoids division by zero in the multiplicative objective
    mul_sim = sim_b * sim_c * np.reciprocal(sim_a + 0.01)
    mul_sim[wi[word_a]] = 0
    mul_sim[wi[word_b]] = 0
    mul_sim[wi[word_c]] = 0
    guess_mul = iw[np.nanargmax(mul_sim)]
    return guess_add, guess_mul


def main():
    """Evaluate dense word vectors on an analogy benchmark and print accuracies."""
    myParser = argparse.ArgumentParser()
    # argparse defaults replace the original `if args.x:` fallback chain
    myParser.add_argument('-v', '--vectors', type=str,
                          default="embedding_sample/dense_small.txt",
                          help="Vectors path")
    myParser.add_argument('-a', '--analogy', type=str,
                          default="CA8/morphological.txt",
                          help="Analogy benchmark path")
    myParser.add_argument('-t', '--topn', type=int, default=0,
                          help="Read top n word vectors (0 reads all)")
    args = myParser.parse_args()

    vectors, iw, wi, dim = read_vectors(args.vectors, args.topn)
    analogy = read_analogy(args.analogy, iw)

    # Turn vectors into numpy format and normalize them
    matrix = np.zeros(shape=(len(iw), dim), dtype=np.float32)
    for i, word in enumerate(iw):
        matrix[i, :] = vectors[word]
    matrix = normalize(matrix)

    results = {}  # per-type coverage and accuracy records
    for analogy_type in analogy.keys():  # accuracy for each relation type
        correct_add_num, correct_mul_num = 0, 0
        # OOV words of this type map to a random row (original behavior)
        analogy_matrix = matrix[[wi[w] if w in wi else random.randint(0, len(wi) - 1)
                                 for w in analogy[analogy_type]["iw"]]]
        sims = analogy_matrix.dot(matrix.T)
        sims = (sims + 1) / 2  # shift similarities to positive (for mul evaluation)
        for question in analogy[analogy_type]["questions"]:
            word_a, word_b, word_c, word_d = question
            guess_add, guess_mul = guess(sims, analogy, analogy_type, iw, wi,
                                         word_a, word_b, word_c)
            if guess_add == word_d:
                correct_add_num += 1
            if guess_mul == word_d:
                correct_mul_num += 1
        total = analogy[analogy_type]["total"]
        seen = analogy[analogy_type]["seen"]
        cov = float(seen) / total if total else 0.0  # guard empty relation types
        if seen == 0:
            acc_add = 0.0
            acc_mul = 0.0
        else:
            acc_add = float(correct_add_num) / seen
            acc_mul = float(correct_mul_num) / seen
        print(analogy_type + " add/mul: " + str(round(acc_add, 3)) + "/" + str(round(acc_mul, 3)))
        # Store the results
        results[analogy_type] = {}
        results[analogy_type]["coverage"] = [cov, seen, total]
        results[analogy_type]["accuracy_add"] = [acc_add, correct_add_num, seen]
        results[analogy_type]["accuracy_mul"] = [acc_mul, correct_mul_num, seen]

    correct_add_num, correct_mul_num, seen = 0, 0, 0
    for analogy_type in results:
        correct_add_num += results[analogy_type]["accuracy_add"][1]
        correct_mul_num += results[analogy_type]["accuracy_mul"][1]
        seen += results[analogy_type]["coverage"][1]

    # print results (guard against an empty / fully-OOV benchmark)
    if seen == 0:
        print("Total accuracy (add): " + str(round(0.0, 3)))
        print("Total accuracy (mul): " + str(round(0.0, 3)))
    else:
        print("Total accuracy (add): " + str(round(float(correct_add_num) / seen, 3)))
        print("Total accuracy (mul): " + str(round(float(correct_mul_num) / seen, 3)))


if __name__ == '__main__':
    main()
# We reuse a fraction of code in http://bitbucket.org/omerlevy/hyperwords.
# Using the numpy and similarity matrix largely speed up the evaluation process,
# compared with evaluation scripts in word2vec and GloVe

import numpy as np
import argparse
import random
from scipy.sparse import dok_matrix, csr_matrix


def load_matrix(f_path):
    """Load sparse word vectors stored as "<word> <col>:<val> ..." lines.

    The first line is a header "<word_count> <dim>".

    Returns:
        (matrix, iw, wi): a scipy CSR matrix of shape (word_count, dim),
        the word list (row index -> word) and its inverse mapping.
    """
    with open(f_path, errors='ignore') as f:
        row, col, data, iw = [], [], [], []
        first_line = True
        lines_num = 0
        for line in f:
            if first_line:
                first_line = False
                header = line.rstrip().split()
                words_num = int(header[0])
                dim = int(header[1])
                continue
            parts = line.rstrip().split(' ')
            iw.append(parts[0])
            for entry in parts[1:]:  # each entry is "<column>:<value>"
                idx, val = entry.split(":")
                row.append(lines_num)
                col.append(int(idx))
                data.append(float(val))
            lines_num += 1
    wi = {w: i for i, w in enumerate(iw)}
    matrix = csr_matrix((np.array(data), (np.array(row), np.array(col))),
                        shape=(words_num, dim))
    return matrix, iw, wi


def load_vocabulary(path):
    """Read a vocabulary file (one "<word> ..." entry per line).

    Returns:
        (wi, vocab): word -> index dict and the word list in file order.
    """
    with open(path) as f:
        # filter on line.strip(): a bare newline has len > 0 but no token,
        # so the original `len(line) > 0` guard crashed on split()[0]
        vocab = [line.strip().split()[0] for line in f if line.strip()]
    return dict([(a, i) for i, a in enumerate(vocab)]), vocab


def read_analogy(path, iw):
    """Read an analogy benchmark file grouped by relation type.

    Lines of the form ": <type>" start a new relation type; other lines hold
    a four-word analogy question "a b c d".  A question whose first three
    words are not all in the vocabulary `iw` counts toward "total" but not
    "seen" (it cannot be answered).

    Returns:
        dict: type -> {"questions", "total", "seen", "iw", "wi"}, where
        "iw"/"wi" index only the words appearing in that type's questions.
    """
    analogy = {}
    analogy_type = ""
    vocab = set(iw)  # set membership: avoids an O(|iw|) list scan per word
    with open(path) as f:
        for line in f:
            if not line.strip():  # tolerate blank lines (original crashed on [0])
                continue
            oov = 0
            if line.strip().split()[0] == ':':
                analogy_type = line.strip().split()[1]
                analogy[analogy_type] = {}
                analogy[analogy_type]["questions"] = []
                analogy[analogy_type]["total"] = 0
                analogy[analogy_type]["seen"] = 0
                continue
            analogy_question = line.strip().split()
            for w in analogy_question[:3]:
                if w not in vocab:
                    oov = 1
            analogy[analogy_type]["total"] += 1
            if oov == 1:
                continue
            analogy[analogy_type]["seen"] += 1
            analogy[analogy_type]["questions"].append(analogy_question)

    for t in analogy:
        analogy[t]['iw'] = []
        analogy[t]['wi'] = {}
        added = set()  # mirrors the ordered 'iw' list for O(1) dedup checks
        for question in analogy[t]["questions"]:
            for w in question:
                if w not in added:
                    added.add(w)
                    analogy[t]['iw'].append(w)
        for i, w in enumerate(analogy[t]['iw']):
            analogy[t]['wi'][w] = i
    return analogy


def normalize(matrix):
    """L2-normalize each row of a sparse matrix by left-multiplying with a
    diagonal matrix of reciprocal row norms (keeps the result sparse)."""
    matrix2 = matrix.copy()
    matrix2.data **= 2
    norm = np.reciprocal(np.sqrt(np.array(matrix2.sum(axis=1))[:, 0]))
    normalizer = dok_matrix((len(norm), len(norm)))
    normalizer.setdiag(norm)
    matrix = normalizer.tocsr().dot(matrix)
    return matrix


def guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c):
    """Answer "a : b :: c : ?" with both additive and multiplicative objectives.

    `sims` holds similarities between this relation type's words and the whole
    vocabulary.  The three question words are zeroed out so they cannot be
    returned as the answer.

    Returns:
        (guess_add, guess_mul): words predicted by 3CosAdd and 3CosMul.
    """
    sim_a = sims[analogy[analogy_type]["wi"][word_a]]
    sim_b = sims[analogy[analogy_type]["wi"][word_b]]
    sim_c = sims[analogy[analogy_type]["wi"][word_c]]

    add_sim = -sim_a + sim_b + sim_c
    add_sim[wi[word_a]] = 0
    add_sim[wi[word_b]] = 0
    add_sim[wi[word_c]] = 0
    guess_add = iw[np.nanargmax(add_sim)]

    # the 0.01 epsilon avoids division by zero in the multiplicative objective
    mul_sim = sim_b * sim_c * np.reciprocal(sim_a + 0.01)
    mul_sim[wi[word_a]] = 0
    mul_sim[wi[word_b]] = 0
    mul_sim[wi[word_c]] = 0
    guess_mul = iw[np.nanargmax(mul_sim)]
    return guess_add, guess_mul


def main():
    """Evaluate sparse word vectors on an analogy benchmark and print accuracies."""
    myParser = argparse.ArgumentParser()
    # argparse defaults replace the original `if args.x:` fallback chain;
    # the unused `neg = 1` local was removed
    myParser.add_argument('-v', '--vectors', type=str,
                          default="embedding_sample/sparse_small.txt",
                          help="Vectors path")
    myParser.add_argument('-a', '--analogy', type=str,
                          default="CA8/morphological.txt",
                          help="Analogy benchmark path")
    args = myParser.parse_args()

    matrix, iw, wi = load_matrix(args.vectors)  # read matrix into memory
    matrix = normalize(matrix)
    analogy = read_analogy(args.analogy, iw)

    results = {}  # per-type coverage and accuracy records
    for analogy_type in analogy.keys():  # accuracy for each relation type
        correct_add_num, correct_mul_num = 0, 0
        # OOV words of this type map to a random row (original behavior)
        analogy_matrix = matrix[[wi[w] if w in wi else random.randint(0, len(wi) - 1)
                                 for w in analogy[analogy_type]["iw"]]]
        sims = analogy_matrix.dot(matrix.T)
        sims = np.array(sims.todense())  # densify for fast row indexing in guess()
        for question in analogy[analogy_type]["questions"]:
            word_a, word_b, word_c, word_d = question
            guess_add, guess_mul = guess(sims, analogy, analogy_type, iw, wi,
                                         word_a, word_b, word_c)
            if guess_add == word_d:
                correct_add_num += 1
            if guess_mul == word_d:
                correct_mul_num += 1
        total = analogy[analogy_type]["total"]
        seen = analogy[analogy_type]["seen"]
        cov = float(seen) / total if total else 0.0  # guard empty relation types
        if seen == 0:
            acc_add = 0.0
            acc_mul = 0.0
        else:
            acc_add = float(correct_add_num) / seen
            acc_mul = float(correct_mul_num) / seen
        print(analogy_type + " add/mul: " + str(round(acc_add, 3)) + "/" + str(round(acc_mul, 3)))
        # Store the results
        results[analogy_type] = {}
        results[analogy_type]["coverage"] = [cov, seen, total]
        results[analogy_type]["accuracy_add"] = [acc_add, correct_add_num, seen]
        results[analogy_type]["accuracy_mul"] = [acc_mul, correct_mul_num, seen]

    correct_add_num, correct_mul_num, seen = 0, 0, 0
    for analogy_type in results:
        correct_add_num += results[analogy_type]["accuracy_add"][1]
        correct_mul_num += results[analogy_type]["accuracy_mul"][1]
        seen += results[analogy_type]["coverage"][1]

    # print results
    # BUG FIX: the original divided by `seen` unconditionally and raised
    # ZeroDivisionError when no question was answerable; mirror the guard
    # the dense evaluator (ana_eval_dense.py) already uses.
    if seen == 0:
        print("Total accuracy (add): " + str(round(0.0, 3)))
        print("Total accuracy (mul): " + str(round(0.0, 3)))
    else:
        print("Total accuracy (add): " + str(round(float(correct_add_num) / seen, 3)))
        print("Total accuracy (mul): " + str(round(float(correct_mul_num) / seen, 3)))


if __name__ == '__main__':
    main()
165 | print("Total accuracy (mul): " + str(round(float(correct_mul_num)/seen, 3))) 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /testsets/README.md: -------------------------------------------------------------------------------- 1 | # Chinese Word Analogy Benchmarks 2 | The quality of word vectors is often evaluated by analogy question tasks. In this project, two benchmarks are exploited for evaluation. The first is CA-translated ([Chen et al., 2015](#reference)), where most analogy questions are directly translated from an English benchmark. Although CA-translated has been widely used in many Chinese word embedding papers, it only contains three types of semantic questions and covers 134 Chinese words. In contrast, CA8 ([Li et al., 2018](#reference)) is specifically designed for the Chinese language. It contains 17813 analogy questions and covers comprehensive morphological and semantic relations. 3 | 4 | ## CA8 5 | 6 | CA8 incorporates comprehensive morphological and semantic relations in Chinese. Specifically, CA8-morphological (CA8-Mor) contains 10177 morphological questions, which are constructed based on two types of relations: reduplication and semi-affixation.
CA8-semantic (CA8-Sem) contains 7636 semantic questions, which can be divided into 4 categories and 28 sub-categories. Detailed description is as follows: 7 | 8 |
| Morphological Questions: Reduplication | 11 |||||
| Category | 14 |Sub-category | 15 |POS | 16 |Morphological Function | 17 |Example | 18 |
| A | 21 |AA | 22 |Noun | 23 |Form kinship terms | 24 |爸 (dad) → 爸爸 (dad) | 25 |
| Yield every / each meaning | 28 |天 (day) → 天天 (everyday) | 29 ||||
| Measure | 32 |Yield every / each meaning | 33 |个 (-) → 个个 (every/each) | 34 |||
| Verb | 37 |Signal doing something a little bit | 38 |说 (say) → 说说 (say a little) | 39 |||
| Signal things happen briefly | 42 |看 (look) → 看看 (have a brief look) | 43 ||||
| Adjective | 46 |Intensify the adjective | 47 |大 (big) → 大大 (very big) | 48 |||
| Transform it to adverbs | 51 |慢 (slow) → 慢慢 (slowly) | 52 ||||
| A yi A | 55 |Verb | 56 |Signal trying to do something | 57 |吃 (eat) → 吃一吃 (try to eat) | 58 ||
| A lai A qu | 61 |Verb | 62 |Signal doing something repeatedly | 63 |飞 (fly) → 飞来飞去 (fly around) | 64 ||
| AB | 67 |AABB | 68 |Noun | 69 |Yield many / much meaning | 70 |山水 (mountain and river) → 山山水水 (many mountains and rivers) | 71 |
| Verb | 74 |Indicate a continuous action | 75 |说笑 (laugh and chat) → 说说笑笑 (laugh and chat for a while) | 76 |||
| Adjective | 79 |Intensify the adjective | 80 |清楚 (clear) → 清清楚楚 (very clear) | 81 |||
| Yield the meaning of not uniform | 84 |大小 (size) → 大大小小 (all sizes) | 85 ||||
| Adverb | 88 |Intensify the adverb | 89 |彻底 (completely) → 彻彻底底 (totally and completely) | 90 |||
| A li A B | 93 |Adjective | 94 |Oralize the adjective and yield derogatory meaning | 95 |慌张 (flurried) → 慌里慌张 (anxious) | 96 ||
| ABAB | 99 |Verb | 100 |Signal doing something a little bit | 101 |注意 (pay attention) → 注意注意 (pay a little attention) | 102 ||
| Adjective | 105 |Intensify the adjective | 106 |雪白 (white) → 雪白雪白 (very white) | 107 |||
| Transform it to a verb | 110 |高兴 (happy) → 高兴高兴 (make someone happy) | 111 ||||
| Morphological Questions: Semi-affixation | 119 |||
| Category | 122 |Semi-affix | 123 |Example | 124 |
| Semi-prefix | 127 |第 | 128 |一 (one) → 第一 (first) | 129 |
| 初 | 132 |一 (one) → 初一 (the first day of a lunar month) | 133 ||
| 十 | 136 |一 (one) → 十一 (eleven) | 137 ||
| 周 | 140 |一 (one) → 周一 (Monday) | 141 ||
| 星期 | 144 |一 (one) → 星期一 (Monday) | 145 ||
| 老 | 148 |虎 (tiger) → 老虎 (tiger) | 149 ||
| 小 | 152 |草 (grass) → 小草 (grass) | 153 ||
| 大 | 156 |海 (sea) → 大海 (large sea) | 157 ||
| 半 | 160 |导体 (conductor) → 半导体 (semiconductor) | 161 ||
| 单 | 164 |细胞 (cell) → 单细胞 (unicell) | 165 ||
| 超 | 168 |链接 (link) → 超链接 (hyperlink) | 169 ||
| 次 | 172 |大陆 (continent) → 次大陆 (subcontinent) | 173 ||
| 非 | 176 |常规 (conventional) → 非常规 (unconventional) | 177 ||
| 每 | 180 |次 (time) → 每次 (every time) | 181 ||
| 全 | 184 |明星 (star) → 全明星 (all star) | 185 ||
| 伪 | 188 |君子 (gentlemen) → 伪君子 (hypocrites) | 189 ||
| 亚 | 192 |热带 (tropical zone) → 亚热带 (sub-tropical zone) | 193 ||
| 洋 | 196 |酒 (wine) → 洋酒 (foreign wine) | 197 ||
| 总 | 200 |比分 (score) → 总比分 (total score) | 201 ||
| 反 | 204 |物质 (matter) → 反物质 (antimatter) | 205 ||
| 副 | 208 |总统 (president) → 副总统 (vice president) | 209 ||
| Semi-suffix | 213 |们 | 214 |我 (I) → 我们 (we) | 215 |
| 里 | 218 |这 (here) → 这里 (here) | 219 ||
| 些 | 222 |这 (this) → 这些 (these) | 223 ||
| 样 | 226 |这 (this) → 这样 (such) | 227 ||
| 个 | 230 |这 (this) → 这个 (this one) | 231 ||
| 边 | 234 |这 (this) → 这边 (here) | 235 ||
| 种 | 238 |这 (this) → 这种 (this kind) | 239 ||
| 次 | 242 |这 (this) → 这次 (this time) | 243 ||
| 儿 | 246 |这 (this) → 这儿 (here) | 247 ||
| 部 | 250 |东 (east) → 东部 (east) | 251 ||
| 中 | 254 |心 (heart) → 心中 (in the heart) | 255 ||
| 上 | 258 |山 (mountain) → 山上 (on the mountain) | 259 ||
| 面 | 262 |前 (front) → 前面 (in the front) | 263 ||
| 者 | 266 |强 (strong) → 强者 (the strong one) | 267 ||
| 家 | 270 |科学 (science) → 科学家 (scientist) | 271 ||
| 子 | 274 |胖 (fat) → 胖子 (a fat man) | 275 ||
| 头 | 278 |木 (wood) → 木头 (wood) | 279 ||
| 工 | 282 |木 (wood) → 木工 (carpentry) | 283 ||
| 匠 | 286 |木 (wood) → 木匠 (carpenter) | 287 ||
| 星 | 290 |笑 (laugh) → 笑星 (comedian) | 291 ||
| 手 | 294 |老 (old) → 老手 (old hand) | 295 ||
| 主义 | 298 |乐观 (optimistic) → 乐观主义 (optimism) | 299 ||
| 鬼 | 302 |吝啬 (stingy) → 吝啬鬼 (miser) | 303 ||
| 式 | 306 |中 (Chinese) → 中式 (Chinese style) | 307 ||
| 队 | 310 |考古 (archaeology) → 考古队 (archaeological team) | 311 ||
| 色 | 314 |黄 (yellow) → 黄色 (the yellow color) | 315 ||
| 学 | 318 |地质 (geology) → 地质学 (discipline of geology) | 319 ||
| 论 | 322 |宿命 (fate) → 宿命论 (fatalism) | 323 ||
| 站 | 326 |汽车 (bus) → 汽车站 (bus station) | 327 ||
| 仪 | 330 |光谱 (spectrum) → 光谱仪 (spectrograph) | 331 ||
| 界 | 334 |学术 (academic) → 学术界 (academia) | 335 ||
| 族 | 338 |追星 (chasing a star) → 追星族 (fans) | 339 ||
| 棍 | 342 |赌 (gamble) → 赌棍 (gambler) | 343 ||
| 灾 | 346 |雨 (rain) → 雨灾 (rain disaster) | 347 ||
| 气 | 350 |冷 (cold) → 冷气 (cold air) | 351 ||
| 性 | 354 |酸 (acid) → 酸性 (acidic) | 355 ||
| 厅 | 358 |歌 (song) → 歌厅 (KTV) | 359 ||
| 机 | 362 |复印 (copy) → 复印机 (copier) | 363 ||
| 法 | 366 |说 (say) → 说法 (saying) | 367 ||
| 剧 | 370 |粤 (Yue) → 粤剧 (Cantonese Opera) | 371 ||
| 长 | 374 |船 (ship) → 船长 (captain of a ship) | 375 ||
| Semantic Questions | 381 |||
| Category | 384 |Sub-category | 385 |Example | 386 |
| Geography | 389 |country - capital | 390 |中国 (China) - 北京 (Beijing) | 391 |
| country - currency | 394 |中国 (China) - 人民币 (Chinese yuan) | 395 ||
| province - abbreviation | 398 |广东 (Guangdong) - 粤 (Yue) | 399 ||
| province - capital | 402 |广东 (Guangdong) - 广州 (Guangzhou) | 403 ||
| province - drama | 406 |广东 (Guangdong) - 粤剧 (Cantonese Opera) | 407 ||
| province - channel | 410 |广东 (Guangdong) - 广东卫视 (Guangdong Satellite TV) | 411 ||
| province - university | 414 |浙江 (Zhejiang) - 浙江大学 (Zhejiang University) | 415 ||
| city - university | 418 |南京 (Nanjing) - 南京大学 (Nanjing University) | 419 ||
| university - abbreviation | 422 |师范大学 (Normal University) - 师大 (Normal University) | 423 ||
| History | 427 |dynasty - emperor | 428 |汉 (Han) - 刘邦 (Liu Bang) | 429 |
| dynasty - capital | 432 |秦 (Qin) - 咸阳 (Xian Yang) | 433 ||
| title - emperor | 436 |汉高祖 (Emperor Gaozu of Han) - 刘邦 (Liu Bang) | 437 ||
| celebrity - country | 440 |屈原 (Qu Yuan) - 楚国 (Country Chu) | 441 ||
| Nature | 445 |number | 446 |第一 (first) - 状元 (the first in an imperial examination) | 447 |
| time | 450 |春节 (Spring Festival) - 正月 (the first month in a lunar year) | 451 ||
| animal | 454 |公鸡 (cock) - 母鸡 (hen) | 455 ||
| plant | 458 |杏树 (apricot tree) - 杏 (apricot) | 459 ||
| ornament | 462 |手指 (finger) - 戒指 (ring) | 463 ||
| chemistry | 466 |盐 (salt) - 氯化钠 (sodium chloride) | 467 ||
| physics | 470 |冰 (ice) - 水蒸气 (steam) | 471 ||
| weather | 474 |小满 (Grain Full) - 夏天 (summer) | 475 ||
| reverse | 478 |松 (loose) - 紧 (tight) | 479 ||
| color | 482 |海 (sea) - 蓝色 (blue) | 483 ||
| People | 487 |company - founder | 488 |阿里巴巴 (Alibaba) - 马云 (Ma Yun) | 489 |
| work - scientist | 492 |地动仪 (seismograph) - 张衡 (Zhang Heng) | 493 ||
| work - writer | 496 |朝花夕拾 (Dawn Blossoms Plucked at Dusk) - 鲁迅 (Lu Xun) | 497 ||
| family - member | 500 |爷爷 (grandfather) - 孙子 (grandson) | 501 ||
| student - degree | 504 |小学 (elementary school) - 小学生 (schoolchild) | 505 ||
| Window Size | 58 |Dynamic Window | 59 |Sub-sampling | 60 |Low-Frequency Word | 61 |Iteration | 62 |Negative Sampling* | 63 |
| 5 | 66 |Yes | 67 |1e-5 | 68 |10 | 69 |5 | 70 |5 | 71 |
| Word2vec / Skip-Gram with Negative Sampling (SGNS) | 83 |||||
| Corpus | 86 |Context Features | 87 ||||
| Word | 90 |Word + Ngram | 91 |Word + Character | 92 |Word + Character + Ngram | 93 ||
| Baidu Encyclopedia 百度百科 | 96 |300d | 97 |300d | 98 |300d | 99 |300d / PWD: 5555 | 100 |
| Wikipedia_zh 中文维基百科 | 103 |300d | 104 |300d | 105 |300d | 106 |300d | 107 |
| People's Daily News 人民日报 | 110 |300d | 111 |300d | 112 |300d | 113 |300d | 114 |
| Sogou News 搜狗新闻 | 117 |300d | 118 |300d | 119 |300d | 120 |300d | 121 |
| Financial News 金融新闻 | 124 |300d | 125 |300d | 126 |300d | 127 |300d | 128 |
| Zhihu_QA 知乎问答 | 131 |300d | 132 |300d | 133 |300d | 134 |300d | 135 |
| Weibo 微博 | 138 |300d | 139 |300d | 140 |300d | 141 |300d | 142 |
| Literature 文学作品 | 145 |300d | 146 |300d / PWD: z5b4 | 147 |300d | 148 |300d / PWD: yenb | 149 |
| Complete Library in Four Sections 四库全书* | 152 |300d | 153 |300d | 154 |NAN | 155 |NAN | 156 |
| Mixed-large 综合 (Baidu Netdisk / Google Drive) | 159 |300d / 300d | 300d / 300d | 300d / 300d | 300d / 300d | 175 |
| Positive Pointwise Mutual Information (PPMI) | 181 |||||
| Corpus | 184 |Context Features | 185 ||||
| Word | 188 |Word + Ngram | 189 |Word + Character | 190 |Word + Character + Ngram | 191 ||
| Baidu Encyclopedia 百度百科 | 194 |Sparse | 195 |Sparse | 196 |Sparse | 197 |Sparse | 198 |
| Wikipedia_zh 中文维基百科 | 201 |Sparse | 202 |Sparse | 203 |Sparse | 204 |Sparse | 205 |
| People's Daily News 人民日报 | 208 |Sparse | 209 |Sparse | 210 |Sparse | 211 |Sparse | 212 |
| Sogou News 搜狗新闻 | 215 |Sparse | 216 |Sparse | 217 |Sparse | 218 |Sparse | 219 |
| Financial News 金融新闻 | 222 |Sparse | 223 |Sparse | 224 |Sparse | 225 |Sparse | 226 |
| Zhihu_QA 知乎问答 | 229 |Sparse | 230 |Sparse | 231 |Sparse | 232 |Sparse | 233 |
| Weibo 微博 | 236 |Sparse | 237 |Sparse | 238 |Sparse | 239 |Sparse | 240 |
| Literature 文学作品 | 243 |Sparse | 244 |Sparse | 245 |Sparse | 246 |Sparse | 247 |
| Complete Library in Four Sections 四库全书* | 250 |Sparse | 251 |Sparse | 252 |NAN | 253 |NAN | 254 |
| Mixed-large 综合 | 258 |Sparse | 259 |Sparse | 260 |Sparse | 261 |Sparse | 262 |
| Feature | 278 |Co-occurrence Type | 279 |Target Word Vectors | 280 |Context Word Vectors | 281 |
| Word | 285 |Word → Word | 286 |300d | 287 |300d | 288 |
| Ngram | 292 |Word → Ngram (1-2) | 293 |300d | 294 |300d | 295 |
| Word → Ngram (1-3) | 298 |300d | 299 |300d | 300 ||
| Ngram (1-2) → Ngram (1-2) | 303 |300d | 304 |300d | 305 ||
| Character | 309 |Word → Character (1) | 310 |300d | 311 |300d | 312 |
| Word → Character (1-2) | 315 |300d | 316 |300d | 317 ||
| Word → Character (1-4) | 320 |300d | 321 |300d | 322 ||
| Radical | 326 |Radical | 327 |300d | 328 |300d | 329 |
| Position | 333 |Word → Word (left/right) | 334 |300d | 335 |300d | 336 |
| Word → Word (distance) | 339 |300d | 340 |300d | 341 ||
| Global | 345 |Word → Text | 346 |300d | 347 |300d | 348 |
| Syntactic Feature | 352 |Word → POS | 353 |300d | 354 |300d | 355 |
| Word → Dependency | 358 |300d | 359 |300d | 360 |
| Corpus | 377 |Size | 378 |Tokens | 379 |Vocabulary Size | 380 |Description | 381 |
| Baidu Encyclopedia 百度百科 | 384 |4.1G | 385 |745M | 386 |5422K | 387 |Chinese Encyclopedia data from https://baike.baidu.com/ | 388 |
| Wikipedia_zh 中文维基百科 | 391 |1.3G | 392 |223M | 393 |2129K | 394 |Chinese Wikipedia data from https://dumps.wikimedia.org/ | 395 |
| People's Daily News 人民日报 | 398 |3.9G | 399 |668M | 400 |1664K | 401 |News data from People's Daily (1946-2017) http://data.people.com.cn/ | 402 |
| Sogou News 搜狗新闻 | 405 |3.7G | 406 |649M | 407 |1226K | 408 |News data provided by Sogou labs http://www.sogou.com/labs/ | 409 |
| Financial News 金融新闻 | 412 |6.2G | 413 |1055M | 414 |2785K | 415 |Financial news collected from multiple news websites | 416 |
| Zhihu_QA 知乎问答 | 419 |2.1G | 420 |384M | 421 |1117K | 422 |Chinese QA data from https://www.zhihu.com/ | 423 |
| Weibo 微博 | 426 |0.73G | 427 |136M | 428 |850K | 429 |Chinese microblog data provided by NLPIR Lab http://www.nlpir.org/wordpress/download/weibo.7z | 430 |
| Literature 文学作品 | 433 |0.93G | 434 |177M | 435 |702K | 436 |8599 modern Chinese literature works | 437 |
| Mixed-large 综合 | 440 |22.6G | 441 |4037M | 442 |10653K | 443 |We build the large corpus by merging the above corpora. | 444 |
| Complete Library in Four Sections 四库全书 | 447 |1.5G | 448 |714M | 449 |21.8K | 450 |The largest collection of texts in pre-modern China. | 451 |