├── testsets
    ├── CA8
    │   └── dataset_statistics.xlsx
    ├── README.md
    └── CA_translated
    │   └── ca_translated.txt
├── evaluation
    ├── ana_eval_dense.py
    └── ana_eval_sparse.py
├── LICENSE
└── README.md


/testsets/CA8/dataset_statistics.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chinese-Word-Vectors/master/testsets/CA8/dataset_statistics.xlsx


--------------------------------------------------------------------------------
/evaluation/ana_eval_dense.py:
--------------------------------------------------------------------------------
  1 | # We reuse a fraction of code in http://bitbucket.org/omerlevy/hyperwords.
  2 | # Using the numpy and similarity matrix largely speed up the evaluation process,
  3 | # compared with evaluation scripts in word2vec and GloVe
  4 | 
  5 | import numpy as np
  6 | import argparse
  7 | import random
  8 | 
  9 | 
 10 | def read_vectors(path, topn):  # read top n word vectors, i.e. top is 10000
 11 |     lines_num, dim = 0, 0
 12 |     vectors = {}
 13 |     iw = []
 14 |     wi = {}
 15 |     with open(path, encoding='utf-8', errors='ignore') as f:
 16 |         first_line = True
 17 |         for line in f:
 18 |             if first_line:
 19 |                 first_line = False
 20 |                 dim = int(line.rstrip().split()[1])
 21 |                 continue
 22 |             lines_num += 1
 23 |             tokens = line.rstrip().split(' ')
 24 |             vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
 25 |             iw.append(tokens[0])
 26 |             if topn != 0 and lines_num >= topn:
 27 |                 break
 28 |     for i, w in enumerate(iw):
 29 |         wi[w] = i
 30 |     return vectors, iw, wi, dim
 31 | 
 32 | 
 33 | def read_analogy(path, iw):
 34 |     analogy = {}
 35 |     analogy_type = ""
 36 |     with open(path) as f:
 37 |         for line in f:
 38 |             oov = 0
 39 |             if line.strip().split()[0] == ':':
 40 |                 analogy_type = line.strip().split()[1]
 41 |                 analogy[analogy_type] = {}
 42 |                 analogy[analogy_type]["questions"] = []
 43 |                 analogy[analogy_type]["total"] = 0
 44 |                 analogy[analogy_type]["seen"] = 0
 45 |                 continue
 46 |             analogy_question = line.strip().split()
 47 |             for w in analogy_question[:3]:
 48 |                 if w not in iw:
 49 |                     oov = 1
 50 |             if oov == 1:
 51 |                 analogy[analogy_type]["total"] += 1
 52 |                 continue
 53 |             analogy[analogy_type]["total"] += 1
 54 |             analogy[analogy_type]["seen"] += 1
 55 |             analogy[analogy_type]["questions"].append(analogy_question)
 56 | 
 57 |         for t in analogy:
 58 |             analogy[t]['iw'] = []
 59 |             analogy[t]['wi'] = {}
 60 |             for question in analogy[t]["questions"]:
 61 |                 for w in question:
 62 |                     if w not in analogy[t]['iw']:
 63 |                         analogy[t]['iw'].append(w)
 64 |             for i, w in enumerate(analogy[t]['iw']):
 65 |                 analogy[t]['wi'][w] = i
 66 |         return analogy
 67 | 
 68 | 
 69 | def normalize(matrix):
 70 |     norm = np.sqrt(np.sum(matrix * matrix, axis=1))
 71 |     matrix = matrix / norm[:, np.newaxis]
 72 |     return matrix
 73 | 
 74 | 
 75 | def guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c):
 76 |     sim_a = sims[analogy[analogy_type]["wi"][word_a]]
 77 |     sim_b = sims[analogy[analogy_type]["wi"][word_b]]
 78 |     sim_c = sims[analogy[analogy_type]["wi"][word_c]]
 79 | 
 80 |     add_sim = -sim_a+sim_b+sim_c
 81 |     add_sim[wi[word_a]] = 0
 82 |     add_sim[wi[word_b]] = 0
 83 |     add_sim[wi[word_c]] = 0
 84 |     guess_add = iw[np.nanargmax(add_sim)]
 85 | 
 86 |     mul_sim = sim_b * sim_c * np.reciprocal(sim_a+0.01)
 87 |     mul_sim[wi[word_a]] = 0
 88 |     mul_sim[wi[word_b]] = 0
 89 |     mul_sim[wi[word_c]] = 0
 90 |     guess_mul = iw[np.nanargmax(mul_sim)]
 91 |     return guess_add, guess_mul
 92 | 
 93 | 
 94 | def main():
 95 |     vectors_path = "embedding_sample/dense_small.txt"
 96 |     analogy_path = "CA8/morphological.txt"
 97 |     topn = 0
 98 |     results = {}  # Records the results
 99 |     myParser = argparse.ArgumentParser()
100 |     myParser.add_argument('-v', '--vectors', type=str, help="Vectors path")
101 |     myParser.add_argument('-a', '--analogy', type=str, help="Analogy benchmark path")
102 |     myParser.add_argument('-t', '--topn', type=int, help="Read top n word vectors")
103 |     args = myParser.parse_args()
104 |     if args.vectors:
105 |         vectors_path = args.vectors
106 |     if args.analogy:
107 |         analogy_path = args.analogy
108 |     if args.topn:
109 |         topn = args.topn
110 | 
111 |     vectors, iw, wi, dim = read_vectors(vectors_path, topn)  # Read top n word vectors. Read all vectors when topn is 0
112 |     analogy = read_analogy(analogy_path, iw)  # Read analogy questions
113 | 
114 |     # Turn vectors into numpy format and normalize them
115 |     matrix = np.zeros(shape=(len(iw), dim), dtype=np.float32)
116 |     for i, word in enumerate(iw):
117 |         matrix[i, :] = vectors[word]
118 |     matrix = normalize(matrix)
119 | 
120 |     for analogy_type in analogy.keys():  # Calculate the accuracy for each relation type
121 |         correct_add_num, correct_mul_num = 0, 0
122 |         analogy_matrix = matrix[[wi[w] if w in wi else random.randint(0, len(wi)-1) for w in analogy[analogy_type]["iw"]]]
123 |         sims = analogy_matrix.dot(matrix.T)
124 |         sims = (sims + 1)/2  # Transform similarity scores to positive numbers (for mul evaluation)
125 |         for question in analogy[analogy_type]["questions"]:  # Loop for each analogy question
126 |             word_a, word_b, word_c, word_d = question
127 |             guess_add, guess_mul = guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c)
128 |             if guess_add == word_d:
129 |                 correct_add_num += 1
130 |             if guess_mul == word_d:
131 |                 correct_mul_num += 1
132 |         cov = float(analogy[analogy_type]["seen"]) / analogy[analogy_type]["total"]
133 |         if analogy[analogy_type]["seen"] == 0:
134 |             acc_add = 0
135 |             acc_mul = 0
136 |             print(analogy_type + " add/mul: " + str(round(0.0, 3)) + "/" + str(round(0.0, 3)))
137 |         else:
138 |             acc_add = float(correct_add_num) / analogy[analogy_type]["seen"]
139 |             acc_mul = float(correct_mul_num) / analogy[analogy_type]["seen"]
140 |             print(analogy_type + " add/mul: " + str(round(acc_add, 3)) + "/" + str(round(acc_mul, 3)))
141 |         # Store the results
142 |         results[analogy_type] = {}
143 |         results[analogy_type]["coverage"] = [cov, analogy[analogy_type]["seen"], analogy[analogy_type]["total"]]
144 |         results[analogy_type]["accuracy_add"] = [acc_add, correct_add_num, analogy[analogy_type]["seen"]]
145 |         results[analogy_type]["accuracy_mul"] = [acc_mul, correct_mul_num, analogy[analogy_type]["seen"]]
146 | 
147 |     correct_add_num, correct_mul_num, seen = 0, 0, 0
148 |     for analogy_type in results:
149 |         correct_add_num += results[analogy_type]["accuracy_add"][1]
150 |         correct_mul_num += results[analogy_type]["accuracy_mul"][1]
151 |         seen += results[analogy_type]["coverage"][1]
152 | 
153 |     # print results
154 |     if seen == 0:
155 |         print("Total accuracy (add): " + str(round(0.0, 3)))
156 |         print("Total accuracy (mul): " + str(round(0.0, 3)))
157 |     else:
158 |         print("Total accuracy (add): " + str(round(float(correct_add_num)/seen, 3)))
159 |         print("Total accuracy (mul): " + str(round(float(correct_mul_num)/seen, 3)))
160 | 
161 | 
162 | if __name__ == '__main__':
163 |     main()
164 | 


--------------------------------------------------------------------------------
/evaluation/ana_eval_sparse.py:
--------------------------------------------------------------------------------
  1 | # We reuse a fraction of code in http://bitbucket.org/omerlevy/hyperwords.
  2 | # Using the numpy and similarity matrix largely speed up the evaluation process,
  3 | # compared with evaluation scripts in word2vec and GloVe
  4 | 
  5 | import numpy as np
  6 | import argparse
  7 | import random
  8 | from scipy.sparse import dok_matrix, csr_matrix
  9 | 
 10 | 
 11 | def load_matrix(f_path):
 12 |     with open(f_path, errors='ignore') as f:
 13 |         row, col, data, iw = [], [], [], []
 14 |         first_line = True
 15 |         lines_num = 0
 16 |         for line in f:
 17 |             if first_line:
 18 |                 first_line = False
 19 |                 words_num = int(line.rstrip().split()[0])
 20 |                 dim = int(line.rstrip().split()[1])
 21 |                 continue
 22 |             line = line.rstrip().split(' ')
 23 |             word = line[0]
 24 |             iw.append(word)
 25 |             vector = line[1:]
 26 |             for v in vector:
 27 |                 row.append(lines_num)
 28 |                 col.append(int(v.split(":")[0]))
 29 |                 data.append(float(v.split(":")[1]))
 30 |             lines_num += 1
 31 |         wi = {}
 32 |         for i in range(len(iw)):
 33 |             wi[iw[i]] = i
 34 |         row = np.array(row)
 35 |         col = np.array(col)
 36 |         data = np.array(data)
 37 |         matrix = csr_matrix((data, (row, col)), shape=(words_num, dim))
 38 |         return matrix, iw, wi
 39 | 
 40 | 
 41 | def load_vocabulary(path):
 42 |     with open(path) as f:
 43 |         vocab = [line.strip().split()[0] for line in f if len(line) > 0]
 44 |     return dict([(a, i) for i, a in enumerate(vocab)]), vocab
 45 | 
 46 | 
 47 | def read_analogy(path, iw):
 48 |     analogy = {}
 49 |     analogy_type = ""
 50 |     with open(path) as f:
 51 |         for line in f:
 52 |             oov = 0
 53 |             if line.strip().split()[0] == ':':
 54 |                 analogy_type = line.strip().split()[1]
 55 |                 analogy[analogy_type] = {}
 56 |                 analogy[analogy_type]["questions"] = []
 57 |                 analogy[analogy_type]["total"] = 0
 58 |                 analogy[analogy_type]["seen"] = 0
 59 |                 continue
 60 |             analogy_question = line.strip().split()
 61 |             for w in analogy_question[:3]:
 62 |                 if w not in iw:
 63 |                     oov = 1
 64 |             if oov == 1:
 65 |                 analogy[analogy_type]["total"] += 1
 66 |                 continue
 67 |             analogy[analogy_type]["total"] += 1
 68 |             analogy[analogy_type]["seen"] += 1
 69 |             analogy[analogy_type]["questions"].append(analogy_question)
 70 | 
 71 |         for t in analogy:
 72 |             analogy[t]['iw'] = []
 73 |             analogy[t]['wi'] = {}
 74 |             for question in analogy[t]["questions"]:
 75 |                 for w in question:
 76 |                     if w not in analogy[t]['iw']:
 77 |                         analogy[t]['iw'].append(w)
 78 |             for i, w in enumerate(analogy[t]['iw']):
 79 |                 analogy[t]['wi'][w] = i
 80 |         return analogy
 81 | 
 82 | 
 83 | def normalize(matrix):
 84 |     matrix2 = matrix.copy()
 85 |     matrix2.data **= 2
 86 |     norm = np.reciprocal(np.sqrt(np.array(matrix2.sum(axis=1))[:, 0]))
 87 |     normalizer = dok_matrix((len(norm), len(norm)))
 88 |     normalizer.setdiag(norm)
 89 |     matrix = normalizer.tocsr().dot(matrix)
 90 |     return matrix
 91 | 
 92 | 
 93 | def guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c):
 94 |     sim_a = sims[analogy[analogy_type]["wi"][word_a]]
 95 |     sim_b = sims[analogy[analogy_type]["wi"][word_b]]
 96 |     sim_c = sims[analogy[analogy_type]["wi"][word_c]]
 97 | 
 98 |     add_sim = -sim_a+sim_b+sim_c
 99 |     add_sim[wi[word_a]] = 0
100 |     add_sim[wi[word_b]] = 0
101 |     add_sim[wi[word_c]] = 0
102 |     guess_add = iw[np.nanargmax(add_sim)]
103 | 
104 |     mul_sim = sim_b * sim_c * np.reciprocal(sim_a+0.01)
105 |     mul_sim[wi[word_a]] = 0
106 |     mul_sim[wi[word_b]] = 0
107 |     mul_sim[wi[word_c]] = 0
108 |     guess_mul = iw[np.nanargmax(mul_sim)]
109 |     return guess_add, guess_mul
110 | 
111 | 
112 | def main():
113 |     neg = 1
114 |     vectors_path = "embedding_sample/sparse_small.txt"
115 |     analogy_path = "CA8/morphological.txt"
116 |     results = {}
117 | 
118 |     myParser = argparse.ArgumentParser()
119 |     myParser.add_argument('-v', '--vectors', type=str, help="Vectors path")
120 |     myParser.add_argument('-a', '--analogy', type=str, help="Analogy benchmark path")
121 |     args = myParser.parse_args()
122 |     if args.vectors:
123 |         vectors_path = args.vectors
124 |     if args.analogy:
125 |         analogy_path = args.analogy
126 | 
127 |     matrix, iw, wi = load_matrix(vectors_path)  # Read matrix into the memory
128 |     matrix = normalize(matrix)
129 |     analogy = read_analogy(analogy_path, iw)
130 |     for analogy_type in analogy.keys():  # Calculate the accuracy for each relation type
131 |         correct_add_num, correct_mul_num = 0, 0
132 |         analogy_matrix = matrix[[wi[w] if w in wi else random.randint(0, len(wi)-1) for w in analogy[analogy_type]["iw"]]]
133 |         sims = analogy_matrix.dot(matrix.T)
134 |         sims = np.array(sims.todense())
135 |         for question in analogy[analogy_type]["questions"]:  # Loop for each analogy question
136 |             word_a, word_b, word_c, word_d = question
137 |             guess_add, guess_mul = guess(sims, analogy, analogy_type, iw, wi, word_a, word_b, word_c)
138 |             if guess_add == word_d:
139 |                 correct_add_num += 1
140 |             if guess_mul == word_d:
141 |                 correct_mul_num += 1
142 |         cov = float(analogy[analogy_type]["seen"]) / analogy[analogy_type]["total"]
143 |         if analogy[analogy_type]["seen"] == 0:
144 |             acc_add = 0
145 |             acc_mul = 0
146 |             print (analogy_type + " add/mul: " + str(round(0.0, 3)) + "/" + str(round(0.0, 3)))
147 |         else:
148 |             acc_add = float(correct_add_num) / analogy[analogy_type]["seen"]
149 |             acc_mul = float(correct_mul_num) / analogy[analogy_type]["seen"]
150 |             print (analogy_type + " add/mul: " + str(round(acc_add, 3)) + "/" + str(round(acc_mul, 3)))
151 |         # Store the results
152 |         results[analogy_type] = {}
153 |         results[analogy_type]["coverage"] = [cov, analogy[analogy_type]["seen"], analogy[analogy_type]["total"]]
154 |         results[analogy_type]["accuracy_add"] = [acc_add, correct_add_num, analogy[analogy_type]["seen"]]
155 |         results[analogy_type]["accuracy_mul"] = [acc_mul, correct_mul_num, analogy[analogy_type]["seen"]]
156 | 
157 |     correct_add_num, correct_mul_num, seen = 0, 0, 0
158 |     for analogy_type in results:
159 |         correct_add_num += results[analogy_type]["accuracy_add"][1]
160 |         correct_mul_num += results[analogy_type]["accuracy_mul"][1]
161 |         seen += results[analogy_type]["coverage"][1]
162 | 
163 |     # print results
164 |     print("Total accuracy (add): " + str(round(float(correct_add_num)/seen, 3)))
165 |     print("Total accuracy (mul): " + str(round(float(correct_mul_num)/seen, 3)))
166 | 
167 | 
168 | if __name__ == '__main__':
169 |     main()
170 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/testsets/README.md:
--------------------------------------------------------------------------------
  1 | # Chinese Word Analogy Benchmarks
  2 | The quality of word vectors is often evaluated by analogy question tasks. In this project, two benchmarks are exploited for evaluation. The first is CA-translated ([Chen et al., 2015](#reference)), where most analogy questions are directly translated from English benchmark. Although CA-translated has been widely used in many Chinese word embedding papers, it only contains questions of three semantic questions and covers 134 Chinese words. In contrast, CA8 ([Li et al., 2018](#reference)) is specifically designed for Chinese language. It contains 17813 analogy questions and covers comprehensive morphological and semantic relations.
  3 | 
  4 | ## CA8
  5 | 
  6 | CA8 incorporates comprehensive morphological and semantic relations in Chinese. Specifically, CA8-morphological (CA8-Mor) contains 10177 morphological questions, which are constructed based on two types of relations: reduplication and semi-affixation. CA8-semantic (CA8-Sem) contains 7636 semantic questions, which can be divided into 4 categories and 28 sub-categories. Detailed description is as follows:
  7 | 
  8 | <table>
  9 |   <tr align="center">
 10 |     <td colspan="5"><b>Morphological Questions: Reduplication</b></td>
 11 |   </tr>
 12 |   <tr align="center">
 13 |     <td>Category</td>
 14 |     <td>Sub-category</td>
 15 |     <td>POS</td>
 16 |     <td>Morphological Function</td>
 17 |     <td>Example</td>
 18 |   </tr>
 19 |   <tr align="center">
 20 |     <td rowspan="9">A</td>
 21 |     <td rowspan="7">AA</td>
 22 |     <td rowspan="2">Noun</td>
 23 |     <td>Form kinship terms</td>
 24 |     <td>爸 (dad) → 爸爸 (dad)</td>
 25 |   </tr>
 26 |   <tr align="center">
 27 |     <td>Yield every / each meaning</td>
 28 |     <td>天 (day) → 天天 (everyday)</td>
 29 |   </tr>
 30 |   <tr align="center">
 31 |     <td rowspan="1">Measure</td>
 32 |     <td>Yield every / each meaning</td>
 33 |     <td>个 (-) → 个个 (every/each)</td>
 34 |   </tr>
 35 |   <tr align="center">
 36 |     <td rowspan="2">Verb</td>
 37 |     <td>Signal doing something a little bit</td>
 38 |     <td>说 (say) → 说说 (say a little)</td>
 39 |   </tr>
 40 |   <tr align="center">
 41 |     <td>Signal things happen briefly</td>
 42 |     <td>看 (look) → 看看 (have a brief look)</td>
 43 |   </tr>
 44 |   <tr align="center">
 45 |     <td rowspan="2">Adjective</td>
 46 |     <td>Intensify the adjective</td>
 47 |     <td>大 (big) → 大大 (very big)</td>
 48 |   </tr>
 49 |   <tr align="center">
 50 |     <td>Transform it to adverbs</td>
 51 |     <td>慢 (slow) → 慢慢 (slowly)</td>
 52 |   </tr>
 53 |   <tr align="center">
 54 |     <td rowspan="1">A yi A</td>
 55 |     <td rowspan="1">Verb</td>
 56 |     <td>Signal trying to do something</td>
 57 |     <td>吃 (eat) → 吃一吃 (try to eat)</td>
 58 |   </tr>
 59 |   <tr align="center">
 60 |     <td rowspan="1">A lai A qu</td>
 61 |     <td rowspan="1">Verb</td>
 62 |     <td>Signal doing something repeatedly</td>
 63 |     <td>飞 (fly) → 飞来飞去 (fly around)</td>
 64 |   </tr>
 65 |   <tr align="center">
 66 |     <td rowspan="9">AB</td>
 67 |     <td rowspan="5">AABB</td>
 68 |     <td rowspan="1">Noun</td>
 69 |     <td>Yield many / much meaning</td>
 70 |     <td>山水 (mountain and river) → 山山水水 (many mountains and rivers)</td>
 71 |   </tr>
 72 |   <tr align="center">
 73 |     <td rowspan="1">Verb</td>
 74 |     <td>Indicate a continuous action</td>
 75 |     <td>说笑 (laugh and chat) → 说说笑笑 (laugh and chat for a while)</td>
 76 |   </tr>
 77 |   <tr align="center">
 78 |     <td rowspan="2">Adjective</td>
 79 |     <td>Intensify the adjective</td>
 80 |     <td>清楚 (clear) → 清清楚楚 (very clear)</td>
 81 |   </tr>
 82 |   <tr align="center">
 83 |     <td>Yield the meaning of not uniform</td>
 84 |     <td>大小 (size) → 大大小小 (all sizes)</td>
 85 |   </tr>
 86 |   <tr align="center">
 87 |     <td rowspan="1">Adverb</td>
 88 |     <td>Intensify the adverb</td>
 89 |     <td>彻底 (completely) → 彻彻底底 (totally and completely)</td>
 90 |   </tr>
 91 |   <tr align="center">
 92 |     <td rowspan="1">A li A B</td>
 93 |     <td rowspan="1">Adjective</td>
 94 |     <td>Oralize the adjective and yield derogatory meaning</td>
 95 |     <td>慌张 (flurried) → 慌里慌张 (anxious)</td>
 96 |   </tr>
 97 |   <tr align="center">
 98 |     <td rowspan="3">ABAB</td>
 99 |     <td rowspan="1">Verb</td>
100 |     <td>Signal doing something a little bit</td>
101 |     <td>注意 (pay attention) → 注意注意 (pay a little attention)</td>
102 |   </tr>
103 |   <tr align="center">
104 |     <td rowspan="2">Adjective</td>
105 |     <td>Intensify the adjective</td>
106 |     <td>雪白 (white) → 雪白雪白 (very white)</td>
107 |   </tr>
108 |   <tr align="center">
109 |     <td>Transform it to a verb</td>
110 |     <td>高兴 (happy) → 高兴高兴 (make someone happy)</td>
111 |   </tr>
112 | </table>
113 | 
114 | Affixation is a morphological process whereby a bound morpheme (an affix) is attached to roots or stems to form new language units. Chinese is a typical isolating language that has few affixes. [Liu et al. (2001)](#reference) points out that although affixes are rare in Chinese, there are some components behaving like affixes and can also be used as independent lexemes. They are called semi-affixes. We follow their work and adopt this concept.
115 | 
116 | <table>
117 |   <tr align="center">
118 |     <td colspan="3"><b>Morphological Questions: Semi-affixation</b></td>
119 |   </tr>
120 |   <tr align="center">
121 |     <td>Category</td>
122 |     <td>Semi-affix</td>
123 |     <td>Example</td>
124 |   </tr>
125 |   <tr align="center">
126 |     <td rowspan="21">Semi-prefix</td>
127 |     <td>第</td>
128 |     <td>一 (one) → 第一 (first)</td>
129 |   </tr>
130 |   <tr align="center">
131 |     <td>初</td>
132 |     <td>一 (one) → 初一 (the first day of a lunar month)</td>
133 |   </tr>
134 |   <tr align="center">
135 |     <td>十</td>
136 |     <td>一 (one) → 十一 (eleven)</td>
137 |   </tr>
138 |   <tr align="center">
139 |     <td>周</td>
140 |     <td>一 (one) → 周一 (Monday)</td>
141 |   </tr>
142 |   <tr align="center">
143 |     <td>星期</td>
144 |     <td>一 (one) → 星期一 (Monday)</td>
145 |   </tr>
146 |   <tr align="center">
147 |     <td>老</td>
148 |     <td>虎 (tiger) → 老虎 (tiger)</td>
149 |   </tr>
150 |   <tr align="center">
151 |     <td>小</td>
152 |     <td>草 (grass) → 小草 (grass)</td>
153 |   </tr>
154 |   <tr align="center">
155 |     <td>大</td>
156 |     <td>海 (sea) → 大海 (large sea)</td>
157 |   </tr>
158 |   <tr align="center">
159 |     <td>半</td>
160 |     <td>导体 (conductor) → 半导体 (semiconductor)</td>
161 |   </tr>
162 |   <tr align="center">
163 |     <td>单</td>
164 |     <td>细胞 (cell) → 单细胞 (unicell)</td>
165 |   </tr>
166 |   <tr align="center">
167 |     <td>超</td>
168 |     <td>链接 (link) → 超链接 (hyperlink)</td>
169 |   </tr>
170 |   <tr align="center">
171 |     <td>次</td>
172 |     <td>大陆 (continent) → 次大陆 (subcontinent)</td>
173 |   </tr>
174 |   <tr align="center">
175 |     <td>非</td>
176 |     <td>常规 (conventional) → 非常规 (unconventional)</td>
177 |   </tr>
178 |   <tr align="center">
179 |     <td>每</td>
180 |     <td>次 (time) → 每次 (every time)</td>
181 |   </tr>
182 |   <tr align="center">
183 |     <td>全</td>
184 |     <td>明星 (star) → 全明星 (all star)</td>
185 |   </tr>
186 |   <tr align="center">
187 |     <td>伪</td>
188 |     <td>君子 (gentlemen) → 伪君子 (hypocrites)</td>
189 |   </tr>
190 |   <tr align="center">
191 |     <td>亚</td>
192 |     <td>热带 (tropical zone) → 亚热带 (sub-tropical zone)</td>
193 |   </tr>
194 |   <tr align="center">
195 |     <td>洋</td>
196 |     <td>酒 (wine) → 洋酒 (foreign wine)</td>
197 |   </tr>
198 |   <tr align="center">
199 |     <td>总</td>
200 |     <td>比分 (score) → 总比分 (total score)</td>
201 |   </tr>
202 |   <tr align="center">
203 |     <td>反</td>
204 |     <td>物质 (matter) → 反常规 (antimatter)</td>
205 |   </tr>
206 |   <tr align="center">
207 |     <td>副</td>
208 |     <td>总统 (president) → 副总统 (vice president)</td>
209 |   </tr>
210 | 
211 |   <tr align="center">
212 |     <td rowspan="41">Semi-suffix</td>
213 |     <td>们</td>
214 |     <td>我 (I) → 我们 (we)</td>
215 |   </tr>
216 |   <tr align="center">
217 |     <td>里</td>
218 |     <td>这 (here) → 这里 (here)</td>
219 |   </tr>
220 |   <tr align="center">
221 |     <td>些</td>
222 |     <td>这 (this) → 这些 (these)</td>
223 |   </tr>
224 |   <tr align="center">
225 |     <td>样</td>
226 |     <td>这 (this) → 这样 (such)</td>
227 |   </tr>
228 |   <tr align="center">
229 |     <td>个</td>
230 |     <td>这 (this) → 这个 (this one)</td>
231 |   </tr>
232 |   <tr align="center">
233 |     <td>边</td>
234 |     <td>这 (this) → 这边 (here)</td>
235 |   </tr>
236 |   <tr align="center">
237 |     <td>种</td>
238 |     <td>这 (this) → 这种 (this kind)</td>
239 |   </tr>
240 |   <tr align="center">
241 |     <td>次</td>
242 |     <td>这 (this) → 这次 (this time)</td>
243 |   </tr>
244 |   <tr align="center">
245 |     <td>儿</td>
246 |     <td>这 (this) → 这儿 (here)</td>
247 |   </tr>
248 |   <tr align="center">
249 |     <td>部</td>
250 |     <td>东 (east) → 东部 (east)</td>
251 |   </tr>
252 |   <tr align="center">
253 |     <td>中</td>
254 |     <td>心 (heart) → 心中 (in the heart)</td>
255 |   </tr>
256 |   <tr align="center">
257 |     <td>上</td>
258 |     <td>山 (mountain) → 山上 (on the mountain)</td>
259 |   </tr>
260 |   <tr align="center">
261 |     <td>面</td>
262 |     <td>前 (front) → 前面 (in the front)</td>
263 |   </tr>
264 |   <tr align="center">
265 |     <td>者</td>
266 |     <td>强 (strong) → 强者 (the strong one)</td>
267 |   </tr>
268 |   <tr align="center">
269 |     <td>家</td>
270 |     <td>科学 (science) → 科学家 (scientist)</td>
271 |   </tr>
272 |   <tr align="center">
273 |     <td>子</td>
274 |     <td>胖 (fat) → 胖子 (a fat man)</td>
275 |   </tr>
276 |   <tr align="center">
277 |     <td>头</td>
278 |     <td>木 (wood) → 木头 (wood)</td>
279 |   </tr>
280 |   <tr align="center">
281 |     <td>工</td>
282 |     <td>木 (wood) → 木工 (carpentry)</td>
283 |   </tr>
284 |   <tr align="center">
285 |     <td>匠</td>
286 |     <td>木 (wood) → 木匠 (carpenter)</td>
287 |   </tr>
288 |   <tr align="center">
289 |     <td>星</td>
290 |     <td>笑 (laugh) → 笑星 (comedian)</td>
291 |   </tr>
292 |   <tr align="center">
293 |     <td>手</td>
294 |     <td>老 (old) → 老手 (old hand)</td>
295 |   </tr>
296 |   <tr align="center">
297 |     <td>主义</td>
298 |     <td>乐观 (optimistic) → 乐观主义 (optimism)</td>
299 |   </tr>
300 |   <tr align="center">
301 |     <td>鬼</td>
302 |     <td>吝啬 (stingy) → 吝啬鬼 (miser)</td>
303 |   </tr>
304 |   <tr align="center">
305 |     <td>式</td>
306 |     <td>中 (Chinese) → 中式 (Chinese style)</td>
307 |   </tr>
308 |   <tr align="center">
309 |     <td>队</td>
310 |     <td>考古 (archaeology) → 考古队 (archaeological team)</td>
311 |   </tr>
312 |   <tr align="center">
313 |     <td>色</td>
314 |     <td>黄 (yellow) → 黄色 (the yellow color)</td>
315 |   </tr>
316 |   <tr align="center">
317 |     <td>学</td>
318 |     <td>地质 (geology) → 地质学 (discipline of geology)</td>
319 |   </tr>
320 |   <tr align="center">
321 |     <td>论</td>
322 |     <td>宿命 (fate) → 宿命论 (fatalism)</td>
323 |   </tr>
324 |   <tr align="center">
325 |     <td>站</td>
326 |     <td>汽车 (bus) → 汽车站 (bus station)</td>
327 |   </tr>
328 |   <tr align="center">
329 |     <td>仪</td>
330 |     <td>光谱 (spectrum) → 光谱仪 (spectrograph)</td>
331 |   </tr>
332 |   <tr align="center">
333 |     <td>界</td>
334 |     <td>学术 (academic) → 学术界 (academia)</td>
335 |   </tr>
336 |   <tr align="center">
337 |     <td>族</td>
338 |     <td>追星 (chasing a star) → 追星族 (fans)</td>
339 |   </tr>
340 |   <tr align="center">
341 |     <td>棍</td>
342 |     <td>赌 (gamble) → 赌棍 (gambler)</td>
343 |   </tr>
344 |   <tr align="center">
345 |     <td>灾</td>
346 |     <td>雨 (rain) → 雨灾 (rain disaster)</td>
347 |   </tr>
348 |   <tr align="center">
349 |     <td>气</td>
350 |     <td>冷 (cold) → 冷气 (cold air)</td>
351 |   </tr>
352 |   <tr align="center">
353 |     <td>性</td>
354 |     <td>酸 (acid) → 酸性 (acidic)</td>
355 |   </tr>
356 |   <tr align="center">
357 |     <td>厅</td>
358 |     <td>歌 (song) → 歌厅 (KTV)</td>
359 |   </tr>
360 |   <tr align="center">
361 |     <td>机</td>
362 |     <td>复印 (copy) → 复印机 (copier)</td>
363 |   </tr>
364 |   <tr align="center">
365 |     <td>法</td>
366 |     <td>说 (say) → 说法 (saying)</td>
367 |   </tr>
368 |   <tr align="center">
369 |     <td>剧</td>
370 |     <td>粤 (Yue) → 粤剧 (Cantonese Opera)</td>
371 |   </tr>
372 |   <tr align="center">
373 |     <td>长</td>
374 |     <td>船 (ship) → 船长 (captain of a ship)</td>
375 |   </tr>
376 | </table>
377 | 
378 | <table>
379 |   <tr align="center">
380 |     <td colspan="3"><b>Semantic Questions</b></td>
381 |   </tr>
382 |   <tr align="center">
383 |     <td>Category</td>
384 |     <td>Sub-category</td>
385 |     <td>Example</td>
386 |   </tr>
387 |   <tr align="center">
388 |     <td rowspan="9">Geography</td>
389 |     <td>country - capital</td>
390 |     <td>中国 (China) - 北京 (Beijing)</td>
391 |   </tr>
392 |   <tr align="center">
393 |     <td>country - currency</td>
394 |     <td>中国 (China) - 人民币 (Chinese yuan)</td>
395 |   </tr>
396 |   <tr align="center">
397 |     <td>province - abbreviation</td>
398 |     <td>广东 (Guangdong) - 粤 (Yue)</td>
399 |   </tr>
400 |   <tr align="center">
401 |     <td>province - capital</td>
402 |     <td>广东 (Guangdong) - 广州 (Guangzhou)</td>
403 |   </tr>
404 |   <tr align="center">
405 |     <td>province - drama</td>
406 |     <td>广东 (Guangdong) - 粤剧 (Cantonese Opera)</td>
407 |   </tr>
408 |   <tr align="center">
409 |     <td>province - channel</td>
410 |     <td>广东 (Guangdong) - 广东卫视 (Guangdong Satellite TV)</td>
411 |   </tr>
412 |   <tr align="center">
413 |     <td>province - university</td>
414 |     <td>浙江 (Zhejiang) - 浙江大学 (Zhejiang University)</td>
415 |   </tr>
416 |   <tr align="center">
417 |     <td>city - university</td>
418 |     <td>南京 (Nanjing) - 南京大学 (Nanjing University)</td>
419 |   </tr>
420 |   <tr align="center">
421 |     <td>university - abbreviation</td>
422 |     <td>师范大学 (Normal University) - 师大 (Normal University)</td>
423 |   </tr>
424 | 
425 |   <tr align="center">
426 |     <td rowspan="4">History</td>
427 |     <td>dynasty - emperor</td>
428 |     <td>汉 (Han) - 刘邦 (Liu Bang)</td>
429 |   </tr>
430 |   <tr align="center">
431 |     <td>dynasty - capital</td>
432 |     <td>秦 (Qin) - 咸阳 (Xian Yang)</td>
433 |   </tr>
434 |   <tr align="center">
435 |     <td>title - emperor</td>
436 |     <td>汉高祖 (Emperor Gaozu of Han) - 刘邦 (Liu Bang)</td>
437 |   </tr>
438 |   <tr align="center">
439 |     <td>celebrity - country</td>
440 |     <td>屈原 (Qu Yuan) - 楚国 (Country Chu)</td>
441 |   </tr>
442 | 
443 |   <tr align="center">
444 |     <td rowspan="10">Nature</td>
445 |     <td>number</td>
446 |     <td>第一 (first) - 状元 (the first in an imperial examination)</td>
447 |   </tr>
448 |   <tr align="center">
449 |     <td>time</td>
450 |     <td>春节 (Spring Festival) - 正月 (the first month in a lunar year)</td>
451 |   </tr>
452 |   <tr align="center">
453 |     <td>animal</td>
454 |     <td>公鸡 (cock) - 母鸡 (hen)</td>
455 |   </tr>
456 |   <tr align="center">
457 |     <td>plant</td>
458 |     <td>杏树 (apricot tree) - 杏 (apricot)</td>
459 |   </tr>
460 |   <tr align="center">
461 |     <td>ornament</td>
462 |     <td>手指 (finger) - 戒指 (ring)</td>
463 |   </tr>
464 |   <tr align="center">
465 |     <td>chemistry</td>
466 |     <td>盐 (salt) - 氯化钠 (sodium chloride)</td>
467 |   </tr>
468 |   <tr align="center">
469 |     <td>physics</td>
470 |     <td>冰 (ice) - 水蒸气 (steam)</td>
471 |   </tr>
472 |   <tr align="center">
473 |     <td>weather</td>
474 |     <td>小满 (Grain Full) - 夏天 (summer)</td>
475 |   </tr>
476 |   <tr align="center">
477 |     <td>reverse</td>
478 |     <td>松 (loose) - 紧 (tight)</td>
479 |   </tr>
480 |   <tr align="center">
481 |     <td>color</td>
482 |     <td>海 (sea) - 蓝色 (blue)</td>
483 |   </tr>
484 | 
485 |   <tr align="center">
486 |     <td rowspan="5">People</td>
487 |     <td>company - founder</td>
488 |     <td>阿里巴巴 (Alibaba) - 马云 (Ma Yun)</td>
489 |   </tr>
490 |   <tr align="center">
491 |     <td>work - scientist</td>
492 |     <td>地动仪 (seismograph) - 张衡 (Zhang Heng)</td>
493 |   </tr>
494 |   <tr align="center">
495 |     <td>work - writer</td>
496 |     <td>朝花夕拾 (Dawn Blossoms Plucked at Dusk) - 鲁迅 (Lu Xun)</td>
497 |   </tr>
498 |   <tr align="center">
499 |     <td>family - member</td>
500 |     <td>爷爷 (grandfather) - 孙子 (grandson)</td>
501 |   </tr>
502 |   <tr align="center">
503 |     <td>student - degree</td>
504 |     <td>小学 (elementary school) - 小学生 (schoolchild)</td>
505 |   </tr>
506 | </table>
507 | 
508 | ##  <a name="reference"></a>Reference
509 | Shen Li, Zhe Zhao, Renfen Hu, Wensi Li, Tao Liu, Xiaoyong Du, <em>Analogical Reasoning on Chinese Morphological and Semantic Relations</em>, ACL 2018.
510 | 
511 | Xinxiong Chen, Lei Xu, Zhiyuan Liu, Maosong Sun, and Huanbo Luan. 2015. Joint learning of character and word embeddings. In IJCAI. pages 1236–1242.
512 | 
513 | Yuehua Liu, Wenyu Pan, and Wei Gu. 2001. Practical grammar of modern Chinese. The Commercial Press.
514 | 


--------------------------------------------------------------------------------
/testsets/CA_translated/ca_translated.txt:
--------------------------------------------------------------------------------
  1 | : capital-common-countries
  2 | 雅典 希腊 巴格达 伊拉克
  3 | 雅典 希腊 曼谷 泰国
  4 | 雅典 希腊 北京 中国
  5 | 雅典 希腊 柏林 德国
  6 | 雅典 希腊 伯尔尼 瑞士
  7 | 雅典 希腊 开罗 埃及
  8 | 雅典 希腊 堪培拉 澳大利亚
  9 | 雅典 希腊 河内 越南
 10 | 雅典 希腊 哈瓦那 古巴
 11 | 雅典 希腊 赫尔辛基 芬兰
 12 | 雅典 希腊 伊斯兰堡 巴基斯坦
 13 | 雅典 希腊 喀布尔 阿富汗
 14 | 雅典 希腊 伦敦 英国
 15 | 雅典 希腊 马德里 西班牙
 16 | 雅典 希腊 莫斯科 俄罗斯
 17 | 雅典 希腊 奥斯陆 挪威
 18 | 雅典 希腊 渥太华 加拿大
 19 | 雅典 希腊 巴黎 法国
 20 | 雅典 希腊 罗马 意大利
 21 | 雅典 希腊 斯德哥尔摩 瑞典
 22 | 雅典 希腊 德黑兰 伊朗
 23 | 雅典 希腊 东京 日本
 24 | 巴格达 伊拉克 曼谷 泰国
 25 | 巴格达 伊拉克 北京 中国
 26 | 巴格达 伊拉克 柏林 德国
 27 | 巴格达 伊拉克 伯尔尼 瑞士
 28 | 巴格达 伊拉克 开罗 埃及
 29 | 巴格达 伊拉克 堪培拉 澳大利亚
 30 | 巴格达 伊拉克 河内 越南
 31 | 巴格达 伊拉克 哈瓦那 古巴
 32 | 巴格达 伊拉克 赫尔辛基 芬兰
 33 | 巴格达 伊拉克 伊斯兰堡 巴基斯坦
 34 | 巴格达 伊拉克 喀布尔 阿富汗
 35 | 巴格达 伊拉克 伦敦 英国
 36 | 巴格达 伊拉克 马德里 西班牙
 37 | 巴格达 伊拉克 莫斯科 俄罗斯
 38 | 巴格达 伊拉克 奥斯陆 挪威
 39 | 巴格达 伊拉克 渥太华 加拿大
 40 | 巴格达 伊拉克 巴黎 法国
 41 | 巴格达 伊拉克 罗马 意大利
 42 | 巴格达 伊拉克 斯德哥尔摩 瑞典
 43 | 巴格达 伊拉克 德黑兰 伊朗
 44 | 巴格达 伊拉克 东京 日本
 45 | 巴格达 伊拉克 雅典 希腊
 46 | 曼谷 泰国 北京 中国
 47 | 曼谷 泰国 柏林 德国
 48 | 曼谷 泰国 伯尔尼 瑞士
 49 | 曼谷 泰国 开罗 埃及
 50 | 曼谷 泰国 堪培拉 澳大利亚
 51 | 曼谷 泰国 河内 越南
 52 | 曼谷 泰国 哈瓦那 古巴
 53 | 曼谷 泰国 赫尔辛基 芬兰
 54 | 曼谷 泰国 伊斯兰堡 巴基斯坦
 55 | 曼谷 泰国 喀布尔 阿富汗
 56 | 曼谷 泰国 伦敦 英国
 57 | 曼谷 泰国 马德里 西班牙
 58 | 曼谷 泰国 莫斯科 俄罗斯
 59 | 曼谷 泰国 奥斯陆 挪威
 60 | 曼谷 泰国 渥太华 加拿大
 61 | 曼谷 泰国 巴黎 法国
 62 | 曼谷 泰国 罗马 意大利
 63 | 曼谷 泰国 斯德哥尔摩 瑞典
 64 | 曼谷 泰国 德黑兰 伊朗
 65 | 曼谷 泰国 东京 日本
 66 | 曼谷 泰国 雅典 希腊
 67 | 曼谷 泰国 巴格达 伊拉克
 68 | 北京 中国 柏林 德国
 69 | 北京 中国 伯尔尼 瑞士
 70 | 北京 中国 开罗 埃及
 71 | 北京 中国 堪培拉 澳大利亚
 72 | 北京 中国 河内 越南
 73 | 北京 中国 哈瓦那 古巴
 74 | 北京 中国 赫尔辛基 芬兰
 75 | 北京 中国 伊斯兰堡 巴基斯坦
 76 | 北京 中国 喀布尔 阿富汗
 77 | 北京 中国 伦敦 英国
 78 | 北京 中国 马德里 西班牙
 79 | 北京 中国 莫斯科 俄罗斯
 80 | 北京 中国 奥斯陆 挪威
 81 | 北京 中国 渥太华 加拿大
 82 | 北京 中国 巴黎 法国
 83 | 北京 中国 罗马 意大利
 84 | 北京 中国 斯德哥尔摩 瑞典
 85 | 北京 中国 德黑兰 伊朗
 86 | 北京 中国 东京 日本
 87 | 北京 中国 雅典 希腊
 88 | 北京 中国 巴格达 伊拉克
 89 | 北京 中国 曼谷 泰国
 90 | 柏林 德国 伯尔尼 瑞士
 91 | 柏林 德国 开罗 埃及
 92 | 柏林 德国 堪培拉 澳大利亚
 93 | 柏林 德国 河内 越南
 94 | 柏林 德国 哈瓦那 古巴
 95 | 柏林 德国 赫尔辛基 芬兰
 96 | 柏林 德国 伊斯兰堡 巴基斯坦
 97 | 柏林 德国 喀布尔 阿富汗
 98 | 柏林 德国 伦敦 英国
 99 | 柏林 德国 马德里 西班牙
100 | 柏林 德国 莫斯科 俄罗斯
101 | 柏林 德国 奥斯陆 挪威
102 | 柏林 德国 渥太华 加拿大
103 | 柏林 德国 巴黎 法国
104 | 柏林 德国 罗马 意大利
105 | 柏林 德国 斯德哥尔摩 瑞典
106 | 柏林 德国 德黑兰 伊朗
107 | 柏林 德国 东京 日本
108 | 柏林 德国 雅典 希腊
109 | 柏林 德国 巴格达 伊拉克
110 | 柏林 德国 曼谷 泰国
111 | 柏林 德国 北京 中国
112 | 伯尔尼 瑞士 开罗 埃及
113 | 伯尔尼 瑞士 堪培拉 澳大利亚
114 | 伯尔尼 瑞士 河内 越南
115 | 伯尔尼 瑞士 哈瓦那 古巴
116 | 伯尔尼 瑞士 赫尔辛基 芬兰
117 | 伯尔尼 瑞士 伊斯兰堡 巴基斯坦
118 | 伯尔尼 瑞士 喀布尔 阿富汗
119 | 伯尔尼 瑞士 伦敦 英国
120 | 伯尔尼 瑞士 马德里 西班牙
121 | 伯尔尼 瑞士 莫斯科 俄罗斯
122 | 伯尔尼 瑞士 奥斯陆 挪威
123 | 伯尔尼 瑞士 渥太华 加拿大
124 | 伯尔尼 瑞士 巴黎 法国
125 | 伯尔尼 瑞士 罗马 意大利
126 | 伯尔尼 瑞士 斯德哥尔摩 瑞典
127 | 伯尔尼 瑞士 德黑兰 伊朗
128 | 伯尔尼 瑞士 东京 日本
129 | 伯尔尼 瑞士 雅典 希腊
130 | 伯尔尼 瑞士 巴格达 伊拉克
131 | 伯尔尼 瑞士 曼谷 泰国
132 | 伯尔尼 瑞士 北京 中国
133 | 伯尔尼 瑞士 柏林 德国
134 | 开罗 埃及 堪培拉 澳大利亚
135 | 开罗 埃及 河内 越南
136 | 开罗 埃及 哈瓦那 古巴
137 | 开罗 埃及 赫尔辛基 芬兰
138 | 开罗 埃及 伊斯兰堡 巴基斯坦
139 | 开罗 埃及 喀布尔 阿富汗
140 | 开罗 埃及 伦敦 英国
141 | 开罗 埃及 马德里 西班牙
142 | 开罗 埃及 莫斯科 俄罗斯
143 | 开罗 埃及 奥斯陆 挪威
144 | 开罗 埃及 渥太华 加拿大
145 | 开罗 埃及 巴黎 法国
146 | 开罗 埃及 罗马 意大利
147 | 开罗 埃及 斯德哥尔摩 瑞典
148 | 开罗 埃及 德黑兰 伊朗
149 | 开罗 埃及 东京 日本
150 | 开罗 埃及 雅典 希腊
151 | 开罗 埃及 巴格达 伊拉克
152 | 开罗 埃及 曼谷 泰国
153 | 开罗 埃及 北京 中国
154 | 开罗 埃及 柏林 德国
155 | 开罗 埃及 伯尔尼 瑞士
156 | 堪培拉 澳大利亚 河内 越南
157 | 堪培拉 澳大利亚 哈瓦那 古巴
158 | 堪培拉 澳大利亚 赫尔辛基 芬兰
159 | 堪培拉 澳大利亚 伊斯兰堡 巴基斯坦
160 | 堪培拉 澳大利亚 喀布尔 阿富汗
161 | 堪培拉 澳大利亚 伦敦 英国
162 | 堪培拉 澳大利亚 马德里 西班牙
163 | 堪培拉 澳大利亚 莫斯科 俄罗斯
164 | 堪培拉 澳大利亚 奥斯陆 挪威
165 | 堪培拉 澳大利亚 渥太华 加拿大
166 | 堪培拉 澳大利亚 巴黎 法国
167 | 堪培拉 澳大利亚 罗马 意大利
168 | 堪培拉 澳大利亚 斯德哥尔摩 瑞典
169 | 堪培拉 澳大利亚 德黑兰 伊朗
170 | 堪培拉 澳大利亚 东京 日本
171 | 堪培拉 澳大利亚 雅典 希腊
172 | 堪培拉 澳大利亚 巴格达 伊拉克
173 | 堪培拉 澳大利亚 曼谷 泰国
174 | 堪培拉 澳大利亚 北京 中国
175 | 堪培拉 澳大利亚 柏林 德国
176 | 堪培拉 澳大利亚 伯尔尼 瑞士
177 | 堪培拉 澳大利亚 开罗 埃及
178 | 河内 越南 哈瓦那 古巴
179 | 河内 越南 赫尔辛基 芬兰
180 | 河内 越南 伊斯兰堡 巴基斯坦
181 | 河内 越南 喀布尔 阿富汗
182 | 河内 越南 伦敦 英国
183 | 河内 越南 马德里 西班牙
184 | 河内 越南 莫斯科 俄罗斯
185 | 河内 越南 奥斯陆 挪威
186 | 河内 越南 渥太华 加拿大
187 | 河内 越南 巴黎 法国
188 | 河内 越南 罗马 意大利
189 | 河内 越南 斯德哥尔摩 瑞典
190 | 河内 越南 德黑兰 伊朗
191 | 河内 越南 东京 日本
192 | 河内 越南 雅典 希腊
193 | 河内 越南 巴格达 伊拉克
194 | 河内 越南 曼谷 泰国
195 | 河内 越南 北京 中国
196 | 河内 越南 柏林 德国
197 | 河内 越南 伯尔尼 瑞士
198 | 河内 越南 开罗 埃及
199 | 河内 越南 堪培拉 澳大利亚
200 | 哈瓦那 古巴 赫尔辛基 芬兰
201 | 哈瓦那 古巴 伊斯兰堡 巴基斯坦
202 | 哈瓦那 古巴 喀布尔 阿富汗
203 | 哈瓦那 古巴 伦敦 英国
204 | 哈瓦那 古巴 马德里 西班牙
205 | 哈瓦那 古巴 莫斯科 俄罗斯
206 | 哈瓦那 古巴 奥斯陆 挪威
207 | 哈瓦那 古巴 渥太华 加拿大
208 | 哈瓦那 古巴 巴黎 法国
209 | 哈瓦那 古巴 罗马 意大利
210 | 哈瓦那 古巴 斯德哥尔摩 瑞典
211 | 哈瓦那 古巴 德黑兰 伊朗
212 | 哈瓦那 古巴 东京 日本
213 | 哈瓦那 古巴 雅典 希腊
214 | 哈瓦那 古巴 巴格达 伊拉克
215 | 哈瓦那 古巴 曼谷 泰国
216 | 哈瓦那 古巴 北京 中国
217 | 哈瓦那 古巴 柏林 德国
218 | 哈瓦那 古巴 伯尔尼 瑞士
219 | 哈瓦那 古巴 开罗 埃及
220 | 哈瓦那 古巴 堪培拉 澳大利亚
221 | 哈瓦那 古巴 河内 越南
222 | 赫尔辛基 芬兰 伊斯兰堡 巴基斯坦
223 | 赫尔辛基 芬兰 喀布尔 阿富汗
224 | 赫尔辛基 芬兰 伦敦 英国
225 | 赫尔辛基 芬兰 马德里 西班牙
226 | 赫尔辛基 芬兰 莫斯科 俄罗斯
227 | 赫尔辛基 芬兰 奥斯陆 挪威
228 | 赫尔辛基 芬兰 渥太华 加拿大
229 | 赫尔辛基 芬兰 巴黎 法国
230 | 赫尔辛基 芬兰 罗马 意大利
231 | 赫尔辛基 芬兰 斯德哥尔摩 瑞典
232 | 赫尔辛基 芬兰 德黑兰 伊朗
233 | 赫尔辛基 芬兰 东京 日本
234 | 赫尔辛基 芬兰 雅典 希腊
235 | 赫尔辛基 芬兰 巴格达 伊拉克
236 | 赫尔辛基 芬兰 曼谷 泰国
237 | 赫尔辛基 芬兰 北京 中国
238 | 赫尔辛基 芬兰 柏林 德国
239 | 赫尔辛基 芬兰 伯尔尼 瑞士
240 | 赫尔辛基 芬兰 开罗 埃及
241 | 赫尔辛基 芬兰 堪培拉 澳大利亚
242 | 赫尔辛基 芬兰 河内 越南
243 | 赫尔辛基 芬兰 哈瓦那 古巴
244 | 伊斯兰堡 巴基斯坦 喀布尔 阿富汗
245 | 伊斯兰堡 巴基斯坦 伦敦 英国
246 | 伊斯兰堡 巴基斯坦 马德里 西班牙
247 | 伊斯兰堡 巴基斯坦 莫斯科 俄罗斯
248 | 伊斯兰堡 巴基斯坦 奥斯陆 挪威
249 | 伊斯兰堡 巴基斯坦 渥太华 加拿大
250 | 伊斯兰堡 巴基斯坦 巴黎 法国
251 | 伊斯兰堡 巴基斯坦 罗马 意大利
252 | 伊斯兰堡 巴基斯坦 斯德哥尔摩 瑞典
253 | 伊斯兰堡 巴基斯坦 德黑兰 伊朗
254 | 伊斯兰堡 巴基斯坦 东京 日本
255 | 伊斯兰堡 巴基斯坦 雅典 希腊
256 | 伊斯兰堡 巴基斯坦 巴格达 伊拉克
257 | 伊斯兰堡 巴基斯坦 曼谷 泰国
258 | 伊斯兰堡 巴基斯坦 北京 中国
259 | 伊斯兰堡 巴基斯坦 柏林 德国
260 | 伊斯兰堡 巴基斯坦 伯尔尼 瑞士
261 | 伊斯兰堡 巴基斯坦 开罗 埃及
262 | 伊斯兰堡 巴基斯坦 堪培拉 澳大利亚
263 | 伊斯兰堡 巴基斯坦 河内 越南
264 | 伊斯兰堡 巴基斯坦 哈瓦那 古巴
265 | 伊斯兰堡 巴基斯坦 赫尔辛基 芬兰
266 | 喀布尔 阿富汗 伦敦 英国
267 | 喀布尔 阿富汗 马德里 西班牙
268 | 喀布尔 阿富汗 莫斯科 俄罗斯
269 | 喀布尔 阿富汗 奥斯陆 挪威
270 | 喀布尔 阿富汗 渥太华 加拿大
271 | 喀布尔 阿富汗 巴黎 法国
272 | 喀布尔 阿富汗 罗马 意大利
273 | 喀布尔 阿富汗 斯德哥尔摩 瑞典
274 | 喀布尔 阿富汗 德黑兰 伊朗
275 | 喀布尔 阿富汗 东京 日本
276 | 喀布尔 阿富汗 雅典 希腊
277 | 喀布尔 阿富汗 巴格达 伊拉克
278 | 喀布尔 阿富汗 曼谷 泰国
279 | 喀布尔 阿富汗 北京 中国
280 | 喀布尔 阿富汗 柏林 德国
281 | 喀布尔 阿富汗 伯尔尼 瑞士
282 | 喀布尔 阿富汗 开罗 埃及
283 | 喀布尔 阿富汗 堪培拉 澳大利亚
284 | 喀布尔 阿富汗 河内 越南
285 | 喀布尔 阿富汗 哈瓦那 古巴
286 | 喀布尔 阿富汗 赫尔辛基 芬兰
287 | 喀布尔 阿富汗 伊斯兰堡 巴基斯坦
288 | 伦敦 英国 马德里 西班牙
289 | 伦敦 英国 莫斯科 俄罗斯
290 | 伦敦 英国 奥斯陆 挪威
291 | 伦敦 英国 渥太华 加拿大
292 | 伦敦 英国 巴黎 法国
293 | 伦敦 英国 罗马 意大利
294 | 伦敦 英国 斯德哥尔摩 瑞典
295 | 伦敦 英国 德黑兰 伊朗
296 | 伦敦 英国 东京 日本
297 | 伦敦 英国 雅典 希腊
298 | 伦敦 英国 巴格达 伊拉克
299 | 伦敦 英国 曼谷 泰国
300 | 伦敦 英国 北京 中国
301 | 伦敦 英国 柏林 德国
302 | 伦敦 英国 伯尔尼 瑞士
303 | 伦敦 英国 开罗 埃及
304 | 伦敦 英国 堪培拉 澳大利亚
305 | 伦敦 英国 河内 越南
306 | 伦敦 英国 哈瓦那 古巴
307 | 伦敦 英国 赫尔辛基 芬兰
308 | 伦敦 英国 伊斯兰堡 巴基斯坦
309 | 伦敦 英国 喀布尔 阿富汗
310 | 马德里 西班牙 莫斯科 俄罗斯
311 | 马德里 西班牙 奥斯陆 挪威
312 | 马德里 西班牙 渥太华 加拿大
313 | 马德里 西班牙 巴黎 法国
314 | 马德里 西班牙 罗马 意大利
315 | 马德里 西班牙 斯德哥尔摩 瑞典
316 | 马德里 西班牙 德黑兰 伊朗
317 | 马德里 西班牙 东京 日本
318 | 马德里 西班牙 雅典 希腊
319 | 马德里 西班牙 巴格达 伊拉克
320 | 马德里 西班牙 曼谷 泰国
321 | 马德里 西班牙 北京 中国
322 | 马德里 西班牙 柏林 德国
323 | 马德里 西班牙 伯尔尼 瑞士
324 | 马德里 西班牙 开罗 埃及
325 | 马德里 西班牙 堪培拉 澳大利亚
326 | 马德里 西班牙 河内 越南
327 | 马德里 西班牙 哈瓦那 古巴
328 | 马德里 西班牙 赫尔辛基 芬兰
329 | 马德里 西班牙 伊斯兰堡 巴基斯坦
330 | 马德里 西班牙 喀布尔 阿富汗
331 | 马德里 西班牙 伦敦 英国
332 | 莫斯科 俄罗斯 奥斯陆 挪威
333 | 莫斯科 俄罗斯 渥太华 加拿大
334 | 莫斯科 俄罗斯 巴黎 法国
335 | 莫斯科 俄罗斯 罗马 意大利
336 | 莫斯科 俄罗斯 斯德哥尔摩 瑞典
337 | 莫斯科 俄罗斯 德黑兰 伊朗
338 | 莫斯科 俄罗斯 东京 日本
339 | 莫斯科 俄罗斯 雅典 希腊
340 | 莫斯科 俄罗斯 巴格达 伊拉克
341 | 莫斯科 俄罗斯 曼谷 泰国
342 | 莫斯科 俄罗斯 北京 中国
343 | 莫斯科 俄罗斯 柏林 德国
344 | 莫斯科 俄罗斯 伯尔尼 瑞士
345 | 莫斯科 俄罗斯 开罗 埃及
346 | 莫斯科 俄罗斯 堪培拉 澳大利亚
347 | 莫斯科 俄罗斯 河内 越南
348 | 莫斯科 俄罗斯 哈瓦那 古巴
349 | 莫斯科 俄罗斯 赫尔辛基 芬兰
350 | 莫斯科 俄罗斯 伊斯兰堡 巴基斯坦
351 | 莫斯科 俄罗斯 喀布尔 阿富汗
352 | 莫斯科 俄罗斯 伦敦 英国
353 | 莫斯科 俄罗斯 马德里 西班牙
354 | 奥斯陆 挪威 渥太华 加拿大
355 | 奥斯陆 挪威 巴黎 法国
356 | 奥斯陆 挪威 罗马 意大利
357 | 奥斯陆 挪威 斯德哥尔摩 瑞典
358 | 奥斯陆 挪威 德黑兰 伊朗
359 | 奥斯陆 挪威 东京 日本
360 | 奥斯陆 挪威 雅典 希腊
361 | 奥斯陆 挪威 巴格达 伊拉克
362 | 奥斯陆 挪威 曼谷 泰国
363 | 奥斯陆 挪威 北京 中国
364 | 奥斯陆 挪威 柏林 德国
365 | 奥斯陆 挪威 伯尔尼 瑞士
366 | 奥斯陆 挪威 开罗 埃及
367 | 奥斯陆 挪威 堪培拉 澳大利亚
368 | 奥斯陆 挪威 河内 越南
369 | 奥斯陆 挪威 哈瓦那 古巴
370 | 奥斯陆 挪威 赫尔辛基 芬兰
371 | 奥斯陆 挪威 伊斯兰堡 巴基斯坦
372 | 奥斯陆 挪威 喀布尔 阿富汗
373 | 奥斯陆 挪威 伦敦 英国
374 | 奥斯陆 挪威 马德里 西班牙
375 | 奥斯陆 挪威 莫斯科 俄罗斯
376 | 渥太华 加拿大 巴黎 法国
377 | 渥太华 加拿大 罗马 意大利
378 | 渥太华 加拿大 斯德哥尔摩 瑞典
379 | 渥太华 加拿大 德黑兰 伊朗
380 | 渥太华 加拿大 东京 日本
381 | 渥太华 加拿大 雅典 希腊
382 | 渥太华 加拿大 巴格达 伊拉克
383 | 渥太华 加拿大 曼谷 泰国
384 | 渥太华 加拿大 北京 中国
385 | 渥太华 加拿大 柏林 德国
386 | 渥太华 加拿大 伯尔尼 瑞士
387 | 渥太华 加拿大 开罗 埃及
388 | 渥太华 加拿大 堪培拉 澳大利亚
389 | 渥太华 加拿大 河内 越南
390 | 渥太华 加拿大 哈瓦那 古巴
391 | 渥太华 加拿大 赫尔辛基 芬兰
392 | 渥太华 加拿大 伊斯兰堡 巴基斯坦
393 | 渥太华 加拿大 喀布尔 阿富汗
394 | 渥太华 加拿大 伦敦 英国
395 | 渥太华 加拿大 马德里 西班牙
396 | 渥太华 加拿大 莫斯科 俄罗斯
397 | 渥太华 加拿大 奥斯陆 挪威
398 | 巴黎 法国 罗马 意大利
399 | 巴黎 法国 斯德哥尔摩 瑞典
400 | 巴黎 法国 德黑兰 伊朗
401 | 巴黎 法国 东京 日本
402 | 巴黎 法国 雅典 希腊
403 | 巴黎 法国 巴格达 伊拉克
404 | 巴黎 法国 曼谷 泰国
405 | 巴黎 法国 北京 中国
406 | 巴黎 法国 柏林 德国
407 | 巴黎 法国 伯尔尼 瑞士
408 | 巴黎 法国 开罗 埃及
409 | 巴黎 法国 堪培拉 澳大利亚
410 | 巴黎 法国 河内 越南
411 | 巴黎 法国 哈瓦那 古巴
412 | 巴黎 法国 赫尔辛基 芬兰
413 | 巴黎 法国 伊斯兰堡 巴基斯坦
414 | 巴黎 法国 喀布尔 阿富汗
415 | 巴黎 法国 伦敦 英国
416 | 巴黎 法国 马德里 西班牙
417 | 巴黎 法国 莫斯科 俄罗斯
418 | 巴黎 法国 奥斯陆 挪威
419 | 巴黎 法国 渥太华 加拿大
420 | 罗马 意大利 斯德哥尔摩 瑞典
421 | 罗马 意大利 德黑兰 伊朗
422 | 罗马 意大利 东京 日本
423 | 罗马 意大利 雅典 希腊
424 | 罗马 意大利 巴格达 伊拉克
425 | 罗马 意大利 曼谷 泰国
426 | 罗马 意大利 北京 中国
427 | 罗马 意大利 柏林 德国
428 | 罗马 意大利 伯尔尼 瑞士
429 | 罗马 意大利 开罗 埃及
430 | 罗马 意大利 堪培拉 澳大利亚
431 | 罗马 意大利 河内 越南
432 | 罗马 意大利 哈瓦那 古巴
433 | 罗马 意大利 赫尔辛基 芬兰
434 | 罗马 意大利 伊斯兰堡 巴基斯坦
435 | 罗马 意大利 喀布尔 阿富汗
436 | 罗马 意大利 伦敦 英国
437 | 罗马 意大利 马德里 西班牙
438 | 罗马 意大利 莫斯科 俄罗斯
439 | 罗马 意大利 奥斯陆 挪威
440 | 罗马 意大利 渥太华 加拿大
441 | 罗马 意大利 巴黎 法国
442 | 斯德哥尔摩 瑞典 德黑兰 伊朗
443 | 斯德哥尔摩 瑞典 东京 日本
444 | 斯德哥尔摩 瑞典 雅典 希腊
445 | 斯德哥尔摩 瑞典 巴格达 伊拉克
446 | 斯德哥尔摩 瑞典 曼谷 泰国
447 | 斯德哥尔摩 瑞典 北京 中国
448 | 斯德哥尔摩 瑞典 柏林 德国
449 | 斯德哥尔摩 瑞典 伯尔尼 瑞士
450 | 斯德哥尔摩 瑞典 开罗 埃及
451 | 斯德哥尔摩 瑞典 堪培拉 澳大利亚
452 | 斯德哥尔摩 瑞典 河内 越南
453 | 斯德哥尔摩 瑞典 哈瓦那 古巴
454 | 斯德哥尔摩 瑞典 赫尔辛基 芬兰
455 | 斯德哥尔摩 瑞典 伊斯兰堡 巴基斯坦
456 | 斯德哥尔摩 瑞典 喀布尔 阿富汗
457 | 斯德哥尔摩 瑞典 伦敦 英国
458 | 斯德哥尔摩 瑞典 马德里 西班牙
459 | 斯德哥尔摩 瑞典 莫斯科 俄罗斯
460 | 斯德哥尔摩 瑞典 奥斯陆 挪威
461 | 斯德哥尔摩 瑞典 渥太华 加拿大
462 | 斯德哥尔摩 瑞典 巴黎 法国
463 | 斯德哥尔摩 瑞典 罗马 意大利
464 | 德黑兰 伊朗 东京 日本
465 | 德黑兰 伊朗 雅典 希腊
466 | 德黑兰 伊朗 巴格达 伊拉克
467 | 德黑兰 伊朗 曼谷 泰国
468 | 德黑兰 伊朗 北京 中国
469 | 德黑兰 伊朗 柏林 德国
470 | 德黑兰 伊朗 伯尔尼 瑞士
471 | 德黑兰 伊朗 开罗 埃及
472 | 德黑兰 伊朗 堪培拉 澳大利亚
473 | 德黑兰 伊朗 河内 越南
474 | 德黑兰 伊朗 哈瓦那 古巴
475 | 德黑兰 伊朗 赫尔辛基 芬兰
476 | 德黑兰 伊朗 伊斯兰堡 巴基斯坦
477 | 德黑兰 伊朗 喀布尔 阿富汗
478 | 德黑兰 伊朗 伦敦 英国
479 | 德黑兰 伊朗 马德里 西班牙
480 | 德黑兰 伊朗 莫斯科 俄罗斯
481 | 德黑兰 伊朗 奥斯陆 挪威
482 | 德黑兰 伊朗 渥太华 加拿大
483 | 德黑兰 伊朗 巴黎 法国
484 | 德黑兰 伊朗 罗马 意大利
485 | 德黑兰 伊朗 斯德哥尔摩 瑞典
486 | 东京 日本 雅典 希腊
487 | 东京 日本 巴格达 伊拉克
488 | 东京 日本 曼谷 泰国
489 | 东京 日本 北京 中国
490 | 东京 日本 柏林 德国
491 | 东京 日本 伯尔尼 瑞士
492 | 东京 日本 开罗 埃及
493 | 东京 日本 堪培拉 澳大利亚
494 | 东京 日本 河内 越南
495 | 东京 日本 哈瓦那 古巴
496 | 东京 日本 赫尔辛基 芬兰
497 | 东京 日本 伊斯兰堡 巴基斯坦
498 | 东京 日本 喀布尔 阿富汗
499 | 东京 日本 伦敦 英国
500 | 东京 日本 马德里 西班牙
501 | 东京 日本 莫斯科 俄罗斯
502 | 东京 日本 奥斯陆 挪威
503 | 东京 日本 渥太华 加拿大
504 | 东京 日本 巴黎 法国
505 | 东京 日本 罗马 意大利
506 | 东京 日本 斯德哥尔摩 瑞典
507 | 东京 日本 德黑兰 伊朗
508 | : city-in-state
509 | 石家庄 河北 南昌 江西
510 | 石家庄 河北 海口 海南
511 | 石家庄 河北 兰州 甘肃
512 | 石家庄 河北 西宁 青海
513 | 太原 山西 南昌 江西
514 | 太原 山西 广州 广东
515 | 太原 山西 西宁 青海
516 | 沈阳 辽宁 哈尔滨 黑龙江
517 | 沈阳 辽宁 杭州 浙江
518 | 沈阳 辽宁 南昌 江西
519 | 沈阳 辽宁 贵阳 贵州
520 | 沈阳 辽宁 兰州 甘肃
521 | 沈阳 辽宁 南宁 广西
522 | 沈阳 辽宁 银川 宁夏
523 | 长春 吉林 石家庄 河北
524 | 长春 吉林 哈尔滨 黑龙江
525 | 长春 吉林 南京 江苏
526 | 长春 吉林 杭州 浙江
527 | 长春 吉林 合肥 安徽
528 | 长春 吉林 南昌 江西
529 | 长春 吉林 广州 广东
530 | 长春 吉林 贵阳 贵州
531 | 长春 吉林 西安 陕西
532 | 长春 吉林 呼和浩特 内蒙古
533 | 哈尔滨 黑龙江 南京 江苏
534 | 哈尔滨 黑龙江 南昌 江西
535 | 哈尔滨 黑龙江 贵阳 贵州
536 | 哈尔滨 黑龙江 昆明 云南
537 | 哈尔滨 黑龙江 南宁 广西
538 | 南京 江苏 杭州 浙江
539 | 南京 江苏 合肥 安徽
540 | 南京 江苏 福州 福建
541 | 南京 江苏 郑州 河南
542 | 南京 江苏 广州 广东
543 | 南京 江苏 成都 四川
544 | 南京 江苏 贵阳 贵州
545 | 南京 江苏 西安 陕西
546 | 南京 江苏 呼和浩特 内蒙古
547 | 杭州 浙江 广州 广东
548 | 杭州 浙江 海口 海南
549 | 杭州 浙江 西宁 青海
550 | 杭州 浙江 南宁 广西
551 | 合肥 安徽 太原 山西
552 | 合肥 安徽 沈阳 辽宁
553 | 合肥 安徽 长春 吉林
554 | 合肥 安徽 杭州 浙江
555 | 合肥 安徽 成都 四川
556 | 合肥 安徽 兰州 甘肃
557 | 福州 福建 石家庄 河北
558 | 福州 福建 南昌 江西
559 | 福州 福建 郑州 河南
560 | 福州 福建 贵阳 贵州
561 | 福州 福建 昆明 云南
562 | 福州 福建 乌鲁木齐 新疆
563 | 南昌 江西 长春 吉林
564 | 南昌 江西 福州 福建
565 | 南昌 江西 海口 海南
566 | 南昌 江西 银川 宁夏
567 | 济南 山东 太原 山西
568 | 济南 山东 杭州 浙江
569 | 济南 山东 合肥 安徽
570 | 济南 山东 长沙 湖南
571 | 济南 山东 海口 海南
572 | 济南 山东 贵阳 贵州
573 | 济南 山东 西安 陕西
574 | 郑州 河南 长春 吉林
575 | 郑州 河南 福州 福建
576 | 郑州 河南 武汉 湖北
577 | 郑州 河南 长沙 湖南
578 | 郑州 河南 成都 四川
579 | 郑州 河南 昆明 云南
580 | 郑州 河南 兰州 甘肃
581 | 郑州 河南 银川 宁夏
582 | 武汉 湖北 沈阳 辽宁
583 | 武汉 湖北 杭州 浙江
584 | 武汉 湖北 西安 陕西
585 | 武汉 湖北 兰州 甘肃
586 | 武汉 湖北 西宁 青海
587 | 武汉 湖北 拉萨 西藏
588 | 武汉 湖北 银川 宁夏
589 | 长沙 湖南 合肥 安徽
590 | 长沙 湖南 济南 山东
591 | 长沙 湖南 广州 广东
592 | 长沙 湖南 拉萨 西藏
593 | 广州 广东 石家庄 河北
594 | 广州 广东 沈阳 辽宁
595 | 广州 广东 南京 江苏
596 | 广州 广东 杭州 浙江
597 | 广州 广东 福州 福建
598 | 广州 广东 南昌 江西
599 | 广州 广东 济南 山东
600 | 广州 广东 拉萨 西藏
601 | 广州 广东 呼和浩特 内蒙古
602 | 海口 海南 南京 江苏
603 | 海口 海南 济南 山东
604 | 海口 海南 武汉 湖北
605 | 海口 海南 长沙 湖南
606 | 海口 海南 西安 陕西
607 | 成都 四川 太原 山西
608 | 成都 四川 哈尔滨 黑龙江
609 | 成都 四川 南京 江苏
610 | 成都 四川 杭州 浙江
611 | 成都 四川 长沙 湖南
612 | 成都 四川 兰州 甘肃
613 | 成都 四川 南宁 广西
614 | 成都 四川 呼和浩特 内蒙古
615 | 成都 四川 银川 宁夏
616 | 贵阳 贵州 石家庄 河北
617 | 贵阳 贵州 太原 山西
618 | 贵阳 贵州 哈尔滨 黑龙江
619 | 贵阳 贵州 南昌 江西
620 | 贵阳 贵州 济南 山东
621 | 贵阳 贵州 广州 广东
622 | 贵阳 贵州 西安 陕西
623 | 贵阳 贵州 拉萨 西藏
624 | 昆明 云南 长春 吉林
625 | 昆明 云南 杭州 浙江
626 | 昆明 云南 合肥 安徽
627 | 昆明 云南 济南 山东
628 | 昆明 云南 武汉 湖北
629 | 昆明 云南 广州 广东
630 | 昆明 云南 兰州 甘肃
631 | 昆明 云南 西宁 青海
632 | 昆明 云南 呼和浩特 内蒙古
633 | 昆明 云南 乌鲁木齐 新疆
634 | 西安 陕西 石家庄 河北
635 | 西安 陕西 哈尔滨 黑龙江
636 | 西安 陕西 南京 江苏
637 | 西安 陕西 武汉 湖北
638 | 西安 陕西 海口 海南
639 | 西安 陕西 贵阳 贵州
640 | 西安 陕西 呼和浩特 内蒙古
641 | 兰州 甘肃 武汉 湖北
642 | 兰州 甘肃 海口 海南
643 | 兰州 甘肃 西宁 青海
644 | 兰州 甘肃 拉萨 西藏
645 | 兰州 甘肃 南宁 广西
646 | 兰州 甘肃 呼和浩特 内蒙古
647 | 兰州 甘肃 银川 宁夏
648 | 西宁 青海 哈尔滨 黑龙江
649 | 西宁 青海 南京 江苏
650 | 西宁 青海 杭州 浙江
651 | 西宁 青海 济南 山东
652 | 西宁 青海 成都 四川
653 | 西宁 青海 贵阳 贵州
654 | 西宁 青海 南宁 广西
655 | 西宁 青海 银川 宁夏
656 | 拉萨 西藏 石家庄 河北
657 | 拉萨 西藏 哈尔滨 黑龙江
658 | 拉萨 西藏 福州 福建
659 | 拉萨 西藏 郑州 河南
660 | 拉萨 西藏 长沙 湖南
661 | 拉萨 西藏 贵阳 贵州
662 | 拉萨 西藏 西宁 青海
663 | 南宁 广西 杭州 浙江
664 | 南宁 广西 福州 福建
665 | 南宁 广西 南昌 江西
666 | 南宁 广西 成都 四川
667 | 南宁 广西 昆明 云南
668 | 呼和浩特 内蒙古 太原 山西
669 | 呼和浩特 内蒙古 昆明 云南
670 | 呼和浩特 内蒙古 西安 陕西
671 | 呼和浩特 内蒙古 兰州 甘肃
672 | 呼和浩特 内蒙古 拉萨 西藏
673 | 银川 宁夏 福州 福建
674 | 银川 宁夏 拉萨 西藏
675 | 乌鲁木齐 新疆 石家庄 河北
676 | 乌鲁木齐 新疆 沈阳 辽宁
677 | 乌鲁木齐 新疆 哈尔滨 黑龙江
678 | 乌鲁木齐 新疆 合肥 安徽
679 | 乌鲁木齐 新疆 广州 广东
680 | 乌鲁木齐 新疆 成都 四川
681 | 乌鲁木齐 新疆 西安 陕西
682 | 乌鲁木齐 新疆 兰州 甘肃
683 | 乌鲁木齐 新疆 南宁 广西
684 | : family
685 | 男孩 女孩 兄弟 姐妹
686 | 男孩 女孩 爸爸 妈妈
687 | 男孩 女孩 父亲 母亲
688 | 男孩 女孩 祖父 祖母
689 | 男孩 女孩 爷爷 奶奶
690 | 男孩 女孩 孙子 孙女
691 | 男孩 女孩 新郎 新娘
692 | 男孩 女孩 丈夫 妻子
693 | 男孩 女孩 国王 王后
694 | 男孩 女孩 男人 女人
695 | 男孩 女孩 侄子 侄女
696 | 男孩 女孩 王子 公主
697 | 男孩 女孩 儿子 女儿
698 | 男孩 女孩 继父 继母
699 | 男孩 女孩 继子 继女
700 | 男孩 女孩 叔叔 阿姨
701 | 兄弟 姐妹 爸爸 妈妈
702 | 兄弟 姐妹 父亲 母亲
703 | 兄弟 姐妹 祖父 祖母
704 | 兄弟 姐妹 爷爷 奶奶
705 | 兄弟 姐妹 孙子 孙女
706 | 兄弟 姐妹 新郎 新娘
707 | 兄弟 姐妹 丈夫 妻子
708 | 兄弟 姐妹 国王 王后
709 | 兄弟 姐妹 男人 女人
710 | 兄弟 姐妹 侄子 侄女
711 | 兄弟 姐妹 王子 公主
712 | 兄弟 姐妹 儿子 女儿
713 | 兄弟 姐妹 继父 继母
714 | 兄弟 姐妹 继子 继女
715 | 兄弟 姐妹 叔叔 阿姨
716 | 兄弟 姐妹 男孩 女孩
717 | 爸爸 妈妈 父亲 母亲
718 | 爸爸 妈妈 祖父 祖母
719 | 爸爸 妈妈 爷爷 奶奶
720 | 爸爸 妈妈 孙子 孙女
721 | 爸爸 妈妈 新郎 新娘
722 | 爸爸 妈妈 丈夫 妻子
723 | 爸爸 妈妈 国王 王后
724 | 爸爸 妈妈 男人 女人
725 | 爸爸 妈妈 侄子 侄女
726 | 爸爸 妈妈 王子 公主
727 | 爸爸 妈妈 儿子 女儿
728 | 爸爸 妈妈 继父 继母
729 | 爸爸 妈妈 继子 继女
730 | 爸爸 妈妈 叔叔 阿姨
731 | 爸爸 妈妈 男孩 女孩
732 | 爸爸 妈妈 兄弟 姐妹
733 | 父亲 母亲 祖父 祖母
734 | 父亲 母亲 爷爷 奶奶
735 | 父亲 母亲 孙子 孙女
736 | 父亲 母亲 新郎 新娘
737 | 父亲 母亲 丈夫 妻子
738 | 父亲 母亲 国王 王后
739 | 父亲 母亲 男人 女人
740 | 父亲 母亲 侄子 侄女
741 | 父亲 母亲 王子 公主
742 | 父亲 母亲 儿子 女儿
743 | 父亲 母亲 继父 继母
744 | 父亲 母亲 继子 继女
745 | 父亲 母亲 叔叔 阿姨
746 | 父亲 母亲 男孩 女孩
747 | 父亲 母亲 兄弟 姐妹
748 | 父亲 母亲 爸爸 妈妈
749 | 祖父 祖母 爷爷 奶奶
750 | 祖父 祖母 孙子 孙女
751 | 祖父 祖母 新郎 新娘
752 | 祖父 祖母 丈夫 妻子
753 | 祖父 祖母 国王 王后
754 | 祖父 祖母 男人 女人
755 | 祖父 祖母 侄子 侄女
756 | 祖父 祖母 王子 公主
757 | 祖父 祖母 儿子 女儿
758 | 祖父 祖母 继父 继母
759 | 祖父 祖母 继子 继女
760 | 祖父 祖母 叔叔 阿姨
761 | 祖父 祖母 男孩 女孩
762 | 祖父 祖母 兄弟 姐妹
763 | 祖父 祖母 爸爸 妈妈
764 | 祖父 祖母 父亲 母亲
765 | 爷爷 奶奶 孙子 孙女
766 | 爷爷 奶奶 新郎 新娘
767 | 爷爷 奶奶 丈夫 妻子
768 | 爷爷 奶奶 国王 王后
769 | 爷爷 奶奶 男人 女人
770 | 爷爷 奶奶 侄子 侄女
771 | 爷爷 奶奶 王子 公主
772 | 爷爷 奶奶 儿子 女儿
773 | 爷爷 奶奶 继父 继母
774 | 爷爷 奶奶 继子 继女
775 | 爷爷 奶奶 叔叔 阿姨
776 | 爷爷 奶奶 男孩 女孩
777 | 爷爷 奶奶 兄弟 姐妹
778 | 爷爷 奶奶 爸爸 妈妈
779 | 爷爷 奶奶 父亲 母亲
780 | 爷爷 奶奶 祖父 祖母
781 | 孙子 孙女 新郎 新娘
782 | 孙子 孙女 丈夫 妻子
783 | 孙子 孙女 国王 王后
784 | 孙子 孙女 男人 女人
785 | 孙子 孙女 侄子 侄女
786 | 孙子 孙女 王子 公主
787 | 孙子 孙女 儿子 女儿
788 | 孙子 孙女 继父 继母
789 | 孙子 孙女 继子 继女
790 | 孙子 孙女 叔叔 阿姨
791 | 孙子 孙女 男孩 女孩
792 | 孙子 孙女 兄弟 姐妹
793 | 孙子 孙女 爸爸 妈妈
794 | 孙子 孙女 父亲 母亲
795 | 孙子 孙女 祖父 祖母
796 | 孙子 孙女 爷爷 奶奶
797 | 新郎 新娘 丈夫 妻子
798 | 新郎 新娘 国王 王后
799 | 新郎 新娘 男人 女人
800 | 新郎 新娘 侄子 侄女
801 | 新郎 新娘 王子 公主
802 | 新郎 新娘 儿子 女儿
803 | 新郎 新娘 继父 继母
804 | 新郎 新娘 继子 继女
805 | 新郎 新娘 叔叔 阿姨
806 | 新郎 新娘 男孩 女孩
807 | 新郎 新娘 兄弟 姐妹
808 | 新郎 新娘 爸爸 妈妈
809 | 新郎 新娘 父亲 母亲
810 | 新郎 新娘 祖父 祖母
811 | 新郎 新娘 爷爷 奶奶
812 | 新郎 新娘 孙子 孙女
813 | 丈夫 妻子 国王 王后
814 | 丈夫 妻子 男人 女人
815 | 丈夫 妻子 侄子 侄女
816 | 丈夫 妻子 王子 公主
817 | 丈夫 妻子 儿子 女儿
818 | 丈夫 妻子 继父 继母
819 | 丈夫 妻子 继子 继女
820 | 丈夫 妻子 叔叔 阿姨
821 | 丈夫 妻子 男孩 女孩
822 | 丈夫 妻子 兄弟 姐妹
823 | 丈夫 妻子 爸爸 妈妈
824 | 丈夫 妻子 父亲 母亲
825 | 丈夫 妻子 祖父 祖母
826 | 丈夫 妻子 爷爷 奶奶
827 | 丈夫 妻子 孙子 孙女
828 | 丈夫 妻子 新郎 新娘
829 | 国王 王后 男人 女人
830 | 国王 王后 侄子 侄女
831 | 国王 王后 王子 公主
832 | 国王 王后 儿子 女儿
833 | 国王 王后 继父 继母
834 | 国王 王后 继子 继女
835 | 国王 王后 叔叔 阿姨
836 | 国王 王后 男孩 女孩
837 | 国王 王后 兄弟 姐妹
838 | 国王 王后 爸爸 妈妈
839 | 国王 王后 父亲 母亲
840 | 国王 王后 祖父 祖母
841 | 国王 王后 爷爷 奶奶
842 | 国王 王后 孙子 孙女
843 | 国王 王后 新郎 新娘
844 | 国王 王后 丈夫 妻子
845 | 男人 女人 侄子 侄女
846 | 男人 女人 王子 公主
847 | 男人 女人 儿子 女儿
848 | 男人 女人 继父 继母
849 | 男人 女人 继子 继女
850 | 男人 女人 叔叔 阿姨
851 | 男人 女人 男孩 女孩
852 | 男人 女人 兄弟 姐妹
853 | 男人 女人 爸爸 妈妈
854 | 男人 女人 父亲 母亲
855 | 男人 女人 祖父 祖母
856 | 男人 女人 爷爷 奶奶
857 | 男人 女人 孙子 孙女
858 | 男人 女人 新郎 新娘
859 | 男人 女人 丈夫 妻子
860 | 男人 女人 国王 王后
861 | 侄子 侄女 王子 公主
862 | 侄子 侄女 儿子 女儿
863 | 侄子 侄女 继父 继母
864 | 侄子 侄女 继子 继女
865 | 侄子 侄女 叔叔 阿姨
866 | 侄子 侄女 男孩 女孩
867 | 侄子 侄女 兄弟 姐妹
868 | 侄子 侄女 爸爸 妈妈
869 | 侄子 侄女 父亲 母亲
870 | 侄子 侄女 祖父 祖母
871 | 侄子 侄女 爷爷 奶奶
872 | 侄子 侄女 孙子 孙女
873 | 侄子 侄女 新郎 新娘
874 | 侄子 侄女 丈夫 妻子
875 | 侄子 侄女 国王 王后
876 | 侄子 侄女 男人 女人
877 | 王子 公主 儿子 女儿
878 | 王子 公主 继父 继母
879 | 王子 公主 继子 继女
880 | 王子 公主 叔叔 阿姨
881 | 王子 公主 男孩 女孩
882 | 王子 公主 兄弟 姐妹
883 | 王子 公主 爸爸 妈妈
884 | 王子 公主 父亲 母亲
885 | 王子 公主 祖父 祖母
886 | 王子 公主 爷爷 奶奶
887 | 王子 公主 孙子 孙女
888 | 王子 公主 新郎 新娘
889 | 王子 公主 丈夫 妻子
890 | 王子 公主 国王 王后
891 | 王子 公主 男人 女人
892 | 王子 公主 侄子 侄女
893 | 儿子 女儿 继父 继母
894 | 儿子 女儿 继子 继女
895 | 儿子 女儿 叔叔 阿姨
896 | 儿子 女儿 男孩 女孩
897 | 儿子 女儿 兄弟 姐妹
898 | 儿子 女儿 爸爸 妈妈
899 | 儿子 女儿 父亲 母亲
900 | 儿子 女儿 祖父 祖母
901 | 儿子 女儿 爷爷 奶奶
902 | 儿子 女儿 孙子 孙女
903 | 儿子 女儿 新郎 新娘
904 | 儿子 女儿 丈夫 妻子
905 | 儿子 女儿 国王 王后
906 | 儿子 女儿 男人 女人
907 | 儿子 女儿 侄子 侄女
908 | 儿子 女儿 王子 公主
909 | 继父 继母 继子 继女
910 | 继父 继母 叔叔 阿姨
911 | 继父 继母 男孩 女孩
912 | 继父 继母 兄弟 姐妹
913 | 继父 继母 爸爸 妈妈
914 | 继父 继母 父亲 母亲
915 | 继父 继母 祖父 祖母
916 | 继父 继母 爷爷 奶奶
917 | 继父 继母 孙子 孙女
918 | 继父 继母 新郎 新娘
919 | 继父 继母 丈夫 妻子
920 | 继父 继母 国王 王后
921 | 继父 继母 男人 女人
922 | 继父 继母 侄子 侄女
923 | 继父 继母 王子 公主
924 | 继父 继母 儿子 女儿
925 | 继子 继女 叔叔 阿姨
926 | 继子 继女 男孩 女孩
927 | 继子 继女 兄弟 姐妹
928 | 继子 继女 爸爸 妈妈
929 | 继子 继女 父亲 母亲
930 | 继子 继女 祖父 祖母
931 | 继子 继女 爷爷 奶奶
932 | 继子 继女 孙子 孙女
933 | 继子 继女 新郎 新娘
934 | 继子 继女 丈夫 妻子
935 | 继子 继女 国王 王后
936 | 继子 继女 男人 女人
937 | 继子 继女 侄子 侄女
938 | 继子 继女 王子 公主
939 | 继子 继女 儿子 女儿
940 | 继子 继女 继父 继母
941 | 叔叔 阿姨 男孩 女孩
942 | 叔叔 阿姨 兄弟 姐妹
943 | 叔叔 阿姨 爸爸 妈妈
944 | 叔叔 阿姨 父亲 母亲
945 | 叔叔 阿姨 祖父 祖母
946 | 叔叔 阿姨 爷爷 奶奶
947 | 叔叔 阿姨 孙子 孙女
948 | 叔叔 阿姨 新郎 新娘
949 | 叔叔 阿姨 丈夫 妻子
950 | 叔叔 阿姨 国王 王后
951 | 叔叔 阿姨 男人 女人
952 | 叔叔 阿姨 侄子 侄女
953 | 叔叔 阿姨 王子 公主
954 | 叔叔 阿姨 儿子 女儿
955 | 叔叔 阿姨 继父 继母
956 | 叔叔 阿姨 继子 继女


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Chinese Word Vectors 中文词向量
  2 | This project provides 100+ Chinese Word Vectors (embeddings) trained with different **representations** (dense and sparse), **context features** (word, ngram, character, and more), and **corpora**. One can easily obtain pre-trained vectors with different properties and use them for downstream tasks. 
  3 | 
  4 | Moreover, we provide a Chinese analogical reasoning dataset **CA8** and an evaluation toolkit for users to evaluate the quality of their word vectors.
  5 | 
  6 | ## Reference
  7 | Please cite the paper, if using these embeddings and CA8 dataset.
  8 | 
  9 | Shen Li, Zhe Zhao, Renfen Hu, Wensi Li, Tao Liu, Xiaoyong Du, <a href="http://aclweb.org/anthology/P18-2023"><em>Analogical Reasoning on Chinese Morphological and Semantic Relations</em></a>, ACL 2018.
 10 | 
 11 | ```
 12 | @InProceedings{P18-2023,
 13 |   author =  "Li, Shen
 14 |     and Zhao, Zhe
 15 |     and Hu, Renfen
 16 |     and Li, Wensi
 17 |     and Liu, Tao
 18 |     and Du, Xiaoyong",
 19 |   title =   "Analogical Reasoning on Chinese Morphological and Semantic Relations",
 20 |   booktitle =   "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
 21 |   year =  "2018",
 22 |   publisher =   "Association for Computational Linguistics",
 23 |   pages =   "138--143",
 24 |   location =  "Melbourne, Australia",
 25 |   url =   "http://aclweb.org/anthology/P18-2023"
 26 | }
 27 | ```
 28 | 
 29 | &nbsp;
 30 | 
 31 | A detailed analysis of the relation between the intrinsic and extrinsic evaluations of Chinese word embeddings is shown in the paper:
 32 | 
 33 | Yuanyuan Qiu, Hongzheng Li, Shen Li, Yingdi Jiang, Renfen Hu, Lijiao Yang. <a href="http://www.cips-cl.org/static/anthology/CCL-2018/CCL-18-086.pdf"><em>Revisiting Correlations between Intrinsic and Extrinsic Evaluations of Word Embeddings</em></a>. Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data. Springer, Cham, 2018. 209-221. (CCL & NLP-NABD 2018 Best Paper)
 34 | 
 35 | ```
 36 | @incollection{qiu2018revisiting,
 37 |   title={Revisiting Correlations between Intrinsic and Extrinsic Evaluations of Word Embeddings},
 38 |   author={Qiu, Yuanyuan and Li, Hongzheng and Li, Shen and Jiang, Yingdi and Hu, Renfen and Yang, Lijiao},
 39 |   booktitle={Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data},
 40 |   pages={209--221},
 41 |   year={2018},
 42 |   publisher={Springer}
 43 | }
 44 | ```
 45 | 
 46 | ## Format
 47 | The pre-trained vector files are in text format. Each line contains a word and its vector. Each value is separated by space. The first line records the meta information: the first number indicates the number of words in the file and the second indicates the dimension size. 
 48 | 
 49 | Besides dense word vectors (trained with SGNS), we also provide sparse vectors (trained with PPMI). They are in the same format with liblinear, where the number before " : " denotes dimension index and the number after the " : " denotes the value. 
 50 | 
 51 | ## Pre-trained Chinese Word Vectors
 52 | 
 53 | ### Basic Settings
 54 | 
 55 | <table>
 56 |   <tr align="center">
 57 |     <td><b>Window Size</b></td>
 58 |     <td><b>Dynamic Window</b></td>
 59 |     <td><b>Sub-sampling</b></td>
 60 |     <td><b>Low-Frequency Word</b></td>
 61 |     <td><b>Iteration</b></td>
 62 |     <td><b>Negative Sampling<sup>*</sup></b></td>
 63 |   </tr>
 64 |   <tr align="center">
 65 |     <td>5</td>
 66 |     <td>Yes</td>
 67 |     <td>1e-5</td>
 68 |     <td>10</td>
 69 |     <td>5</td>
 70 |     <td>5</td>
 71 |   </tr>
 72 | </table>
 73 | 
 74 | <sup>\*</sup>Only for SGNS.
 75 | 
 76 | ### Various Domains
 77 | 
 78 | Chinese Word Vectors trained with different representations, context features, and corpora.
 79 | 
 80 | <table align="center">
 81 |     <tr align="center">
 82 |         <td colspan="5"><b>Word2vec / Skip-Gram with Negative Sampling (SGNS)</b></td>
 83 |     </tr>
 84 |     <tr align="center">
 85 |         <td rowspan="2">Corpus</td>
 86 |         <td colspan="4">Context Features</td>
 87 |     </tr>
 88 |     <tr  align="center">
 89 |       <td>Word</td>
 90 |       <td>Word + Ngram</td>
 91 |       <td>Word + Character</td>
 92 |       <td>Word + Character + Ngram</td>
 93 |     </tr>
 94 |     <tr  align="center">
 95 |       <td>Baidu Encyclopedia 百度百科</td>
 96 |       <td><a href="https://pan.baidu.com/s/1Rn7LtTH0n7SHyHPfjRHbkg">300d</a></td>
 97 |       <td><a href="https://pan.baidu.com/s/1XEmP_0FkQwOjipCjI2OPEw">300d</a></td>
 98 |       <td><a href="https://pan.baidu.com/s/1eeCS7uD3e_qVN8rPwmXhAw">300d</a></td>
 99 |       <td><a href="https://pan.baidu.com/s/1IiIbQGJ_AooTj5s8aZYcvA">300d</a> / PWD: 5555</td>
100 |     </tr>
101 |     <tr  align="center">
102 |       <td>Wikipedia_zh 中文维基百科</td>
103 |       <td><a href="https://pan.baidu.com/s/1AmXYWVgkxrG4GokevPtNgA">300d</a></td>
104 |       <td><a href="https://pan.baidu.com/s/1ZKePwxwsDdzNrfkc6WKdGQ">300d</a></td>
105 |       <td><a href="https://pan.baidu.com/s/1ZBVVD4mUSUuXOxlZ3V71ZA">300d</a></td>
106 |       <td><a href="https://pan.baidu.com/s/19wQrclyynOnco3JBvnI5pA">300d</td>
107 |     </tr>
108 |     <tr  align="center">
109 |       <td>People's Daily News 人民日报</td>
110 |       <td><a href="https://pan.baidu.com/s/19sqMz-JAhhxh3o6ecvQxQw">300d</a></td>
111 |       <td><a href="https://pan.baidu.com/s/1upPkA8KJnxTZBfjuNDtaeQ">300d</a></td>
112 |       <td><a href="https://pan.baidu.com/s/1BvKk2QjbtQMch7EISppW2A">300d</a></td>
113 |       <td><a href="https://pan.baidu.com/s/19Vso_k79FZb5OZCWQPAnFQ">300d</a></td>
114 |     </tr>
115 |     <tr  align="center">
116 |       <td>Sogou News 搜狗新闻</td>
117 |       <td><a href="https://pan.baidu.com/s/1tUghuTno5yOvOx4LXA9-wg">300d</a></td>
118 |       <td><a href="https://pan.baidu.com/s/13yVrXeGYkxdGW3P6juiQmA">300d</a></td>
119 |       <td><a href="https://pan.baidu.com/s/1pUqyn7mnPcUmzxT64gGpSw">300d</a></td>
120 |       <td><a href="https://pan.baidu.com/s/1svFOwFBKnnlsqrF1t99Lnw">300d</a></td>
121 |     </tr>
122 |     <tr  align="center">
123 |       <td>Financial News 金融新闻</td>
124 |       <td><a href="https://pan.baidu.com/s/1EhtsbDa3ekzZPODWNLHcXA">300d</a></td>
125 |       <td><a href="https://pan.baidu.com/s/1FcPHv7S4vUgnL7WeWf4_PA">300d</a></td>
126 |       <td><a href="https://pan.baidu.com/s/13CAxY5ffRFuOcHZu8VmArw">300d</a></td>
127 |       <td><a href="https://pan.baidu.com/s/1sqvrUtGBAZ7YWEsGz41DRQ">300d</a></td>
128 |     </tr>
129 |     <tr  align="center">
130 |       <td>Zhihu_QA 知乎问答 </td>
131 |       <td><a href="https://pan.baidu.com/s/1VGOs0RH7DXE5vRrtw6boQA">300d</a></td>
132 |       <td><a href="https://pan.baidu.com/s/1OQ6fQLCgqT43WTwh5fh_lg">300d</a></td>
133 |       <td><a href="https://pan.baidu.com/s/1_xogqF9kJT6tmQHSAYrYeg">300d</a></td>
134 |       <td><a href="https://pan.baidu.com/s/1Fo27Lv_0nz8FXg-xbOz14Q">300d</a></td>
135 |     </tr>
136 |     <tr  align="center">
137 |       <td>Weibo 微博</td>
138 |       <td><a href="https://pan.baidu.com/s/1zbuUJEEEpZRNHxZ7Gezzmw">300d</a></td>
139 |       <td><a href="https://pan.baidu.com/s/11PWBcvruXEDvKf2TiIXntg">300d</a></td>
140 |       <td><a href="https://pan.baidu.com/s/10bhJpaXMCUK02nHvRAttqA">300d</a></td>
141 |       <td><a href="https://pan.baidu.com/s/1FHl_bQkYucvVk-j2KG4dxA">300d</a></td>
142 |     </tr>
143 |     <tr  align="center">
144 |       <td>Literature 文学作品</td>
145 |       <td><a href="https://pan.baidu.com/s/1ciq8iXtcrHpu3ir_VhK0zg">300d</a></td>
146 |       <td><a href="https://pan.baidu.com/s/1Oa4CkPd8o2xd6LEAaa4gmg">300d</a> / PWD: z5b4</td>
147 |       <td><a href="https://pan.baidu.com/s/1IG8IxNp2s7vVklz-vyZR9A">300d</a></td>
148 |       <td><a href="https://pan.baidu.com/s/1SEOKrJYS14HpqIaQT462kA">300d</a> / PWD: yenb</td>
149 |     </tr>
150 |     <tr  align="center">
151 |       <td>Complete Library in Four Sections<br />四库全书<sup>*</sup></td>
152 |       <td><a href="https://pan.baidu.com/s/1vPSeUsSiWYXEWAuokLR0qQ">300d</a></td>
153 |       <td><a href="https://pan.baidu.com/s/1sS9E7sclvS_UZcBgHN7xLQ">300d</a></td>
154 |       <td>NAN</td>
155 |       <td>NAN</td>
156 |     </tr>
157 |     <tr  align="center">
158 |       <td>Mixed-large 综合<br>Baidu Netdisk / Google Drive</td>
159 |       <td>
160 |         <a href="https://pan.baidu.com/s/1luy-GlTdqqvJ3j-A4FcIOw">300d</a><br>
161 |         <a href="https://drive.google.com/open?id=1Zh9ZCEu8_eSQ-qkYVQufQDNKPC4mtEKR">300d</a>
162 |       </td>
163 |       <td>
164 |         <a href="https://pan.baidu.com/s/1oJol-GaRMk4-8Ejpzxo6Gw">300d</a><br>
165 |         <a href="https://drive.google.com/open?id=1WUU9LnoAjs--1E_WqcghLJ-Pp8bb38oS">300d</a>
166 |       </td>
167 |       <td>
168 |         <a href="https://pan.baidu.com/s/1DjIGENlhRbsVyHW-caRePg">300d</a><br>
169 |         <a href="https://drive.google.com/open?id=1aVAK0Z2E5DkdIH6-JHbiWSL5dbAcz6c3">300d</a>
170 |       </td>
171 |       <td>
172 |         <a href="https://pan.baidu.com/s/14JP1gD7hcmsWdSpTvA3vKA">300d</a><br>
173 |         <a href="https://drive.google.com/open?id=1kSAl4_AOg3_6ayU7KRM0Nk66uGdSZdnk">300d</a>
174 |       </td>
175 |     </tr>
176 | </table>
177 | 
178 | <table align="center">
179 |     <tr align="center">
180 |         <td colspan="5"><b>Positive Pointwise Mutual Information (PPMI)</b></td>
181 |     </tr>
182 |     <tr align="center">
183 |         <td rowspan="2">Corpus</td>
184 |         <td colspan="4">Context Features</td>
185 |     </tr>
186 |     <tr  align="center">
187 |       <td>Word</td>
188 |       <td>Word + Ngram</td>
189 |       <td>Word + Character</td>
190 |       <td>Word + Character + Ngram</td>
191 |     </tr>
192 |     <tr  align="center">
193 |       <td>Baidu Encyclopedia 百度百科</td>
194 |       <td><a href="https://pan.baidu.com/s/1_itcjrQawCwcURa7WZLPOA">Sparse</a></td>
195 |       <td><a href="https://pan.baidu.com/s/1cEZzN1S2senwWSyHOnL7YQ">Sparse</a></td>
196 |       <td><a href="https://pan.baidu.com/s/1KcfFdyO0-kE9S9CwzIisfw">Sparse</a></td>
197 |       <td><a href="https://pan.baidu.com/s/1FXYM3CY161_4QMgiH8vasQ">Sparse</a></td>
198 |     </tr>
199 |     <tr  align="center">
200 |       <td>Wikipedia_zh 中文维基百科</td>
201 |       <td><a href="https://pan.baidu.com/s/1MGXRrc54nITPzQ7sfEUjMA">Sparse</a></td>
202 |       <td><a href="https://pan.baidu.com/s/1mtxZna8UJ7xBIxhBFntumQ">Sparse</a></td>
203 |       <td><a href="https://pan.baidu.com/s/1dDImpAx41V73Byl2julOGA">Sparse</a></td>
204 |       <td><a href="https://pan.baidu.com/s/1bsBQHXFpxMHGBexYof1_rw">Sparse</a></td>
205 |     </tr>
206 |     <tr  align="center">
207 |       <td>People's Daily News 人民日报</td>
208 |       <td><a href="https://pan.baidu.com/s/1NLr1K7aapU2sYBvzbVny5g">Sparse</a></td>
209 |       <td><a href="https://pan.baidu.com/s/1LJl3Br0ccGDHP0XX2k3pVw">Sparse</a></td>
210 |       <td><a href="https://pan.baidu.com/s/1GQQXGMn1AHh-BlifT0JD2g">Sparse</a></td>
211 |       <td><a href="https://pan.baidu.com/s/1Xm9Ec3O3rJ6ayrwVwonC7g">Sparse</a></td>
212 |     </tr>
213 |     <tr  align="center">
214 |       <td>Sogou News 搜狗新闻</td>
215 |       <td><a href="https://pan.baidu.com/s/1ECA51CZLp9_JB_me7YZ9-Q">Sparse</a></td>
216 |       <td><a href="https://pan.baidu.com/s/1FO39ZYy1mStERf_b53Y_yQ">Sparse</a></td>
217 |       <td><a href="https://pan.baidu.com/s/1lLBFBk8nn3spFAvKY9IJ6A">Sparse</a></td>
218 |       <td><a href="https://pan.baidu.com/s/1f-dLQZlZo_-B5ZKcPIc6rw">Sparse</a></td>
219 |     </tr>
220 |     <tr  align="center">
221 |       <td>Financial News 金融新闻</td>
222 |       <td><a href="https://pan.baidu.com/s/10wtgdmrTsTrjpSDvI0KzOw">Sparse</a></td>
223 |       <td><a href="https://pan.baidu.com/s/1b6zjvhOIqTdACSSbriisVw">Sparse</a></td>
224 |       <td><a href="https://pan.baidu.com/s/1w24vCfgqcoJvPxsB5VrRvw">Sparse</a></td>
225 |       <td><a href="https://pan.baidu.com/s/1b9BPiDRhiEZ-6ybTcovrqQ">Sparse</a></td>
226 |     </tr>
227 |     <tr  align="center">
228 |       <td>Zhihu_QA 知乎问答 </td>
229 |       <td><a href="https://pan.baidu.com/s/1VaUP3YJC0IZKTbJ-1_8HZg">Sparse</a></td>
230 |       <td><a href="https://pan.baidu.com/s/1g39PKwT0kSmpneKOgXR5YQ">Sparse</a></td>
231 |       <td><a href="https://pan.baidu.com/s/1d8Bsuak0fyXxQOVUiNr-2w">Sparse</a></td>
232 |       <td><a href="https://pan.baidu.com/s/1D5fteBX0Vy4czEqpxXjlrQ">Sparse</a></td>
233 |     </tr>
234 |     <tr  align="center">
235 |       <td>Weibo 微博</td>
236 |       <td><a href="https://pan.baidu.com/s/15O2EbToOzjNSkzJwAOk_Ug">Sparse</a></td>
237 |       <td><a href="https://pan.baidu.com/s/11Dqywn0hfMhysto7bZS1Dw">Sparse</a></td>
238 |       <td><a href="https://pan.baidu.com/s/1wY-7mfV6nwDj_tru6W9h4Q">Sparse</a></td>
239 |       <td><a href="https://pan.baidu.com/s/1DMW-MgLApbQnWwDd-pT_qw">Sparse</a></td>
240 |     </tr>
241 |     <tr  align="center">
242 |       <td>Literature 文学作品</td>
243 |       <td><a href="https://pan.baidu.com/s/1HTHhlr8zvzhTwed7dO0sDg">Sparse</a></td>
244 |       <td><a href="https://pan.baidu.com/s/1jAuGJBxKqgapt__urGsBOQ">Sparse</a></td>
245 |       <td><a href="https://pan.baidu.com/s/173AJfCoAV0ZA8Z31tKBdTA">Sparse</a></td>
246 |       <td><a href="https://pan.baidu.com/s/1dFCxke_Su3lLsuwZr7co3A">Sparse</a></td>
247 |     </tr>
248 |     <tr  align="center">
249 |       <td>Complete Library in Four Sections<br />四库全书<sup>*</sup></td>
250 |       <td><a href="https://pan.baidu.com/s/1NJ1Gc99oE0-GV0QxBqy-qw">Sparse</a></td>
251 |       <td><a href="https://pan.baidu.com/s/1YGEgyXIbw0O4NtoM1ohjdA">Sparse</a></td>
252 |       <td>NAN</td>
253 |       <td>NAN</td>
254 |     </tr>
255 |     </tr>
256 |     <tr  align="center">
257 |       <td>Mixed-large 综合</td>
258 |       <td>Sparse</td>
259 |       <td>Sparse</td>
260 |       <td>Sparse</td>
261 |       <td>Sparse</td>
262 |     </tr>
263 | </table>
264 | 
265 | <sup>\*</sup>Character embeddings are provided, since most of Hanzi are words in the archaic Chinese.
266 | 
267 | ### Various Co-occurrence Information
268 | 
269 | We release word vectors upon different co-occurrence statistics. Target and context vectors are often called input and output vectors in some related papers. 
270 | 
271 | In this part, one can obtain vectors of arbitrary linguistic units beyond word. For example, character vectors is in the context vectors of word-character.
272 | 
273 | All vectors are trained by SGNS on Baidu Encyclopedia.
274 | 
275 | <table>
276 |   <tr align="center">
277 |     <td><b>Feature</b></td>
278 |     <td><b>Co-occurrence Type</b></td>
279 |     <td><b>Target Word Vectors</b></td>
280 |     <td><b>Context Word Vectors</b></td>
281 |   </tr>
282 |   
283 |   <tr align="center">
284 |   	<td rowspan="1">Word</td>
285 |     <td>Word → Word</td>
286 |     <td><a href="https://pan.baidu.com/s/1Rn7LtTH0n7SHyHPfjRHbkg">300d</a></td>
287 |  	  <td><a href="https://pan.baidu.com/s/18T6DRVmS_cZu5u64EbbESQ">300d</a></td>
288 |   </tr>
289 | 
290 |   <tr align="center">
291 |     <td rowspan="3">Ngram</td>
292 |     <td>Word → Ngram (1-2)</td>
293 |     <td><a href="https://pan.baidu.com/s/1XEmP_0FkQwOjipCjI2OPEw">300d</a></td>
294 |  	  <td><a href="https://pan.baidu.com/s/12asujjAaaqxNFYRNP-MThw">300d</a></td>
295 |   </tr>
296 |   <tr align="center">
297 |     <td>Word → Ngram (1-3)</td>
298 |     <td><a href="https://pan.baidu.com/s/1oUmbxsnSuXf2jU8Jxu7U8A">300d</a></td>
299 |  	  <td><a href="https://pan.baidu.com/s/1ylg6FfFHa0kXbiVz8bIL8g">300d</a></td>
300 |   </tr>
301 |   <tr align="center">
302 |     <td>Ngram (1-2) → Ngram (1-2)</td>
303 |     <td><a href="https://pan.baidu.com/s/1Za7DIGVhE6dMsTmxHb-izg">300d</a></td>
304 |  	  <td><a href="https://pan.baidu.com/s/1oKI4Cs9eo7bg5mqfY1hdmg">300d</a></td>
305 |   </tr>
306 |   
307 |   <tr align="center">
308 |     <td rowspan="3">Character</td>
309 |     <td>Word → Character (1)</td>
310 |  	  <td><a href="https://pan.baidu.com/s/1c9yiosHKNIZwRlLzD_F1ig">300d</a></td>
311 |     <td><a href="https://pan.baidu.com/s/1KGZ_x8r-lq-AuElLCSVzvQ">300d</a></td>
312 |   </tr>
313 |   <tr align="center">
314 |     <td>Word → Character (1-2)</td>
315 |  	  <td><a href="https://pan.baidu.com/s/1eeCS7uD3e_qVN8rPwmXhAw">300d</a></td>
316 |     <td><a href="https://pan.baidu.com/s/1q0ItLzbn5Tfb3LhepRCeEA">300d</a></td>
317 |   </tr>
318 |   <tr align="center">
319 |     <td>Word → Character (1-4)</td>
320 |     <td><a href="https://pan.baidu.com/s/1WNWAnba56Rqjmx-FAN_7_g">300d</a></td>
321 |  	  <td><a href="https://pan.baidu.com/s/1hJKTAz6PwS7wmz9wQgmYeg">300d</a></td>
322 |   </tr>
323 |   
324 |   <tr align="center">
325 |   	<td rowspan="1">Radical</td>
326 |     <td>Radical</td>
327 |     <td>300d</td>
328 |  	  <td>300d</td>
329 |   </tr>
330 |   
331 |   <tr align="center">
332 |     <td rowspan="2">Position</td>
333 |     <td>Word → Word (left/right)</td>
334 |     <td><a href="https://pan.baidu.com/s/1JvjcrXFZPknT5H5Xw6KRVg">300d</a></td>
335 |  	  <td><a href="https://pan.baidu.com/s/1m6K9CnIIS8FrQZdDuF6hPQ">300d</a></td>
336 |   </tr>
337 |   <tr align="center">
338 |     <td>Word → Word (distance)</td>
339 |     <td><a href="https://pan.baidu.com/s/1c29BDu4R1hyUX-sgvlHJnA">300d</a></td>
340 |  	  <td><a href="https://pan.baidu.com/s/1sMZHIc-7eU6gRalHwtBHZw">300d</a></td>
341 |   </tr>
342 |   
343 |   <tr align="center">
344 |     <td>Global</td>
345 |     <td>Word → Text</td>
346 |     <td>300d</td>
347 |  	  <td>300d</td>
348 |   </tr>
349 |     
350 |   <tr align="center">
351 |     <td rowspan="2">Syntactic Feature</td>
352 |     <td>Word → POS</td>
353 |     <td>300d</td>
354 |  	  <td>300d</td>
355 |   </tr>
356 |   <tr align="center">
357 |     <td>Word → Dependency</td>
358 |     <td>300d</td>
359 |  	  <td>300d</td>
360 |   </tr>
361 | </table>
362 | 
363 | ## Representations
364 | Existing word representation methods fall into one of the two classes, **dense** and **sparse** represnetations. SGNS model (a model in word2vec toolkit) and PPMI model are respectively typical methods of these two classes. SGNS model trains low-dimensional real (dense) vectors through a shallow neural network. It is also called neural embedding method. PPMI model is a sparse bag-of-feature representation weighted by positive-pointwise-mutual-information (PPMI) weighting scheme.
365 | 
366 | ## Context Features
367 | Three context features: **word**, **ngram**, and **character** are commonly used in the word embedding literature. Most word representation methods essentially exploit word-word co-occurrence statistics, namely using word as context feature **(word feature)**. Inspired by language modeling problem, we introduce ngram feature into the context. Both word-word and word-ngram co-occurrence statistics are used for training **(ngram feature)**. For Chinese, characters (Hanzi) often convey strong semantics. To this end, we consider using word-word and word-character co-occurrence statistics for learning word vectors. The length of character-level ngrams ranges from 1 to 4 **(character feature)**.
368 | 
369 | Besides word, ngram, and character, there are other features which have substantial influence on properties of word vectors. For example, using entire text as context feature could introduce more topic information into word vectors; using dependency parse as context feature could add syntactic constraint to word vectors. 17 co-occurrence types are considered in this project.
370 | 
371 | ## Corpus
372 | We made great efforts to collect corpus across various domains. All text data are preprocessed by removing html and xml tags. Only the plain text are kept and [HanLP(v_1.5.3)](https://github.com/hankcs/HanLP) is used for word segmentation. In addition, traditional Chinese characters are converted into simplified characters with [Open Chinese Convert (OpenCC)](https://github.com/BYVoid/OpenCC). The detailed corpora information is listed as follows:
373 | 
374 | <table>
375 | 	<tr align="center">
376 | 		<td><b>Corpus</b></td>
377 | 		<td><b>Size</b></td>
378 | 		<td><b>Tokens</b></td>
379 | 		<td><b>Vocabulary Size</b></td>
380 | 		<td><b>Description</b></td>
381 | 	</tr>
382 | 	<tr align="center">
383 | 		<td>Baidu Encyclopedia<br />百度百科</td>
384 | 		<td>4.1G</td>
385 | 		<td>745M</td>
386 | 		<td>5422K</td>
387 | 		<td>Chinese Encyclopedia data from<br />https://baike.baidu.com/</td>
388 | 	</tr>
389 | 	<tr align="center">
390 | 		<td>Wikipedia_zh<br />中文维基百科</td>
391 | 		<td>1.3G</td>
392 | 		<td>223M</td>
393 | 		<td>2129K</td>
394 | 		<td>Chinese Wikipedia data from<br />https://dumps.wikimedia.org/</td>
395 | 	</tr>
396 | 	<tr align="center">
397 | 		<td>People's Daily News<br />人民日报</td>
398 | 		<td>3.9G</td>
399 | 		<td>668M</td>
400 | 		<td>1664K</td>
401 | 		<td>News data from People's Daily(1946-2017)<br />http://data.people.com.cn/</td>
402 | 	</tr>
403 | 	<tr align="center">
404 | 		<td>Sogou News<br />搜狗新闻</td>
405 | 		<td>3.7G</td>
406 | 		<td>649M</td>
407 | 		<td>1226K</td>
408 | 		<td>News data provided by Sogou labs<br />http://www.sogou.com/labs/</td>
409 | 	</tr>
410 |   <tr align="center">
411 |     <td>Financial News<br />金融新闻</td>
412 |     <td>6.2G</td>
413 |     <td>1055M</td>
414 |     <td>2785K</td>
415 |     <td>Financial news collected from multiple news websites</td>
416 |   </tr>
417 | 	<tr align="center">
418 | 		<td>Zhihu_QA<br />知乎问答</td>
419 | 		<td>2.1G</td>
420 | 		<td>384M</td>
421 | 		<td>1117K</td>
422 | 		<td>Chinese QA data from<br />https://www.zhihu.com/</td>
423 | 	</tr>
424 | 	<tr align="center">
425 | 		<td>Weibo<br />微博</td>
426 | 		<td>0.73G</td>
427 | 		<td>136M</td>
428 | 		<td>850K</td>
429 | 		<td>Chinese microblog data provided by NLPIR Lab<br />http://www.nlpir.org/wordpress/download/weibo.7z</td>
430 | 	</tr>
431 | 	<tr align="center">
432 | 		<td>Literature<br />文学作品</td>
433 | 		<td>0.93G</td>
434 | 		<td>177M</td>
435 | 		<td>702K</td>
436 | 		<td>8599 modern Chinese literature works</td>
437 | 	</tr>
438 | 	<tr align="center">
439 | 		<td>Mixed-large<br />综合</td>
440 | 		<td>22.6G</td>
441 |     <td>4037M</td>
442 |     <td>10653K</td>
443 | 		<td>We build the large corpus by merging the above corpora.</td>
444 | 	</tr>
445 |   <tr align="center">
446 |     <td>Complete Library in Four Sections<br />四库全书</td>
447 |     <td>1.5G</td>
448 |     <td>714M</td>
449 |     <td>21.8K</td>
450 |     <td>The largest collection of texts in pre-modern China.</td>
451 |   </tr>
452 | </table>
453 | 
454 | All words are concerned, including low frequency words.
455 | 
456 | ## Toolkits
457 | All word vectors are trained by [ngram2vec](https://github.com/zhezhaoa/ngram2vec/) toolkit. Ngram2vec toolkit is a superset of [word2vec](https://github.com/svn2github/word2vec) and [fasttext](https://github.com/facebookresearch/fastText) toolkit, where arbitrary context features and models are supported.
458 | 
459 | ## Chinese Word Analogy Benchmarks
460 | The quality of word vectors is often evaluated by analogy question tasks. In this project, two benchmarks are exploited for evaluation. The first is CA-translated, where most analogy questions are directly translated from English benchmark. Although CA-translated has been widely used in many Chinese word embedding papers, it only contains questions of three semantic questions and covers 134 Chinese words. In contrast, CA8 is specifically designed for Chinese language. It contains 17813 analogy questions and covers comprehensive morphological and semantic relations. The CA-translated, CA8, and their detailed descriptions are provided in [**testsets**](https://github.com/Embedding/Chinese-Word-Vectors/tree/master/testsets) folder.
461 | 
462 | ## Evaluation Toolkit
463 | We present an evaluation toolkit in [**evaluation**](https://github.com/Embedding/Chinese-Word-Vectors/tree/master/evaluation) folder. 
464 | 
465 | Run the following codes to evaluate dense vectors.
466 | ```
467 | $ python ana_eval_dense.py -v <vector.txt> -a CA8/morphological.txt
468 | $ python ana_eval_dense.py -v <vector.txt> -a CA8/semantic.txt
469 | ```
470 | Run the following codes to evaluate sparse vectors.
471 | ```
472 | $ python ana_eval_sparse.py -v <vector.txt> -a CA8/morphological.txt
473 | $ python ana_eval_sparse.py -v <vector.txt> -a CA8/semantic.txt
474 | ```
475 | 


--------------------------------------------------------------------------------