├── .gitignore ├── README.md ├── coca20000.txt ├── coca_refined.txt └── split.py /.gitignore: -------------------------------------------------------------------------------- 1 | *un~ 2 | .DS_Store 3 | coca20000_batch_import.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | This script can split [COCA](https://www.wordfrequency.info/) vocabulary into small groups to be imported into a 3 | dictionary app (e.g. Eudic) for studying. 4 | 5 | Please refer to [COCA 词频表使用](https://zhuanlan.zhihu.com/p/53261968) and [快速掌握 COCA 词汇表](https://zhuanlan.zhihu.com/p/56823867). 6 | 7 | ### Requirements 8 | - **Python 3** 9 | 10 | Please make sure you have **Python 3** installed in your environment. For me, it's `/usr/bin/python3`. 11 | 12 | ### Usage 13 | 14 | ``` 15 | python split.py coca20000.txt 15 16 | ``` 17 | By default, the **output file** is `coca20000_batch_import.txt`. 18 | 19 | ##### Note: 20 | The last number 15 is the group size, which means each group contains 15 words; you can change it to fit your need. 21 | 22 | ### Files 23 | 24 | - **`coca20000.txt`** contains the original vocabulary list 25 | - **`coca_refined.txt`** contains the final refined vocabulary list according to this article [快速掌握 COCA 词汇表](https://zhuanlan.zhihu.com/p/56823867) 26 | -------------------------------------------------------------------------------- /split.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #!
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# NOTE: the shebang must be the very first line of the file to take effect;
# the original had the coding cookie first, which made the shebang inert.
"""Split a COCA vocabulary list into fixed-size groups.

Usage:
    python split.py coca20000.txt 15

The optional second argument is the group size (defaults to ``split_size``).
"""

import sys
import os
import timeit

# default settings
connector = "_"        # joins start/end indices in group titles and filenames
extension = ".txt"     # extension for every generated file
split_size = 200       # default number of words per group
maxline = 60000        # safety cap on list length (currently unused)

original_file = "coca60000.txt"  # fallback input file name (currently unused)
output_path = "output"           # directory used by splitwords()


def remove_duplicates(input_file):
    """Drop duplicate words while keeping first-seen order.

    Writes the result to ``<input stem>_removed_duplicates.txt``.
    """
    with open(input_file, "r") as fin:
        all_words = [line.strip() for line in fin]

    # dict.fromkeys preserves insertion order (Python 3.7+), so the first
    # occurrence of each word wins -- same semantics as the manual seen-dict.
    unique_words = dict.fromkeys(all_words)

    outfile = os.path.splitext(input_file)[0] + "_removed_duplicates" + extension
    with open(outfile, "w") as fout:
        for word in unique_words:
            fout.write(word)
            fout.write("\n")


def strip_spaces(input_file):
    """Strip surrounding whitespace and parentheses from each word.

    Writes the result to ``<input stem>_removed_spaces.txt``.
    """
    outfile = os.path.splitext(input_file)[0] + "_removed_spaces" + extension

    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            # strip() first removes whitespace/newline, then the parentheses;
            # the original stripped only "()\n" and left spaces in place,
            # contradicting the function's stated purpose.
            fout.write(line.strip().strip("()"))
            fout.write("\n")


def batch_import(input_file, num_of_words):
    """Generate one file with '#start_end' headers for Eudic batch import.

    Writes the result to ``<input stem>_batch_import.txt``. The group size
    may be overridden by ``sys.argv[2]``.
    """
    if len(sys.argv) < 3:
        # report the default actually in use, not the module-level constant
        print("split size is not specified, use default:%d" % num_of_words)
    else:
        num_of_words = int(sys.argv[2])

    outfile = os.path.splitext(input_file)[0] + "_batch_import" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        base, cursor = 0, 0
        for line in fin:
            if base == 0:
                # header for the very first group (use connector, not a
                # hard-coded "_", so the setting is honored everywhere)
                fout.write("#1" + connector + str(num_of_words))
                fout.write("\n")
                base = num_of_words

            if cursor > num_of_words - 1:
                # previous group is full: emit the header for the next one
                fout.write("#" + str(base + 1) + connector + str(base + cursor))
                fout.write("\n")
                base = base + cursor
                cursor = 0

            fout.write(line.strip())
            fout.write("\n")
            cursor += 1


def strip_meanings(input_file):
    """Strip Chinese meanings, keeping only the leading English word per line.

    Writes the result to ``<input stem>_no_meaning.txt`` (no trailing newline).
    """
    outfile = os.path.splitext(input_file)[0] + "_no_meaning" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        words = [line.strip().split(" ")[0] for line in fin]
        # Drop trailing empty entries explicitly. The original used
        # fout.seek(-2, os.SEEK_CUR) + truncate(), which raises
        # io.UnsupportedOperation on text-mode files in Python 3 (only
        # tell()-relative seeks are allowed) and would otherwise also chop
        # the last character of the final word.
        while words and not words[-1]:
            words.pop()
        fout.write("\n".join(words))


def _write_group(outfile, wordlist):
    """Write one group of words, one per line, with no trailing newline."""
    with open(outfile, "w") as fout:
        fout.write("\n".join(wordlist))


def splitwords(input_file, num_of_words):
    """Split the word list into files of ``num_of_words`` each.

    Each group goes to ``output_path/<start>_<end>.txt``. The group size may
    be overridden by ``sys.argv[2]``.
    """
    if len(sys.argv) < 3:
        print("split size is not specified, use default:%d" % num_of_words)
    else:
        num_of_words = int(sys.argv[2])

    wordlist = []
    with open(input_file, "r") as fin:
        base, cursor = 0, 0
        for line in fin:
            if cursor > num_of_words - 1:
                # 1. assemble filename, 2. flush the finished group,
                # 3. advance base and reset the accumulators
                outfile = os.path.join(
                    output_path,
                    str(base + 1) + connector + str(base + cursor) + extension)
                _write_group(outfile, wordlist)
                base = base + cursor
                cursor = 0
                wordlist = []

            # collect the word and advance the in-group cursor
            wordlist.append(line.strip())
            cursor += 1

    if cursor > 0:
        # flush the final (possibly partial) group
        outfile = os.path.join(
            output_path,
            str(base + 1) + connector + str(base + cursor) + extension)
        _write_group(outfile, wordlist)


###########################################################################

# usage:
# ./split.py filename 200

def main(argv):
    """Entry point: validate arguments, then run the selected transformation."""
    if len(argv) < 2:
        print("must specify filename")
        sys.exit(1)
    input_file = argv[1]

    # create output directory if it does not exist
    os.makedirs(output_path, exist_ok=True)

    start = timeit.default_timer()
    # splitwords(input_file, split_size)
    batch_import(input_file, split_size)
    # remove_duplicates(input_file)
    # strip_spaces(input_file)
    # strip_meanings(input_file)

    stop = timeit.default_timer()
    print('Time elapsed: {}'.format(stop - start))


if __name__ == "__main__":
    main(sys.argv)