├── .gitignore ├── README.md ├── coca20000.txt ├── coca_refined.txt └── split.py /.gitignore: -------------------------------------------------------------------------------- 1 | *un~ 2 | .DS_Store 3 | coca20000_batch_import.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | This script can split [COCA](https://www.wordfrequency.info/) vocabulary into small groups to be imported into a 3 | dictionary app (e.g. Eudic) for studying. 4 | 5 | Please refer to [COCA 词频表使用](https://zhuanlan.zhihu.com/p/53261968) and [快速掌握 COCA 词汇表](https://zhuanlan.zhihu.com/p/56823867). 6 | 7 | ### Requirements 8 | - **Python 3** 9 | 10 | Please make sure you have **Python 3** installed in your environment. For me, it's `/usr/bin/python3`. 11 | 12 | ### Usage 13 | 14 | ``` 15 | python split.py coca20000.txt 15 16 | ``` 17 | By default, the **output file** is `coca20000_batch_import.txt`. 18 | 19 | ##### Note: 20 | The last number 15 is the group size, which means each group contains 15 words; you can change it to fit your need. 21 | 22 | ### Files 23 | 24 | - **`coca20000.txt`** contains the original vocabulary list 25 | - **`coca_refined.txt`** contains the final refined vocabulary list according to this article [快速掌握 COCA 词汇表](https://zhuanlan.zhihu.com/p/56823867) 26 | -------------------------------------------------------------------------------- /split.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #!
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# NOTE: the shebang must be the very first line of the file to take effect;
# the original had the coding cookie first, which made the shebang inert.
"""Split a COCA vocabulary list into fixed-size groups.

Usage:
    python split.py coca20000.txt 15

The optional second argument is the group size (defaults to ``split_size``).
"""

import sys
import os
import timeit

# default settings
connector = "_"        # joins start/end indices in group titles and filenames
extension = ".txt"     # extension for every generated file
split_size = 200       # default number of words per group
maxline = 60000        # safety cap on list length (currently unused)

original_file = "coca60000.txt"  # fallback input file name (currently unused)
output_path = "output"           # directory used by splitwords()


def remove_duplicates(input_file):
    """Drop duplicate words while keeping first-seen order.

    Writes the result to ``<input stem>_removed_duplicates.txt``.
    """
    with open(input_file, "r") as fin:
        all_words = [line.strip() for line in fin]

    # dict.fromkeys preserves insertion order (Python 3.7+), so the first
    # occurrence of each word wins -- same semantics as the manual seen-dict.
    unique_words = dict.fromkeys(all_words)

    outfile = os.path.splitext(input_file)[0] + "_removed_duplicates" + extension
    with open(outfile, "w") as fout:
        for word in unique_words:
            fout.write(word)
            fout.write("\n")


def strip_spaces(input_file):
    """Strip surrounding whitespace and parentheses from each word.

    Writes the result to ``<input stem>_removed_spaces.txt``.
    """
    outfile = os.path.splitext(input_file)[0] + "_removed_spaces" + extension

    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        for line in fin:
            # strip() first removes whitespace/newline, then the parentheses;
            # the original stripped only "()\n" and left spaces in place,
            # contradicting the function's stated purpose.
            fout.write(line.strip().strip("()"))
            fout.write("\n")


def batch_import(input_file, num_of_words):
    """Generate one file with '#start_end' headers for Eudic batch import.

    Writes the result to ``<input stem>_batch_import.txt``. The group size
    may be overridden by ``sys.argv[2]``.
    """
    if len(sys.argv) < 3:
        # report the default actually in use, not the module-level constant
        print("split size is not specified, use default:%d" % num_of_words)
    else:
        num_of_words = int(sys.argv[2])

    outfile = os.path.splitext(input_file)[0] + "_batch_import" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        base, cursor = 0, 0
        for line in fin:
            if base == 0:
                # header for the very first group (use connector, not a
                # hard-coded "_", so the setting is honored everywhere)
                fout.write("#1" + connector + str(num_of_words))
                fout.write("\n")
                base = num_of_words

            if cursor > num_of_words - 1:
                # previous group is full: emit the header for the next one
                fout.write("#" + str(base + 1) + connector + str(base + cursor))
                fout.write("\n")
                base = base + cursor
                cursor = 0

            fout.write(line.strip())
            fout.write("\n")
            cursor += 1


def strip_meanings(input_file):
    """Strip Chinese meanings, keeping only the leading English word per line.

    Writes the result to ``<input stem>_no_meaning.txt`` (no trailing newline).
    """
    outfile = os.path.splitext(input_file)[0] + "_no_meaning" + extension
    with open(input_file, "r") as fin, open(outfile, "w") as fout:
        words = [line.strip().split(" ")[0] for line in fin]
        # Drop trailing empty entries explicitly. The original used
        # fout.seek(-2, os.SEEK_CUR) + truncate(), which raises
        # io.UnsupportedOperation on text-mode files in Python 3 (only
        # tell()-relative seeks are allowed) and would otherwise also chop
        # the last character of the final word.
        while words and not words[-1]:
            words.pop()
        fout.write("\n".join(words))


def _write_group(outfile, wordlist):
    """Write one group of words, one per line, with no trailing newline."""
    with open(outfile, "w") as fout:
        fout.write("\n".join(wordlist))


def splitwords(input_file, num_of_words):
    """Split the word list into files of ``num_of_words`` each.

    Each group goes to ``output_path/<start>_<end>.txt``. The group size may
    be overridden by ``sys.argv[2]``.
    """
    if len(sys.argv) < 3:
        print("split size is not specified, use default:%d" % num_of_words)
    else:
        num_of_words = int(sys.argv[2])

    wordlist = []
    with open(input_file, "r") as fin:
        base, cursor = 0, 0
        for line in fin:
            if cursor > num_of_words - 1:
                # 1. assemble filename, 2. flush the finished group,
                # 3. advance base and reset the accumulators
                outfile = os.path.join(
                    output_path,
                    str(base + 1) + connector + str(base + cursor) + extension)
                _write_group(outfile, wordlist)
                base = base + cursor
                cursor = 0
                wordlist = []

            # collect the word and advance the in-group cursor
            wordlist.append(line.strip())
            cursor += 1

    if cursor > 0:
        # flush the final (possibly partial) group
        outfile = os.path.join(
            output_path,
            str(base + 1) + connector + str(base + cursor) + extension)
        _write_group(outfile, wordlist)


###########################################################################

# usage:
# ./split.py filename 200

def main(argv):
    """Entry point: validate arguments, then run the selected transformation."""
    if len(argv) < 2:
        print("must specify filename")
        sys.exit(1)
    input_file = argv[1]

    # create output directory if it does not exist
    os.makedirs(output_path, exist_ok=True)

    start = timeit.default_timer()
    # splitwords(input_file, split_size)
    batch_import(input_file, split_size)
    # remove_duplicates(input_file)
    # strip_spaces(input_file)
    # strip_meanings(input_file)

    stop = timeit.default_timer()
    print('Time elapsed: {}'.format(stop - start))


if __name__ == "__main__":
    main(sys.argv)