├── hanzifreq ├── input │ └── .gitignore ├── split.sh ├── template │ └── template.html ├── config.py ├── calculate_freq.py └── combine_freq.py ├── .gitignore ├── LICENSE.txt └── README.md /hanzifreq/input/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | 6 | -------------------------------------------------------------------------------- /hanzifreq/split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo >&2 "Usage: $0 path/to/large.file" 5 | echo >&2 "This script takes a large file and splits it" 6 | echo >&2 "into smaller files in the directory 'input/'" 7 | exit 1 8 | fi 9 | 10 | type split >/dev/null 2>&1 || { 11 | echo >&2 "ERROR: Please install 'split'." 12 | exit 1 13 | } 14 | 15 | split --lines=500000 "$1" "input/part_" 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Input files 2 | *.xml 3 | *.freq 4 | part_* 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Christian Zielinski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /hanzifreq/template/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Chinese Character Frequencies 6 | 40 | 41 | 42 |

Chinese Character Frequencies

43 |

44 | Created using https://github.com/czielinski/hanzifreq. 45 |

46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | {table} 59 | 60 | 61 |
RankHanziFreq. [%]Cum. Freq. [%]
62 | 63 | 64 | -------------------------------------------------------------------------------- /hanzifreq/config.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2015 Christian Zielinski 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULtAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import sys 24 | import re 25 | 26 | # Output file 27 | output_file = "output/frequencies.html" 28 | 29 | # File ending for frequency information 30 | freq_file = ".freq" 31 | 32 | # Number of characters to output 33 | num_output_chars = 10000 34 | 35 | # HTML template file 36 | template_file = "template/template.html" 37 | 38 | # Definition of Chinese characters 39 | hanzi = '\u3007\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF' 40 | 41 | if sys.maxunicode > 0xFFFF: 42 | hanzi += '\U00020000-\U0002A6DF\U0002A700-\U0002B73F' 43 | hanzi += '\U0002B740-\U0002B81F\U0002F800-\U0002FA1F' 44 | 45 | re_hanzi = re.compile('[{}]'.format(hanzi)) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chinese Character Frequency Counter 2 | 3 | These scripts allow the analysis of character frequencies in Chinese text corpora. This might be helpful for Chinese language learners to prioritize common characters when learning how to write. 4 | 5 | ## Usage 6 | 7 | The scripts can take _any form and number of non-binary text input files_ (such as `txt`, `HTML`, `XML`, ...) encoded in `UTF-8`. All non-Chinese characters will be automatically removed (such as `HTML` tags) and the analysis will be done using the remaining characters. One example for a large text corpus is the [Chinese Wikipedia](https://zh.wikipedia.org/), but the scripts can also process other kinds of text corpora. 8 | 9 | All input files should be placed in a common directory such as `hanzifreq/input/`. It is recommended (although not necessary) to split up larger files (i.e. above 100 MB) with the `split.sh` utility, which can be called via `./split.sh path/to/large.file`. The resulting smaller files will be automatically placed in the `hanzifreq/input/` directory. The `split.sh` script needs the [`split` utility](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/split.html) installed, which should be pre-installed on most `Unix` systems. 10 | 11 | Then run `./calculate_freq.py input/` to analyze all files in the input directory. The input files will be processed in parallel on multicore architectures. For each input file `input.file` the script generates a file `input.file.freq` with frequency information of Chinese characters. 12 | 13 | Finally run `./combine_freq.py input/` to combine all frequency information into one summary table. You can find the resulting table of the most common Chinese characters of your text corpus in the file `output/frequencies.html`. The `HTML` template file for that table is `template/template.html` and can be modified. 14 | 15 | You can also change some settings by editing the `config.py` file. 16 | 17 | ### Chinese Wikipedia 18 | 19 | One large language corpus is the Chinese Wikipedia, which you can download from: 20 | 21 | * https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2 22 | 23 | After downloading and unpacking, run `./split.sh zhwiki-latest-pages-articles.xml` to create smaller input files. Due to its encyclopedic nature, the character frequencies in Wikipedia vary from other sources such as novels or classical poetry. For example characters such as `年` (year), `月` (month) and `日` (day) occur more frequent than in many other text corpora. 24 | 25 | ### Precomputed frequencies 26 | 27 | Go to http://git.io/hanzifreq to see the calculated character frequencies for the Chinese Wikipedia corpus. 28 | 29 | -------------------------------------------------------------------------------- /hanzifreq/calculate_freq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # The MIT License (MIT) 4 | # 5 | # Copyright (c) 2015 Christian Zielinski 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULtAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import multiprocessing 26 | import collections 27 | import pickle 28 | import sys 29 | import os 30 | import re 31 | 32 | import config 33 | 34 | 35 | def count_characters(input_file): 36 | """ 37 | Calculates Chinese character frequencies in a given file. 38 | """ 39 | if not os.path.exists(input_file): 40 | print("ERROR: File <{}> does not exist".format(input_file)) 41 | return 0 42 | 43 | output_file = input_file + config.freq_file 44 | 45 | if os.path.exists(output_file): 46 | print("Skipping <{}> ...".format(input_file)) 47 | return 0 48 | 49 | print("Processing file <{}> ...".format(input_file)) 50 | with open(input_file, 'r', encoding='utf-8', errors='ignore') as f: 51 | content = f.read() 52 | 53 | # Count hanzi 54 | chars = ''.join(re.findall(config.re_hanzi, content)) 55 | chars_count = collections.Counter(chars) 56 | chars_total = sum(chars_count.values()) 57 | 58 | with open(output_file, 'wb') as f: 59 | pickle.dump(chars_count, f, pickle.HIGHEST_PROTOCOL) 60 | 61 | print("Processed {} characters ...".format(chars_total)) 62 | return chars_total 63 | 64 | 65 | def main(): 66 | if len(sys.argv) != 2: 67 | print("Usage:\n> python3 {} input_directory/".format(sys.argv[0])) 68 | return 1 69 | 70 | input_dir = sys.argv[1] 71 | if not os.path.isdir(input_dir): 72 | print("ERROR: <{}> is not a valid input directory".format(input_dir)) 73 | return 2 74 | 75 | input_files = os.listdir(input_dir) 76 | input_files = [f for f in input_files if not f.startswith('.')] 77 | input_files = [f for f in input_files if not f.endswith(config.freq_file)] 78 | input_files = [os.path.join(input_dir, f) for f in input_files] 79 | 80 | pool = multiprocessing.Pool() 81 | pool.map(count_characters, input_files) 82 | 83 | print("Done.") 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /hanzifreq/combine_freq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # The MIT License (MIT) 4 | # 5 | # Copyright (c) 2015 Christian Zielinski 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULtAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import collections 26 | import pickle 27 | import sys 28 | import os 29 | 30 | import config 31 | 32 | 33 | def export_frequencies(chars_count, output_file): 34 | """ 35 | Writes a frequency distribution to a HTML file. 36 | """ 37 | with open(config.template_file, 'r', encoding='utf-8', errors='ignore') as f: 38 | template = f.read() 39 | 40 | chars_total = sum(chars_count.values()) 41 | cum_freq = 0.0 42 | 43 | table = "" 44 | full_freq_data = enumerate(chars_count.most_common(config.num_output_chars), start=1) 45 | for rank, char_data in full_freq_data: 46 | char, count = char_data 47 | 48 | freq = 100.0 * count / chars_total 49 | cum_freq += freq 50 | 51 | data = (rank, char, freq, cum_freq) 52 | table += "{}{}{:.5f}{:.5f}\n".format(*data) 53 | 54 | content = template.format(table=table) 55 | 56 | with open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 57 | f.write(content) 58 | 59 | 60 | def main(): 61 | if len(sys.argv) != 2: 62 | print("Usage:\n> python3 {} input_directory/".format(sys.argv[0])) 63 | return 1 64 | 65 | input_dir = sys.argv[1] 66 | if not os.path.isdir(input_dir): 67 | print("ERROR: <{}> is not a valid input directory".format(input_dir)) 68 | return 2 69 | 70 | input_files = os.listdir(input_dir) 71 | input_files = [f for f in input_files if f.endswith(config.freq_file)] 72 | input_files = [os.path.join(input_dir, f) for f in input_files] 73 | 74 | chars_count = collections.Counter() 75 | num_files = len(input_files) 76 | 77 | for i in range(num_files): 78 | input_file = input_files[i] 79 | print("{}/{}: Processing file <{}> ...".format(i+1, num_files, input_file)) 80 | 81 | with open(input_file, 'rb') as f: 82 | new_count = pickle.load(f, encoding='utf-8', errors='ignore') 83 | 84 | chars_count.update(new_count) 85 | 86 | export_frequencies(chars_count, config.output_file) 87 | print("Done.") 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | --------------------------------------------------------------------------------