├── hanzifreq
├── input
│ └── .gitignore
├── split.sh
├── template
│ └── template.html
├── config.py
├── calculate_freq.py
└── combine_freq.py
├── .gitignore
├── LICENSE.txt
└── README.md
/hanzifreq/input/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 |
6 |
--------------------------------------------------------------------------------
/hanzifreq/split.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Splits a large text file into chunks of 500000 lines each. The chunks are
# written as 'part_*' files into the 'input/' directory located next to this
# script, so the script can be called from any working directory.
#
# Usage: ./split.sh path/to/large.file

if [ "$#" -ne 1 ]; then
    echo >&2 "Usage: $0 path/to/large.file"
    echo >&2 "This script takes a large file and splits it"
    echo >&2 "into smaller files in the directory 'input/'"
    exit 1
fi

# Fail early with a clear message instead of an error from 'split' itself.
if [ ! -r "$1" ]; then
    echo >&2 "ERROR: Cannot read input file '$1'."
    exit 1
fi

type split >/dev/null 2>&1 || {
    echo >&2 "ERROR: Please install 'split'."
    exit 1
}

# Resolve the output directory relative to the script, not the caller's
# current working directory.
script_dir="$(cd "$(dirname "$0")" && pwd)" || exit 1
output_dir="$script_dir/input"
mkdir -p "$output_dir" || exit 1

# '-l' is the POSIX spelling of the line-count option; the long form
# '--lines=' is only understood by GNU split.
split -l 500000 "$1" "$output_dir/part_"
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Input files
2 | *.xml
3 | *.freq
4 | part_*
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 |
58 | # Sphinx documentation
59 | docs/_build/
60 |
61 | # PyBuilder
62 | target/
63 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Christian Zielinski
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/hanzifreq/template/template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Chinese Character Frequencies
6 |
40 |
41 |
42 | Chinese Character Frequencies
43 |
44 | Created using https://github.com/czielinski/hanzifreq.
45 |
46 |
47 |
48 |
49 |
50 | | Rank |
51 | Hanzi |
52 | Freq. [%] |
53 | Cum. Freq. [%] |
54 |
55 |
56 |
57 |
58 | {table}
59 |
60 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/hanzifreq/config.py:
--------------------------------------------------------------------------------
1 | # The MIT License (MIT)
2 | #
3 | # Copyright (c) 2015 Christian Zielinski
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | import sys
24 | import re
25 |
# Path of the HTML file the combined frequency table is written to
output_file = "output/frequencies.html"

# Suffix appended to an input file name to form its frequency file name
freq_file = ".freq"

# Maximum number of characters listed in the output table
num_output_chars = 10000

# Path of the HTML template the table is inserted into
template_file = "template/template.html"

# Body of the regex character class that defines a Chinese character
hanzi = (
    '\u3007'         # ideographic number zero
    '\u4E00-\u9FFF'  # CJK Unified Ideographs
    '\u3400-\u4DBF'  # CJK Unified Ideographs Extension A
    '\uF900-\uFAFF'  # CJK Compatibility Ideographs
)

# Code points above the BMP are only added when the interpreter supports them
if sys.maxunicode > 0xFFFF:
    hanzi += (
        '\U00020000-\U0002A6DF'  # CJK Unified Ideographs Extension B
        '\U0002A700-\U0002B73F'  # CJK Unified Ideographs Extension C
        '\U0002B740-\U0002B81F'  # CJK Unified Ideographs Extension D
        '\U0002F800-\U0002FA1F'  # CJK Compatibility Ideographs Supplement
    )

# Compiled pattern matching exactly one Chinese character
re_hanzi = re.compile('[%s]' % hanzi)
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Chinese Character Frequency Counter
2 |
3 | These scripts allow the analysis of character frequencies in Chinese text corpora. This might be helpful for Chinese language learners to prioritize common characters when learning how to write.
4 |
5 | ## Usage
6 |
7 | The scripts can take _any form and number of non-binary text input files_ (such as `txt`, `HTML`, `XML`, ...) encoded in `UTF-8`. All non-Chinese characters will be automatically removed (such as `HTML` tags) and the analysis will be done using the remaining characters. One example for a large text corpus is the [Chinese Wikipedia](https://zh.wikipedia.org/), but the scripts can also process other kinds of text corpora.
8 |
9 | All input files should be placed in a common directory such as `hanzifreq/input/`. It is recommended (although not necessary) to split up larger files (i.e. above 100 MB) with the `split.sh` utility, which can be called via `./split.sh path/to/large.file`. The resulting smaller files will be automatically placed in the `hanzifreq/input/` directory. The `split.sh` script needs the [`split` utility](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/split.html) installed, which should be pre-installed on most `Unix` systems.
10 |
11 | Then run `./calculate_freq.py input/` to analyze all files in the input directory. The input files will be processed in parallel on multicore architectures. For each input file `input.file` the script generates a file `input.file.freq` with frequency information of Chinese characters.
12 |
13 | Finally run `./combine_freq.py input/` to combine all frequency information into one summary table. You can find the resulting table of the most common Chinese characters of your text corpus in the file `output/frequencies.html`. The `HTML` template file for that table is `template/template.html` and can be modified.
14 |
15 | You can also change some settings by editing the `config.py` file.
16 |
17 | ### Chinese Wikipedia
18 |
19 | One large language corpus is the Chinese Wikipedia, which you can download from:
20 |
21 | * https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
22 |
23 | After downloading and unpacking, run `./split.sh zhwiki-latest-pages-articles.xml` to create smaller input files. Due to its encyclopedic nature, the character frequencies in Wikipedia vary from other sources such as novels or classical poetry. For example characters such as `年` (year), `月` (month) and `日` (day) occur more frequently than in many other text corpora.
24 |
25 | ### Precomputed frequencies
26 |
27 | Go to http://git.io/hanzifreq to see the calculated character frequencies for the Chinese Wikipedia corpus.
28 |
29 |
--------------------------------------------------------------------------------
/hanzifreq/calculate_freq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | # The MIT License (MIT)
4 | #
5 | # Copyright (c) 2015 Christian Zielinski
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 |
25 | import multiprocessing
26 | import collections
27 | import pickle
28 | import sys
29 | import os
30 | import re
31 |
32 | import config
33 |
34 |
def count_characters(input_file):
    """
    Calculates Chinese character frequencies in a given file.

    Every match of config.re_hanzi in input_file is counted and the resulting
    collections.Counter is pickled to input_file + config.freq_file.

    Returns the total number of counted characters. Returns 0 without doing
    any work if input_file does not exist or if its frequency file already
    exists.
    """
    if not os.path.exists(input_file):
        print("ERROR: File <{}> does not exist".format(input_file))
        return 0

    output_file = input_file + config.freq_file

    if os.path.exists(output_file):
        print("Skipping <{}> ...".format(input_file))
        return 0

    print("Processing file <{}> ...".format(input_file))

    # Count hanzi line by line so that neither the whole file nor a joined
    # copy of all its matches has to be kept in memory at once.
    chars_count = collections.Counter()
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            chars_count.update(config.re_hanzi.findall(line))

    chars_total = sum(chars_count.values())

    with open(output_file, 'wb') as f:
        pickle.dump(chars_count, f, pickle.HIGHEST_PROTOCOL)

    print("Processed {} characters ...".format(chars_total))
    return chars_total
63 |
64 |
def main():
    """
    Counts Chinese characters in all files of the directory given as the
    single command line argument, processing the files in parallel.

    Hidden files and existing frequency files are not treated as input.

    Returns 0 on success, 1 on wrong usage and 2 if the argument is not a
    directory.
    """
    if len(sys.argv) != 2:
        print("Usage:\n> python3 {} input_directory/".format(sys.argv[0]))
        return 1

    input_dir = sys.argv[1]
    if not os.path.isdir(input_dir):
        print("ERROR: <{}> is not a valid input directory".format(input_dir))
        return 2

    # Sorted for a reproducible processing order
    input_files = [
        os.path.join(input_dir, name)
        for name in sorted(os.listdir(input_dir))
        if not name.startswith('.') and not name.endswith(config.freq_file)
    ]
    # Sub-directories cannot be opened as text files and would abort the run
    input_files = [path for path in input_files if os.path.isfile(path)]

    # The context manager releases the worker processes once map() is done
    with multiprocessing.Pool() as pool:
        pool.map(count_characters, input_files)

    print("Done.")
    return 0
84 |
85 |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status so that usage and
    # directory errors (1 and 2) are visible to the calling shell. A return
    # value of None counts as success.
    sys.exit(main())
88 |
--------------------------------------------------------------------------------
/hanzifreq/combine_freq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | # The MIT License (MIT)
4 | #
5 | # Copyright (c) 2015 Christian Zielinski
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 |
25 | import collections
26 | import pickle
27 | import sys
28 | import os
29 |
30 | import config
31 |
32 |
def export_frequencies(chars_count, output_file):
    """
    Writes a frequency distribution to a HTML file.

    chars_count -- collections.Counter mapping characters to their counts
    output_file -- path of the HTML file to write; a missing parent
                   directory is created

    The config.num_output_chars most common characters are rendered as table
    rows (rank, character, frequency in percent, cumulative frequency in
    percent) and inserted into the {table} placeholder of the template read
    from config.template_file.
    """
    with open(config.template_file, 'r', encoding='utf-8', errors='ignore') as f:
        template = f.read()

    chars_total = sum(chars_count.values())
    cum_freq = 0.0

    # One HTML table row per character
    row_format = "<tr><td>{}</td><td>{}</td><td>{:.5f}</td><td>{:.5f}</td></tr>\n"

    rows = []
    most_common = chars_count.most_common(config.num_output_chars)
    for rank, (char, count) in enumerate(most_common, start=1):
        freq = 100.0 * count / chars_total
        cum_freq += freq
        rows.append(row_format.format(rank, char, freq, cum_freq))

    content = template.format(table="".join(rows))

    # The output directory is not necessarily present, so create it on demand
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(content)
58 |
59 |
def main():
    """
    Combines all frequency files found in the directory given as the single
    command line argument and exports the summed frequencies to
    config.output_file.

    Returns 0 on success, 1 on wrong usage, 2 if the argument is not a
    directory and 3 if the directory contains no frequency files.
    """
    if len(sys.argv) != 2:
        print("Usage:\n> python3 {} input_directory/".format(sys.argv[0]))
        return 1

    input_dir = sys.argv[1]
    if not os.path.isdir(input_dir):
        print("ERROR: <{}> is not a valid input directory".format(input_dir))
        return 2

    # Sorted so that the order in which counts are merged, and with it the
    # order of equally frequent characters in the table, is reproducible.
    input_files = [
        os.path.join(input_dir, name)
        for name in sorted(os.listdir(input_dir))
        if name.endswith(config.freq_file)
    ]

    num_files = len(input_files)
    if num_files == 0:
        print("ERROR: No <{}> files found in <{}>".format(config.freq_file, input_dir))
        return 3

    chars_count = collections.Counter()

    for index, input_file in enumerate(input_files, start=1):
        print("{}/{}: Processing file <{}> ...".format(index, num_files, input_file))

        # NOTE: pickle.load can execute arbitrary code. Only combine
        # frequency files that were produced by a trusted run.
        with open(input_file, 'rb') as f:
            new_count = pickle.load(f, encoding='utf-8', errors='ignore')

        chars_count.update(new_count)

    export_frequencies(chars_count, config.output_file)
    print("Done.")
    return 0
88 |
89 |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status so that usage and
    # directory errors (1 and 2) are visible to the calling shell. A return
    # value of None counts as success.
    sys.exit(main())
92 |
--------------------------------------------------------------------------------