├── core ├── __init__.py └── shared.py ├── requirements.txt ├── .gitignore ├── vocab_adder.py ├── comprehension.py ├── analyzer.py └── README.md /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pdfminer.six==20201018 2 | LAC==2.1.1 3 | Unidecode==1.2.0 4 | tabulate==0.8.9 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /core/__pycache__ 2 | .DS_STORE 3 | /books 4 | /data/*.txt 5 | output.txt 6 | mega_book.py 7 | -------------------------------------------------------------------------------- /vocab_adder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import core.shared as shared 3 | from collections import Counter 4 | from LAC import LAC 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Add words from a file to your known vocab." 8 | ) 9 | parser.add_argument("-t", "--target", required=True, help="Path to .txt or .pdf file of target text to add") 10 | parser.add_argument("-k", "--known", required=True, help="Path to .txt file where vocabulary from the text will be dumped.") 11 | parser.add_argument("-m", "--mode", default="smart", help="smart/simple (smart default) - whether to split text char-by-char (simple) or word-by-word (smart)") 12 | args = parser.parse_args() 13 | 14 | print("Initializing parser...", end="\r") 15 | lac = LAC(mode='seg') 16 | print("Initializing parser... done\n") 17 | 18 | def add_vocab(targetfile: str, knownfile: str, mode: str): 19 | target_text = shared.text_setup(targetfile) 20 | target_text_content = shared.text_clean_up(target_text) 21 | 22 | if mode == "smart": 23 | target_text_content = list(lac.run(target_text_content)) 24 | else: 25 | target_text_content = shared.split_unicode_chrs(target_text_content) 26 | 27 | target_text_content = shared.remove_exclusions(target_text_content, [], True) 28 | 29 | # add to known wordlist if not in wordlist 30 | known_words_list = [] 31 | with open(knownfile, "r+", encoding="utf8") as file: 32 | known_words_list = file.read().splitlines() 33 | 34 | with open(knownfile, "w+", encoding="utf8") as file: 35 | for word in known_words_list: 36 | file.write(word + "\n") 37 | 38 | for word in list(dict.fromkeys(target_text_content)): 39 | if word not in known_words_list: 40 | known_words_list.append(word) 41 | file.write(word + "\n") 42 | 43 | 44 | 45 | if __name__ == "__main__": 46 | add_vocab(args.target, args.known, args.mode) 47 | print("Task completed.") 48 | -------------------------------------------------------------------------------- /core/shared.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | import unicodedata 3 | import pdfminer.high_level 4 | from re import compile as _Re 5 | 6 | def load_word_list_from_file(file: str): 7 | try: 8 | word_list = open(file, "r", encoding="utf8") # filename of your known words here 9 | except KeyError as ke: 10 | raise ke 11 | 12 | words = set( 13 | re.sub(r"\s+", "\n", word_list.read()).split("\n") 14 | ) # splitting to remove accidental whitespace 15 | if "" in words: 16 | words.remove("") 17 | word_list.close() 18 | 19 | finalized_words = words.copy() 20 | 21 | # assume learner knows 
characters used in every word they know 22 | # this is to make parsing words such as 慢慢的 which are not 23 | # on the HSK be "recognized" by the program. 24 | for word in words: 25 | for single_hanzi in [char for char in word]: 26 | finalized_words.add(single_hanzi) 27 | 28 | return finalized_words 29 | 30 | def text_clean_up(target_text): 31 | target_text_content = "".join( 32 | re.sub(r"\s+", "\n", target_text).split("\n") 33 | ) # remove whitespace 34 | 35 | # remove diacritics 36 | normalized = unicodedata.normalize("NFKD", target_text_content) 37 | result = "".join(c for c in normalized if unicodedata.category(c) != "Mn") 38 | return result 39 | 40 | def remove_exclusions(word_list: list, additional_exclusions: list, do_punctuations=False): 41 | punctuations = ( 42 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 43 | ) # NOTE: need to include English punctuation due to PDF reader 44 | # NOTE: punctuations are now disabled by default as that is the industry standard 45 | 46 | if do_punctuations: 47 | exclusions = [char for char in punctuations] 48 | word_list = [word for word in word_list if word not in exclusions] 49 | 50 | word_list = list(filter(lambda x: x not in additional_exclusions and not re.match(r'[a-zA-Z0-9]+', x), word_list)) 51 | 52 | return word_list 53 | 54 | def round_to_nearest_50(x, base=50): 55 | return base * round(x/base) 56 | 57 | def text_setup(file): 58 | _, file_extension = os.path.splitext(file) 59 | if file_extension == ".pdf": 60 | target_text = pdfminer.high_level.extract_text(file) 61 | else: # already in txt format 62 | try: 63 | target_text = open(file, "r", encoding="utf8") # filename of your target text here 64 | target_text = target_text.read() 65 | except KeyError as ke: 66 | raise ke 67 | 68 | return target_text 69 | 70 | 71 | _unicode_chr_splitter = _Re("(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)").split 72 | 73 | def split_unicode_chrs(text): 74 | """ 75 | Split a Chinese text character by character. 76 | 77 | Courtesy of `flow` on StackOverflow: https://stackoverflow.com/a/3798790/12876940 78 | """ 79 | return [chr for chr in _unicode_chr_splitter(text) if chr] 80 | -------------------------------------------------------------------------------- /comprehension.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from LAC import LAC 3 | from collections import Counter 4 | import core.shared as shared 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Calculates percentage comprehension of a text file based on known words." 8 | ) 9 | parser.add_argument( 10 | "-k", 11 | "--known", 12 | required=True, 13 | help="Relative path to .txt file with newline-separated known words.", 14 | ) 15 | parser.add_argument( 16 | "-t", 17 | "--target", 18 | required=True, 19 | help="Relative path to .txt target file in Chinese.", 20 | ) 21 | parser.add_argument( 22 | "-m", 23 | "--mode", 24 | default="smart", 25 | help="Mode for separating text and known vocab: 'smart' (default, word-by-word using jieba) 'simple' (character-by-character)", 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--characters", 30 | required=False, 31 | default=False, 32 | action="store_true", 33 | help="SUGGESTED: Add this flag (just -c, no extra info) if you know all the characters in your wordlist. This is due to segmentation limitation. For ex. 慢慢的 is seen as one word, if this word is not in your wordlist, it will be unknown. 
By setting this flag (and having the characters 慢 and 的 in your wordlist (can be part of other words), 慢慢的 will be an 'understood' word." 34 | ) 35 | parser.add_argument( 36 | "-u", 37 | "--unknown", 38 | required=False, 39 | help="Path to output file with unknown words from text. Skip to not create an output file.", 40 | ) 41 | parser.add_argument( 42 | "-e", 43 | "--exclude", 44 | required=False, 45 | help="Path to .txt file with newline-separated words to exclude (e.g. proper nouns)", 46 | ) 47 | 48 | args = parser.parse_args() 49 | 50 | print("Initializing parser...", end="\r") 51 | lac = LAC(mode='seg') 52 | print("Initializing parser... done\n") 53 | 54 | 55 | def comprehension_checker( 56 | knownfile: str, targetfile: str, mode: str, outputfile: str, excludefile: str, 57 | ) -> str: 58 | known_words = shared.load_word_list_from_file(knownfile) 59 | 60 | exclude_words = [] 61 | if excludefile != None: 62 | exclude_words = shared.load_word_list_from_file(excludefile) 63 | 64 | 65 | # get text in correct format if in PDF format; TODO: more formats 66 | target_text = shared.text_setup(targetfile) 67 | 68 | target_text_content = shared.text_clean_up(target_text) 69 | 70 | character_word_text = "" 71 | if mode == "smart": 72 | character_word_text = "Words" 73 | target_text_content = list(lac.run(target_text_content)) 74 | elif mode == "simple": 75 | character_word_text = "Characters" 76 | target_text_content = shared.split_unicode_chrs(target_text_content) 77 | known_words = set( 78 | "".join([e for e in known_words]) 79 | ) # convert known_words to chr_by_chr too 80 | else: 81 | raise KeyError("mode provided invalid") 82 | 83 | target_text_content = shared.remove_exclusions(target_text_content, exclude_words) 84 | counted_target = Counter(target_text_content) 85 | 86 | # get rid of punctuations, as this is used for writing and stats 87 | punctuations = ( 88 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 89 | ) 90 | punctuations = [e for e in punctuations] 91 | # delete elements in punctuations from counted_target Counter object 92 | for e in punctuations: 93 | del counted_target[e] 94 | 95 | target_length = len(target_text_content) # includes punctuation 96 | 97 | total_unique_words = len(counted_target) 98 | counter = 0 99 | crosstext_count = 0 # counter of words that are understood 100 | unknown_words = [] 101 | unknown_word_counter = 0 102 | 103 | for hanzi, count in counted_target.items(): # hanzzi represents a full word (unless simple mode) 104 | counter += 1 105 | print(f"-- {counter/total_unique_words * 100:.2f}% complete --", end="\r") 106 | if hanzi in known_words: 107 | crosstext_count += count 108 | elif set([char for char in hanzi]).issubset(set(known_words)) and args.characters: 109 | # ex. 
user knows 慢 的,慢慢的=对 110 | crosstext_count += count 111 | else: 112 | unknown_word_counter += 1 113 | if outputfile is not None: 114 | unknown_words.append((hanzi, count)) 115 | 116 | unknown_words.sort(key=sort_by_count, reverse=True) 117 | if outputfile is not None: 118 | try: 119 | with open(outputfile, "w+", encoding="utf8") as file: 120 | for ele, count in unknown_words: 121 | file.write(ele + " : " + str(count) + "\n") 122 | except KeyError as ke: 123 | return ke 124 | 125 | return ( 126 | f"\nWord Count: {len(target_text_content)} (excluding 'exclusions')" 127 | + "\nTotal Unique " + f"{character_word_text}" + ": " 128 | + f"{total_unique_words}" 129 | +"\nComprehension: " 130 | + f"{crosstext_count/target_length * 100:.3f}%" 131 | + "\nUnique Unknown " + f"{character_word_text}" + ": " 132 | + f"{unknown_word_counter}" 133 | ) 134 | 135 | def sort_by_count(e): 136 | return e[1] 137 | 138 | if __name__ == "__main__": 139 | print(comprehension_checker(args.known, args.target, args.mode, args.unknown, args.exclude)) 140 | -------------------------------------------------------------------------------- /analyzer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import csv 4 | from tabulate import tabulate 5 | from collections import Counter 6 | from re import compile as _Re 7 | import core.shared as shared 8 | 9 | os.system("") 10 | 11 | parser = argparse.ArgumentParser( 12 | description="Calculate unique words and character count of a text file - result is rounded to nearest 50" 13 | ) 14 | parser.add_argument( 15 | "-k", 16 | "--known", 17 | required=False, 18 | help="Relative path to .txt file with newline-separated known words for *ing in output.", 19 | ) 20 | parser.add_argument( 21 | "-t", 22 | "--target", 23 | required=True, 24 | help="Relative path to .txt target file in Chinese.", 25 | ) 26 | parser.add_argument( 27 | "-o", 28 | "--output", 29 | required=False, 30 | help="Path to output file with all words & characters words from text. Skip to not create an output file.", 31 | ) 32 | parser.add_argument( 33 | "-e", 34 | "--exclude", 35 | required=False, 36 | help="Path to .txt file with newline-separated words to exclude (e.g. proper nouns).", 37 | ) 38 | parser.add_argument( 39 | "-n", 40 | "--no-words", 41 | dest="no_words", 42 | action="store_true", 43 | help="Setting this flag will mean that the tool does not segment words, so you will not have a calculating of # of words, # of unique words, and HSK breakdown. Can lead to a significant speedup, as segmentation takes approx. 1 minute per 1 million characters. Off by default. To set, simply add -n." 44 | ) 45 | 46 | parser.set_defaults(no_words=False) 47 | args = parser.parse_args() 48 | 49 | _unicode_chr_splitter = _Re("(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)").split 50 | 51 | 52 | def split_unicode_chrs(text): 53 | """ 54 | Split a Chinese text character by character. 
55 | 56 | Curtesy of `flow` on StackOverflow: https://stackoverflow.com/a/3798790/12876940 57 | """ 58 | return [chr for chr in _unicode_chr_splitter(text) if chr] 59 | 60 | 61 | def text_analyzer( 62 | knownfile: str, targetfile: str, outputfile: str, excludefile: str, no_words: bool = False 63 | ) -> str: 64 | try: 65 | known_words = shared.load_word_list_from_file(knownfile) 66 | except TypeError: 67 | known_words = [] 68 | 69 | exclude_words = [] 70 | if excludefile != None: 71 | exclude_words = shared.load_word_list_from_file(excludefile) 72 | 73 | # access text in .txt format 74 | target_text = shared.text_setup(targetfile) 75 | 76 | target_text_content = shared.text_clean_up(target_text) 77 | target_text_content = ''.join(shared.remove_exclusions(target_text_content, exclude_words)) 78 | target_character_content = split_unicode_chrs(target_text_content) 79 | counted_target_character = Counter(shared.remove_exclusions(target_character_content, exclude_words)) 80 | 81 | punctuations = ( 82 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 83 | ) 84 | punctuations = [e for e in punctuations] 85 | # delete the elements in punctuations from counted_target Counter object 86 | for e in punctuations: 87 | del counted_target_character[e] 88 | 89 | total_unique_characters = len(counted_target_character) 90 | 91 | if not no_words: 92 | # import LAC for segmentation 93 | from LAC import LAC 94 | 95 | # initialize the parser 96 | print("Initializing parser...", end="\r") 97 | lac = LAC(mode='seg') 98 | print("Initializing parser... done\n") 99 | 100 | target_word_content = list(lac.run(target_text_content)) 101 | stripped_target_word_content = shared.remove_exclusions(target_word_content, exclude_words, True) # get rid of punctuation that inflates the word count 102 | counted_target_word = Counter(stripped_target_word_content) # yes for getting rid of punctuations 103 | total_unique_words = len(counted_target_word) 104 | 105 | # calculate hsk distribution 106 | hsk_distribution = {} 107 | with open('data/hsk_list.csv', mode='r', encoding="utf8") as csv_file: 108 | rows = csv.reader(csv_file, delimiter=",") 109 | for row in rows: 110 | if row[0] != "hanzi": # first row 111 | hsk_distribution[row[0]] = { 112 | "level": row[1], 113 | "pinyin": row[2], 114 | "meaning": row[3] 115 | } 116 | 117 | hsk_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, "-": 0} 118 | for word in stripped_target_word_content: 119 | try: 120 | hsk_counts[int(hsk_distribution[word]["level"])] += 1 121 | except: 122 | hsk_counts["-"] += 1 123 | 124 | total_value = 0 125 | all_values = sum(hsk_counts.values()) 126 | for (key, value) in hsk_counts.items(): 127 | total_value += value 128 | percentage = round((total_value / all_values) * 100, 3) 129 | value = [str(value), f" ({percentage}%)"] 130 | hsk_counts[key] = value 131 | 132 | hsk_output = [] 133 | for (key, value) in hsk_counts.items(): 134 | hsk_output.append([key, value[0], value[1]]) 135 | 136 | if outputfile is not None: 137 | try: 138 | with open(outputfile, "w+", encoding="utf8") as file: 139 | if not no_words: 140 | file.write("=== All Unique Words ===\n") 141 | total_count = sum(counted_target_word.values()) 142 | current_cumulative_count = 0 143 | for ele, count in counted_target_word.most_common(): 144 | current_cumulative_count += count 145 | if ele not in known_words: 146 | ele = "*" + str(ele) 147 | file.write(ele + (8 - len(ele)) * " " + ": " + str(count) + (7 - len(str(count))) * " " + ": " + 8 * " " 
+ str(round((current_cumulative_count * 100) / total_count, 3)) + "%\n") 148 | 149 | file.write("\n\n\n") 150 | file.write("=== All Unique Characters ===\n") 151 | total_count = sum(counted_target_character.values()) 152 | current_cumulative_count = 0 153 | for ele, count in counted_target_character.most_common(): 154 | current_cumulative_count += count 155 | if ele not in known_words: 156 | ele = "*" + str(ele) 157 | file.write(ele + (8 - len(ele)) * " " + ": " + str(count) + (7 - len(str(count))) * " " + ": " + 8 * " " + str(round((current_cumulative_count * 100) / total_count, 3)) + "%\n") 158 | except KeyError as ke: 159 | return ke 160 | 161 | if not no_words: 162 | return ( 163 | "\nTotal Words: " 164 | + f"{shared.round_to_nearest_50(len(stripped_target_word_content))}" # stripped as in no punctuations 165 | "\nTotal Unique Words: " 166 | + f"{shared.round_to_nearest_50(total_unique_words)}" 167 | "\nTotal Characters: " 168 | + f"{shared.round_to_nearest_50(len(target_text_content))}" 169 | "\nTotal Unique Characters: " 170 | + f"{shared.round_to_nearest_50(total_unique_characters)}" 171 | + "\n\n=== HSK Breakdown ===\n" 172 | + tabulate(hsk_output, headers=["Level", "Count", "Cumulative Frequency"]) 173 | ) 174 | else: 175 | return ( 176 | "Total Characters: " 177 | + f"{shared.round_to_nearest_50(len(target_text_content))}" 178 | "\nTotal Unique Characters: " 179 | + f"{shared.round_to_nearest_50(total_unique_characters)}" 180 | ) 181 | 182 | if __name__ == "__main__": 183 | print(text_analyzer(args.known, args.target, args.output, args.exclude, args.no_words)) 184 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chinese-comprehension 2 | Analyze a Chinese text using your known words to gauge comprehension. 3 | 4 | # Requirements 5 | * Python 3.9 or below 6 | * [LAC](https://github.com/baidu/lac/) - Chinese character segmentation library 7 | 8 | ![image](https://user-images.githubusercontent.com/61620873/118395391-88917000-b64a-11eb-8341-3fe3a12aec27.png) 9 | ![image](https://user-images.githubusercontent.com/61620873/118395410-a65ed500-b64a-11eb-8d3c-5546e65b9d5a.png) 10 | ![image](https://user-images.githubusercontent.com/61620873/118395468-db6b2780-b64a-11eb-8d29-cc745228ea18.png) 11 | 12 | 13 | ## Features 14 | - Count unique words in text 15 | - Count unique unknown words in text 16 | - Calculate comprehension of text based on your known words 17 | - Calculate the above splitting text and known vocab word-by-word or character-by-character 18 | - Generate a breakdown of the text by HSK level 19 | - Exclude words such as proper nouns to improve comprehension accuracy 20 | - Output unknown words into a file, sorted by frequency 21 | - Add all words from book to known wordlist 22 | 23 | ## Installation 24 | *Non-technical explanation for those not familiar with programming.* 25 | 26 | Step 1: Download Python 3.9 or below. [Link](https://www.python.org/downloads/release/python-3912/) 27 | 28 | **Note program will not run on Python 3.10 or above** 29 | 30 | Step 2: Install Python 3.9. When installing you need to make sure you click the box on the bottom that says "install path variable". 31 | 32 | Step 3: After Python is installed click on the search bar and type cmd to start command prompt. This will pull up a window that is "Command Prompt" (Windows). 
33 | 
34 | If you are using Mac, simply open a terminal window by pressing cmd-space and then typing terminal and opening said app.
35 | 
36 | Step 4: Verify you have Python installed by typing `python --version`; it should tell you that you have Python 3.X.X installed. If the version says 2.X.X, try typing `python3 --version` instead, and use `python3 -m pip` instead of `pip` in future steps.
37 | 
38 | Step 5: Download the comprehension zip file [here](https://github.com/Destaq/chinese-comprehension) by clicking the green button and clicking download zip.
39 | 
40 | Step 6: Once the comprehension zip file is downloaded, you can extract it to wherever you want it to be.
41 | 
42 | Step 7: Open the command prompt/terminal window and navigate to the folder you extracted the comprehension zip file to. [How to navigate in the terminal tutorial](https://tutorials.codebar.io/command-line/introduction/tutorial.html). You can also navigate by dragging the folder to the command prompt.
43 | 
44 | Step 8: Type in `pip install -r requirements.txt` in the command line once you have navigated to the comprehension folder.
45 | 
46 | Step 9: You can now refer to the documentation on this page to use the tool!
47 | 
48 | Finally (technical), note that if you are using an **M1 Mac**, you will need to run everything via a virtual environment as this chip is not yet supported by LAC. This can be easily done with conda by following these [steps](https://github.com/conda-forge/miniforge/issues/165#issuecomment-860233092).
49 | 
50 | ## Usage
51 | ```
52 | usage: comprehension.py [-h] -k KNOWN -t TARGET [-m MODE] [-c] [-u UNKNOWN]
53 |                         [-e EXCLUDE]
54 | 
55 | Calculates percentage comprehension of a text file based on known words.
56 | 
57 | optional arguments:
58 |   -h, --help            show this help message and exit
59 |   -k KNOWN, --known KNOWN
60 |                         Relative path to .txt file with newline-separated known words.
61 |   -t TARGET, --target TARGET
62 |                         Relative path to .txt target file in Chinese.
63 |   -m MODE, --mode MODE  Mode for separating text and known vocab: 'smart' (default, word-by-word using LAC) or 'simple' (character-by-character)
64 |   -c, --characters      Add this flag (just -c, no extra info) if you know all the characters in your wordlist. This is due to a segmentation limitation. For ex. 慢慢的 is seen as one word; if this word is not in your wordlist,
65 |                         it will be unknown. By setting this flag (and having the characters 慢 and 的 in your wordlist, even as part of other words), 慢慢的 will be an 'understood' word.
66 |   -u UNKNOWN, --unknown UNKNOWN
67 |                         Path to output file with unknown words from text. Skip to not create an output file.
68 |   -e EXCLUDE, --exclude EXCLUDE
69 |                         Path to .txt file with newline-separated words to exclude (e.g. proper nouns)
70 | ```
71 | 
72 | The `--known` parameter takes the filename containing known words. For best accuracy, this file should contain all the words you know. Methods for fetching these words:
73 | - export from Anki
74 | - export from Pleco
75 | - take HSK test
76 | - consult [HelloChinese word list](https://docs.google.com/spreadsheets/d/1PppWybtv_ch5QMqtWlU4kAm08uFuhYK-6HGVnGeT63Y/edit#gid=121546596)
77 | 
78 | The file should have words separated line-by-line:
79 | ```
80 | 是
81 | 你好
82 | 再见
83 | 有
84 | 五
85 | ...
86 | ```
87 | 
88 | The `--target` parameter takes the filename containing the target text. This should be normally formatted:
89 | ```
90 | 美猴王一見,倒身下拜,磕頭不計其數,口中只道:「師父,師父,我弟子志心
91 | 朝禮,志心朝禮。」祖師道:「你是那方人氏?且說個鄉貫、姓名明白,再拜。」
92 | 猴王道:「弟子乃東勝神洲傲來國花果山水簾洞人氏。」祖師喝令:「趕出去!
93 | 他本是個撒詐搗虛之徒,那裏
94 | ...
95 | ```
96 | 
97 | The `-c` or `--characters` flag allows you to mark words which would otherwise be unknown as known, as long as you know all of the characters that make them up. Due to the way the word segmenter works, many words that learners are likely to know are graded separately, and thus would not be present in the `known.txt` file.
98 | 
99 | For example, say a learner knows the word `开心` and the particle `地`. Logically, they would be expected to understand the word `开心地`, or happily. However, because this word is parsed *standalone*, unless it is explicitly on the wordlist, it would be viewed as unknown. This behavior can be bypassed by setting the `-c` flag, ex. `python3 comprehension.py -k "known.txt" -t "myfile.pdf" -c`. Keep in mind that this method is also not perfect, because independent words made up of known characters may have differing meanings (e.g. 头发 - learners may know 头 and 发 but not the two in conjunction).
100 | 
101 | Quick side note: the `"` here are not required, but it's best to put them in anyway. If your filename has a space in it (for example `my known.txt`), then that will obviously mess with the command line, so this would be a reason to put speech marks around it. If you're not sure, just always put speech marks around the arguments.
102 | 
103 | It is advised to take comprehension measured with the `-c` flag with a grain of salt; depending on the difficulty of the text, the real level is likely to be some percentage points lower. But it is still far more accurate than without the flag.
104 | 
105 | `--mode` allows you to switch between 'simple' and 'smart' mode, where the default is 'smart' - segmenting text word-by-word (ex. 你/有/什么/名字/? for smart vs 你/有/什/么…… for simple).
106 | 
107 | `--unknown` allows you to create a file with all the unknown words in the text, in the format:
108 | ```
109 | Hanzi : Count
110 | Hanzi : Count
111 | ...
112 | ```
113 | 
114 | which is sorted by frequency. Ideal when preparing for a more difficult text or wanting to recap words. __This file has to be in .txt format__. Ex. `python3 comprehension.py -k "data/known.txt" -t "books/Earth_Vernes.pdf" -u "data/unknown_words.txt"`.
115 | 
116 | The `--exclude` parameter takes the filename containing words to exclude. Exclude any proper nouns such as character names & company names to improve accuracy.
117 | 
118 | The file should have words separated line-by-line:
119 | ```
120 | 安琪
121 | 赵宁一
122 | 爱丽丝
123 | 麦当劳
124 | ...
125 | ```
126 | 
127 | ### Example
128 | 
129 | *Code*: `python3 comprehension.py --known "known.txt" -t "samples/books/Great_Expectations.pdf" -u "output.txt"`
130 | *Description*: Gathers known words from `known.txt`, and analyzes `samples/books/Great_Expectations.pdf` using the default word-by-word splitting. Unknown words are outputted to `output.txt`.
131 | 
132 | *Content of `output.txt`*
133 | ```
134 | 道 : 4621
135 | 行者 : 2575
136 | 來 : 1665
137 | 裏 : 1591
138 | 與 : 1498
139 | 又 : 1485
140 | 卻 : 1264
141 | ...
142 | ```
143 | 
144 | # Analyzer
145 | 
146 | ## Usage
147 | ```
148 | usage: analyzer.py [-h] -t TARGET [-o OUTPUT] [-e EXCLUDE] [-n]
149 | 
150 | Calculate unique words and character count of a text file - result is rounded
151 | to nearest 50. Note that character counts may not line up with character counts
152 | seen in official webnovel figures, as this tool does not count punctuation.
153 | 
154 | optional arguments:
155 |   -h, --help            show this help message and exit
156 |   -t TARGET, --target TARGET
157 |                         Relative path to .txt or .pdf target file in Chinese.
158 |   -o OUTPUT, --output OUTPUT
159 |                         Path to output file with all words & characters from
160 |                         text. Skip to not create an output file.
161 |   -e EXCLUDE, --exclude EXCLUDE
162 |                         Path to .txt file with newline-separated words to
163 |                         exclude (e.g. proper nouns)
164 |   -n, --no-words
165 |                         Setting this flag means that the tool does not segment words, so you will not get a count of # of words, # of unique words, or an HSK breakdown. Can lead to a significant speedup, as segmentation takes approx. 1 minute per 1 million characters. Off by default. To set, simply add -n.
166 | ```
167 | 
168 | Finally, you can use the `--known` or `-k` argument to point to a file on your system with newline-separated words that you know. By doing so, and also outputting a file, any words/characters that you don't know will have a star symbol (*) next to them.
169 | 
170 | ### Example
171 | 
172 | *Code*: `python3 analyzer.py -t "samples/books/journey_to_the_west.txt" -o "output.txt"`
173 | *Description*: Analyzes `samples/books/journey_to_the_west.txt` using both word-by-word and character-by-character splitting. Outputs all characters and words to `output.txt`.
174 | 
175 | *Output*
176 | ```
177 | Total Unique Words: 32226
178 | Total Unique Characters: 3572
179 | ```
180 | 
181 | *Content of `output.txt`*
182 | ```
183 | === All Unique Words ===
184 | 的 : 18840
185 | 了 : 15791
186 | 昇 : 10683
187 | 周 : 9155
188 | 余皓 : 8995
189 | 我 : 7512
190 | ...
191 | 
192 | === All Unique Characters ===
193 | 的 : 19664 (4.2%)
194 | 了 : 17135 (5.6%)
195 | 余 : 14223 ...
196 | *皓 : 14103
197 | 一 : 12667
198 | 周 : 11641
199 | 昇 : 10756
200 | 不 : 10217
201 | 我 : 8723
202 | ```
203 | 
204 | Note the *. It is put next to words that are not known, if a knownfile is provided. Likewise, note the percentages: these are the cumulative frequency percentages for the text.
205 | Currently only possible if smart mode is selected.
206 | 
207 | # Vocab Adder
208 | The `vocab_adder` file is extremely simple. It allows you to input a file and your known vocab list, and will append all unknown words in the file to your vocab list.
209 | 
210 | Example:
211 | `python3 vocab_adder.py -t books/Earth_Vernes.pdf -k data/known.txt`
212 | 
213 | You can specify the mode (the default is smart, i.e. word segmentation) with the `-m` flag by typing `--mode simple`.
214 | 
215 | # FAQ
216 | 
217 | Q: How is punctuation counted?
218 | A: As is the industry standard, punctuation (periods, commas, etc.) *is* included in the total character count. However, it is not included in the unique character count nor in any output files.
219 | 
--------------------------------------------------------------------------------
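
For readers who want to see what that punctuation rule looks like in practice, here is a minimal, self-contained sketch. It is illustrative only — the toy text and the two-mark punctuation string below stand in for the much longer punctuation set the tools actually use — but it mirrors how `comprehension.py` and `analyzer.py` build a `Counter` over the text and then delete the punctuation keys before computing unique counts and writing output:

```python
# Illustrative sketch of the punctuation rule described in the FAQ.
# The toy text and two punctuation marks are placeholders; the real scripts
# use LAC segmentation and a far longer punctuation string.
from collections import Counter

text = "我爱你。你好!"                      # 7 characters, 2 of them punctuation
counted = Counter(text)                      # per-character frequency counts

total_characters = sum(counted.values())     # punctuation included -> 7

# Mirror comprehension.py / analyzer.py: drop punctuation keys from the Counter
for mark in "。!":
    del counted[mark]

unique_characters = len(counted)             # punctuation excluded -> 4
print(total_characters, unique_characters)   # prints: 7 4
```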