├── core ├── __init__.py └── shared.py ├── requirements.txt ├── .gitignore ├── vocab_adder.py ├── comprehension.py ├── analyzer.py └── README.md /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pdfminer.six==20201018 2 | LAC==2.1.1 3 | Unidecode==1.2.0 4 | tabulate==0.8.9 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /core/__pycache__ 2 | .DS_STORE 3 | /books 4 | /data/*.txt 5 | output.txt 6 | mega_book.py 7 | -------------------------------------------------------------------------------- /vocab_adder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import core.shared as shared 3 | from collections import Counter 4 | from LAC import LAC 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Add words from a file to your known vocab." 8 | ) 9 | parser.add_argument("-t", "--target", required=True, help="Path to .txt or .pdf file of target text to add") 10 | parser.add_argument("-k", "--known", required=True, help="Path to .txt file where vocabulary from the text will be dumped.") 11 | parser.add_argument("-m", "--mode", default="smart", help="smart/simple (smart default) - whether to split text char-by-char (simple) or word-by-word (smart)") 12 | args = parser.parse_args() 13 | 14 | print("Initializing parser...", end="\r") 15 | lac = LAC(mode='seg') 16 | print("Initializing parser... done\n") 17 | 18 | def add_vocab(targetfile: str, knownfile: str, mode: str): 19 | target_text = shared.text_setup(targetfile) 20 | target_text_content = shared.text_clean_up(target_text) 21 | 22 | if mode == "smart": 23 | target_text_content = list(lac.run(target_text_content)) 24 | else: 25 | target_text_content = shared.split_unicode_chrs(target_text_content) 26 | 27 | target_text_content = shared.remove_exclusions(target_text_content, [], True) 28 | 29 | # add to known wordlist if not in wordlist 30 | known_words_list = [] 31 | with open(knownfile, "r+", encoding="utf8") as file: 32 | known_words_list = file.read().splitlines() 33 | 34 | with open(knownfile, "w+", encoding="utf8") as file: 35 | for word in known_words_list: 36 | file.write(word + "\n") 37 | 38 | for word in list(dict.fromkeys(target_text_content)): 39 | if word not in known_words_list: 40 | known_words_list.append(word) 41 | file.write(word + "\n") 42 | 43 | 44 | 45 | if __name__ == "__main__": 46 | add_vocab(args.target, args.known, args.mode) 47 | print("Task completed.") 48 | -------------------------------------------------------------------------------- /core/shared.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | import unicodedata 3 | import pdfminer.high_level 4 | from re import compile as _Re 5 | 6 | def load_word_list_from_file(file: str): 7 | try: 8 | word_list = open(file, "r", encoding="utf8") # filename of your known words here 9 | except KeyError as ke: 10 | raise ke 11 | 12 | words = set( 13 | re.sub(r"\s+", "\n", word_list.read()).split("\n") 14 | ) # splitting to remove accidental whitespace 15 | if "" in words: 16 | words.remove("") 17 | word_list.close() 18 | 19 | finalized_words = words.copy() 20 | 21 | # assume learner knows 
characters used in every word they know 22 | # this is to make parsing words such as 慢慢的 which are not 23 | # on the HSK be "recognized" by the program. 24 | for word in words: 25 | for single_hanzi in [char for char in word]: 26 | finalized_words.add(single_hanzi) 27 | 28 | return finalized_words 29 | 30 | def text_clean_up(target_text): 31 | target_text_content = "".join( 32 | re.sub(r"\s+", "\n", target_text).split("\n") 33 | ) # remove whitespace 34 | 35 | # remove diacritics 36 | normalized = unicodedata.normalize("NFKD", target_text_content) 37 | result = "".join(c for c in normalized if unicodedata.category(c) != "Mn") 38 | return result 39 | 40 | def remove_exclusions(word_list: list, additional_exclusions: list, do_punctuations=False): 41 | punctuations = ( 42 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 43 | ) # NOTE: need to include English punctuation due to PDF reader 44 | # NOTE: punctuations are now disabled by default as that is the industry standard 45 | 46 | if do_punctuations: 47 | exclusions = [char for char in punctuations] 48 | word_list = [word for word in word_list if word not in exclusions] 49 | 50 | word_list = list(filter(lambda x: x not in additional_exclusions and not re.match(r'[a-zA-Z0-9]+', x), word_list)) 51 | 52 | return word_list 53 | 54 | def round_to_nearest_50(x, base=50): 55 | return base * round(x/base) 56 | 57 | def text_setup(file): 58 | _, file_extension = os.path.splitext(file) 59 | if file_extension == ".pdf": 60 | target_text = pdfminer.high_level.extract_text(file) 61 | else: # already in txt format 62 | try: 63 | target_text = open(file, "r", encoding="utf8") # filename of your target text here 64 | target_text = target_text.read() 65 | except KeyError as ke: 66 | raise ke 67 | 68 | return target_text 69 | 70 | 71 | _unicode_chr_splitter = _Re("(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)").split 72 | 73 | def split_unicode_chrs(text): 74 | """ 75 | Split a Chinese text character by character. 76 | 77 | Courtesy of `flow` on StackOverflow: https://stackoverflow.com/a/3798790/12876940 78 | """ 79 | return [chr for chr in _unicode_chr_splitter(text) if chr] 80 | -------------------------------------------------------------------------------- /comprehension.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from LAC import LAC 3 | from collections import Counter 4 | import core.shared as shared 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Calculates percentage comprehension of a text file based on known words." 8 | ) 9 | parser.add_argument( 10 | "-k", 11 | "--known", 12 | required=True, 13 | help="Relative path to .txt file with newline-separated known words.", 14 | ) 15 | parser.add_argument( 16 | "-t", 17 | "--target", 18 | required=True, 19 | help="Relative path to .txt target file in Chinese.", 20 | ) 21 | parser.add_argument( 22 | "-m", 23 | "--mode", 24 | default="smart", 25 | help="Mode for separating text and known vocab: 'smart' (default, word-by-word using jieba) 'simple' (character-by-character)", 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--characters", 30 | required=False, 31 | default=False, 32 | action="store_true", 33 | help="SUGGESTED: Add this flag (just -c, no extra info) if you know all the characters in your wordlist. This is due to segmentation limitation. For ex. 慢慢的 is seen as one word, if this word is not in your wordlist, it will be unknown. 
By setting this flag (and having the characters 慢 and 的 in your wordlist (can be part of other words), 慢慢的 will be an 'understood' word." 34 | ) 35 | parser.add_argument( 36 | "-u", 37 | "--unknown", 38 | required=False, 39 | help="Path to output file with unknown words from text. Skip to not create an output file.", 40 | ) 41 | parser.add_argument( 42 | "-e", 43 | "--exclude", 44 | required=False, 45 | help="Path to .txt file with newline-separated words to exclude (e.g. proper nouns)", 46 | ) 47 | 48 | args = parser.parse_args() 49 | 50 | print("Initializing parser...", end="\r") 51 | lac = LAC(mode='seg') 52 | print("Initializing parser... done\n") 53 | 54 | 55 | def comprehension_checker( 56 | knownfile: str, targetfile: str, mode: str, outputfile: str, excludefile: str, 57 | ) -> str: 58 | known_words = shared.load_word_list_from_file(knownfile) 59 | 60 | exclude_words = [] 61 | if excludefile != None: 62 | exclude_words = shared.load_word_list_from_file(excludefile) 63 | 64 | 65 | # get text in correct format if in PDF format; TODO: more formats 66 | target_text = shared.text_setup(targetfile) 67 | 68 | target_text_content = shared.text_clean_up(target_text) 69 | 70 | character_word_text = "" 71 | if mode == "smart": 72 | character_word_text = "Words" 73 | target_text_content = list(lac.run(target_text_content)) 74 | elif mode == "simple": 75 | character_word_text = "Characters" 76 | target_text_content = shared.split_unicode_chrs(target_text_content) 77 | known_words = set( 78 | "".join([e for e in known_words]) 79 | ) # convert known_words to chr_by_chr too 80 | else: 81 | raise KeyError("mode provided invalid") 82 | 83 | target_text_content = shared.remove_exclusions(target_text_content, exclude_words) 84 | counted_target = Counter(target_text_content) 85 | 86 | # get rid of punctuations, as this is used for writing and stats 87 | punctuations = ( 88 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 89 | ) 90 | punctuations = [e for e in punctuations] 91 | # delete elements in punctuations from counted_target Counter object 92 | for e in punctuations: 93 | del counted_target[e] 94 | 95 | target_length = len(target_text_content) # includes punctuation 96 | 97 | total_unique_words = len(counted_target) 98 | counter = 0 99 | crosstext_count = 0 # counter of words that are understood 100 | unknown_words = [] 101 | unknown_word_counter = 0 102 | 103 | for hanzi, count in counted_target.items(): # hanzzi represents a full word (unless simple mode) 104 | counter += 1 105 | print(f"-- {counter/total_unique_words * 100:.2f}% complete --", end="\r") 106 | if hanzi in known_words: 107 | crosstext_count += count 108 | elif set([char for char in hanzi]).issubset(set(known_words)) and args.characters: 109 | # ex. 
user knows 慢 的,慢慢的=对 110 | crosstext_count += count 111 | else: 112 | unknown_word_counter += 1 113 | if outputfile is not None: 114 | unknown_words.append((hanzi, count)) 115 | 116 | unknown_words.sort(key=sort_by_count, reverse=True) 117 | if outputfile is not None: 118 | try: 119 | with open(outputfile, "w+", encoding="utf8") as file: 120 | for ele, count in unknown_words: 121 | file.write(ele + " : " + str(count) + "\n") 122 | except KeyError as ke: 123 | return ke 124 | 125 | return ( 126 | f"\nWord Count: {len(target_text_content)} (excluding 'exclusions')" 127 | + "\nTotal Unique " + f"{character_word_text}" + ": " 128 | + f"{total_unique_words}" 129 | +"\nComprehension: " 130 | + f"{crosstext_count/target_length * 100:.3f}%" 131 | + "\nUnique Unknown " + f"{character_word_text}" + ": " 132 | + f"{unknown_word_counter}" 133 | ) 134 | 135 | def sort_by_count(e): 136 | return e[1] 137 | 138 | if __name__ == "__main__": 139 | print(comprehension_checker(args.known, args.target, args.mode, args.unknown, args.exclude)) 140 | -------------------------------------------------------------------------------- /analyzer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import csv 4 | from tabulate import tabulate 5 | from collections import Counter 6 | from re import compile as _Re 7 | import core.shared as shared 8 | 9 | os.system("") 10 | 11 | parser = argparse.ArgumentParser( 12 | description="Calculate unique words and character count of a text file - result is rounded to nearest 50" 13 | ) 14 | parser.add_argument( 15 | "-k", 16 | "--known", 17 | required=False, 18 | help="Relative path to .txt file with newline-separated known words for *ing in output.", 19 | ) 20 | parser.add_argument( 21 | "-t", 22 | "--target", 23 | required=True, 24 | help="Relative path to .txt target file in Chinese.", 25 | ) 26 | parser.add_argument( 27 | "-o", 28 | "--output", 29 | required=False, 30 | help="Path to output file with all words & characters words from text. Skip to not create an output file.", 31 | ) 32 | parser.add_argument( 33 | "-e", 34 | "--exclude", 35 | required=False, 36 | help="Path to .txt file with newline-separated words to exclude (e.g. proper nouns).", 37 | ) 38 | parser.add_argument( 39 | "-n", 40 | "--no-words", 41 | dest="no_words", 42 | action="store_true", 43 | help="Setting this flag will mean that the tool does not segment words, so you will not have a calculating of # of words, # of unique words, and HSK breakdown. Can lead to a significant speedup, as segmentation takes approx. 1 minute per 1 million characters. Off by default. To set, simply add -n." 44 | ) 45 | 46 | parser.set_defaults(no_words=False) 47 | args = parser.parse_args() 48 | 49 | _unicode_chr_splitter = _Re("(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)").split 50 | 51 | 52 | def split_unicode_chrs(text): 53 | """ 54 | Split a Chinese text character by character. 
55 | 56 | Curtesy of `flow` on StackOverflow: https://stackoverflow.com/a/3798790/12876940 57 | """ 58 | return [chr for chr in _unicode_chr_splitter(text) if chr] 59 | 60 | 61 | def text_analyzer( 62 | knownfile: str, targetfile: str, outputfile: str, excludefile: str, no_words: bool = False 63 | ) -> str: 64 | try: 65 | known_words = shared.load_word_list_from_file(knownfile) 66 | except TypeError: 67 | known_words = [] 68 | 69 | exclude_words = [] 70 | if excludefile != None: 71 | exclude_words = shared.load_word_list_from_file(excludefile) 72 | 73 | # access text in .txt format 74 | target_text = shared.text_setup(targetfile) 75 | 76 | target_text_content = shared.text_clean_up(target_text) 77 | target_text_content = ''.join(shared.remove_exclusions(target_text_content, exclude_words)) 78 | target_character_content = split_unicode_chrs(target_text_content) 79 | counted_target_character = Counter(shared.remove_exclusions(target_character_content, exclude_words)) 80 | 81 | punctuations = ( 82 | ",.:()!@[]+/\\!??。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─\''\"" 83 | ) 84 | punctuations = [e for e in punctuations] 85 | # delete the elements in punctuations from counted_target Counter object 86 | for e in punctuations: 87 | del counted_target_character[e] 88 | 89 | total_unique_characters = len(counted_target_character) 90 | 91 | if not no_words: 92 | # import LAC for segmentation 93 | from LAC import LAC 94 | 95 | # initialize the parser 96 | print("Initializing parser...", end="\r") 97 | lac = LAC(mode='seg') 98 | print("Initializing parser... done\n") 99 | 100 | target_word_content = list(lac.run(target_text_content)) 101 | stripped_target_word_content = shared.remove_exclusions(target_word_content, exclude_words, True) # get rid of punctuation that inflates the word count 102 | counted_target_word = Counter(stripped_target_word_content) # yes for getting rid of punctuations 103 | total_unique_words = len(counted_target_word) 104 | 105 | # calculate hsk distribution 106 | hsk_distribution = {} 107 | with open('data/hsk_list.csv', mode='r', encoding="utf8") as csv_file: 108 | rows = csv.reader(csv_file, delimiter=",") 109 | for row in rows: 110 | if row[0] != "hanzi": # first row 111 | hsk_distribution[row[0]] = { 112 | "level": row[1], 113 | "pinyin": row[2], 114 | "meaning": row[3] 115 | } 116 | 117 | hsk_counts = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, "-": 0} 118 | for word in stripped_target_word_content: 119 | try: 120 | hsk_counts[int(hsk_distribution[word]["level"])] += 1 121 | except: 122 | hsk_counts["-"] += 1 123 | 124 | total_value = 0 125 | all_values = sum(hsk_counts.values()) 126 | for (key, value) in hsk_counts.items(): 127 | total_value += value 128 | percentage = round((total_value / all_values) * 100, 3) 129 | value = [str(value), f" ({percentage}%)"] 130 | hsk_counts[key] = value 131 | 132 | hsk_output = [] 133 | for (key, value) in hsk_counts.items(): 134 | hsk_output.append([key, value[0], value[1]]) 135 | 136 | if outputfile is not None: 137 | try: 138 | with open(outputfile, "w+", encoding="utf8") as file: 139 | if not no_words: 140 | file.write("=== All Unique Words ===\n") 141 | total_count = sum(counted_target_word.values()) 142 | current_cumulative_count = 0 143 | for ele, count in counted_target_word.most_common(): 144 | current_cumulative_count += count 145 | if ele not in known_words: 146 | ele = "*" + str(ele) 147 | file.write(ele + (8 - len(ele)) * " " + ": " + str(count) + (7 - len(str(count))) * " " + ": " + 8 * " " 
+ str(round((current_cumulative_count * 100) / total_count, 3)) + "%\n") 148 | 149 | file.write("\n\n\n") 150 | file.write("=== All Unique Characters ===\n") 151 | total_count = sum(counted_target_character.values()) 152 | current_cumulative_count = 0 153 | for ele, count in counted_target_character.most_common(): 154 | current_cumulative_count += count 155 | if ele not in known_words: 156 | ele = "*" + str(ele) 157 | file.write(ele + (8 - len(ele)) * " " + ": " + str(count) + (7 - len(str(count))) * " " + ": " + 8 * " " + str(round((current_cumulative_count * 100) / total_count, 3)) + "%\n") 158 | except KeyError as ke: 159 | return ke 160 | 161 | if not no_words: 162 | return ( 163 | "\nTotal Words: " 164 | + f"{shared.round_to_nearest_50(len(stripped_target_word_content))}" # stripped as in no punctuations 165 | "\nTotal Unique Words: " 166 | + f"{shared.round_to_nearest_50(total_unique_words)}" 167 | "\nTotal Characters: " 168 | + f"{shared.round_to_nearest_50(len(target_text_content))}" 169 | "\nTotal Unique Characters: " 170 | + f"{shared.round_to_nearest_50(total_unique_characters)}" 171 | + "\n\n=== HSK Breakdown ===\n" 172 | + tabulate(hsk_output, headers=["Level", "Count", "Cumulative Frequency"]) 173 | ) 174 | else: 175 | return ( 176 | "Total Characters: " 177 | + f"{shared.round_to_nearest_50(len(target_text_content))}" 178 | "\nTotal Unique Characters: " 179 | + f"{shared.round_to_nearest_50(total_unique_characters)}" 180 | ) 181 | 182 | if __name__ == "__main__": 183 | print(text_analyzer(args.known, args.target, args.output, args.exclude, args.no_words)) 184 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chinese-comprehension 2 | Analyze a Chinese text using your known words to gauge comprehension. 3 | 4 | # Requirements 5 | * Python 3.9 or below 6 | * [LAC](https://github.com/baidu/lac/) - Chinese character segmentation library 7 | 8 | ![image](https://user-images.githubusercontent.com/61620873/118395391-88917000-b64a-11eb-8341-3fe3a12aec27.png) 9 | ![image](https://user-images.githubusercontent.com/61620873/118395410-a65ed500-b64a-11eb-8d3c-5546e65b9d5a.png) 10 | ![image](https://user-images.githubusercontent.com/61620873/118395468-db6b2780-b64a-11eb-8d29-cc745228ea18.png) 11 | 12 | 13 | ## Features 14 | - Count unique words in text 15 | - Count unique unknown words in text 16 | - Calculate comprehension of text based on your known words 17 | - Calculate the above splitting text and known vocab word-by-word or character-by-character 18 | - Generate a breakdown of the text by HSK level 19 | - Exclude words such as proper nouns to improve comprehension accuracy 20 | - Output unknown words into a file, sorted by frequency 21 | - Add all words from book to known wordlist 22 | 23 | ## Installation 24 | *Non-technical explanation for those not familiar with programming.* 25 | 26 | Step 1: Download Python 3.9 or below. [Link](https://www.python.org/downloads/release/python-3912/) 27 | 28 | **Note program will not run on Python 3.10 or above** 29 | 30 | Step 2: Install Python 3.9. When installing you need to make sure you click the box on the bottom that says "install path variable". 31 | 32 | Step 3: After Python is installed click on the search bar and type cmd to start command prompt. This will pull up a window that is "Command Prompt" (Windows). 
33 | 
34 | If you are using Mac, simply open a terminal window by pressing cmd-space and then typing terminal and opening said app.
35 | 
36 | Step 4: Verify you have Python installed by typing `python --version`; it should tell you that you have Python 3.X.X installed. If the version says 2.X.X, try typing `python3 --version` instead, and use `python3 -m pip` instead of `pip` in future steps.
37 | 
38 | Step 5: Download the comprehension zip file [here](https://github.com/Destaq/chinese-comprehension) by clicking the green button and clicking download zip.
39 | 
40 | Step 6: Once the comprehension zip file is downloaded, you can extract it to wherever you want it to be.
41 | 
42 | Step 7: Open the command prompt/terminal window and navigate to the folder you extracted the comprehension zip file to. [How to navigate in the terminal tutorial](https://tutorials.codebar.io/command-line/introduction/tutorial.html). You can also navigate by dragging the folder to the command prompt.
43 | 
44 | Step 8: Type in `pip install -r requirements.txt` in the command line once you have navigated to the comprehension folder.
45 | 
46 | Step 9: You can now refer to the documentation on this page to use the tool!
47 | 
48 | Finally (technical), note that if you are using an **M1 Mac**, you will need to run everything via a virtual environment as this chip is not yet supported by LAC. This can be easily done with conda by following these [steps](https://github.com/conda-forge/miniforge/issues/165#issuecomment-860233092).
49 | 
50 | ## Usage
51 | ```
52 | usage: comprehension.py [-h] -k KNOWN -t TARGET [-m MODE] [-c] [-u UNKNOWN]
53 |                         [-e EXCLUDE]
54 | 
55 | Calculates percentage comprehension of a text file based on known words.
56 | 
57 | optional arguments:
58 |   -h, --help            show this help message and exit
59 |   -k KNOWN, --known KNOWN
60 |                         Relative path to .txt file with newline-separated known words.
61 |   -t TARGET, --target TARGET
62 |                         Relative path to .txt target file in Chinese.
63 |   -m MODE, --mode MODE  Mode for separating text and known vocab: 'smart' (default, word-by-word using LAC) or 'simple' (character-by-character)
64 |   -c, --characters      Add this flag (just -c, no extra info) if you know all the characters in your wordlist. This is due to a segmentation limitation. For ex. 慢慢的 is seen as one word; if this word is not in your wordlist,
65 |                         it will be unknown. By setting this flag (and having the characters 慢 and 的 in your wordlist, even as part of other words), 慢慢的 will be an 'understood' word.
66 |   -u UNKNOWN, --unknown UNKNOWN
67 |                         Path to output file with unknown words from text. Skip to not create an output file.
68 |   -e EXCLUDE, --exclude EXCLUDE
69 |                         Path to .txt file with newline-separated words to exclude (e.g. proper nouns)
70 | ```
71 | 
72 | The `--known` parameter takes the filename containing known words. For best accuracy, this file should contain all the words you know. Methods for fetching these words:
73 | - export from Anki
74 | - export from Pleco
75 | - take HSK test
76 | - consult [HelloChinese word list](https://docs.google.com/spreadsheets/d/1PppWybtv_ch5QMqtWlU4kAm08uFuhYK-6HGVnGeT63Y/edit#gid=121546596)
77 | 
78 | The file should have words separated line-by-line:
79 | ```
80 | 是
81 | 你好
82 | 再见
83 | 有
84 | 五
85 | ...
86 | ```
87 | 
88 | The `--target` parameter takes the filename containing the target text. This should be normally formatted:
89 | ```
90 | 美猴王一見,倒身下拜,磕頭不計其數,口中只道:「師父,師父,我弟子志心
91 | 朝禮,志心朝禮。」祖師道:「你是那方人氏?且說個鄉貫、姓名明白,再拜。」
92 | 猴王道:「弟子乃東勝神洲傲來國花果山水簾洞人氏。」祖師喝令:「趕出去!
93 | 他本是個撒詐搗虛之徒,那裏
94 | ...
95 | ```
96 | 
97 | The `-c` or `--characters` flag allows you to mark words which would otherwise be unknown as known, as long as you know all of the characters that make them up. Due to the way the word segmenter works, many words that learners are likely to know are graded separately, and thus would not be present in the `known.txt` file.
98 | 
99 | For example, say a learner knows the word `开心` and the particle `地`. Logically, they would be expected to understand the word `开心地`, or happily. However, because this word is parsed *standalone*, unless it is explicitly on the wordlist, it would be viewed as unknown. This behavior can be bypassed by setting the `-c` flag, ex. `python3 comprehension.py -k "known.txt" -t "myfile.pdf" -c`. Keep in mind that this method is also not perfect, because independent words made up of known characters may have differing meanings (e.g. 头发 - learners may know 头 and 发 but not the two in conjunction).
100 | 
101 | Quick side note: the `"` here are not required, but it's best to put them in anyway. If your filename has a space in it (for example `my known.txt`), then that will obviously mess with the command line, so this would be a reason to put speech marks around it. If you're not sure, just always put speech marks around the arguments.
102 | 
103 | It is advised to take comprehension measured with the `-c` flag with a grain of salt; depending on the difficulty of the text, the real level is likely to be some percentage points lower. But it is still far more accurate than without the flag.
104 | 
105 | `--mode` allows you to switch between 'simple' and 'smart' mode, where the default is 'smart' - segmenting text word-by-word (ex. 你/有/什么/名字/? for smart vs 你/有/什/么…… for simple).
106 | 
107 | `--unknown` allows you to create a file with all the unknown words in the text, in the format:
108 | ```
109 | Hanzi : Count
110 | Hanzi : Count
111 | ...
112 | ```
113 | 
114 | which is sorted by frequency. Ideal when preparing for a more difficult text or wanting to recap words. __This file has to be in .txt format__. Ex. `python3 comprehension.py -k "data/known.txt" -t "books/Earth_Vernes.pdf" -u "data/unknown_words.txt"`.
115 | 
116 | The `--exclude` parameter takes the filename containing words to exclude. Exclude any proper nouns such as character names & company names to improve accuracy.
117 | 
118 | The file should have words separated line-by-line:
119 | ```
120 | 安琪
121 | 赵宁一
122 | 爱丽丝
123 | 麦当劳
124 | ...
125 | ```
126 | 
127 | ### Example
128 | 
129 | *Code*: `python3 comprehension.py --known "known.txt" -t "samples/books/Great_Expectations.pdf" -u "output.txt"`
130 | *Description*: Gathers known words from `known.txt`, and analyzes `samples/books/Great_Expectations.pdf` using the default word-by-word splitting. Unknown words are outputted to `output.txt`.
131 | 
132 | *Content of `output.txt`*
133 | ```
134 | 道 : 4621
135 | 行者 : 2575
136 | 來 : 1665
137 | 裏 : 1591
138 | 與 : 1498
139 | 又 : 1485
140 | 卻 : 1264
141 | ...
142 | ```
143 | 
144 | # Analyzer
145 | 
146 | ## Usage
147 | ```
148 | usage: analyzer.py [-h] -t TARGET [-o OUTPUT] [-e EXCLUDE] [-n]
149 | 
150 | Calculate unique words and character count of a text file - result is rounded
151 | to nearest 50. Note that character counts may not line up with character counts
152 | seen in official webnovel figures, as this tool does not count punctuation.
153 | 
154 | optional arguments:
155 |   -h, --help            show this help message and exit
156 |   -t TARGET, --target TARGET
157 |                         Relative path to .txt or .pdf target file in Chinese.
158 |   -o OUTPUT, --output OUTPUT
159 |                         Path to output file with all words & characters from
160 |                         text. Skip to not create an output file.
161 |   -e EXCLUDE, --exclude EXCLUDE
162 |                         Path to .txt file with newline-separated words to
163 |                         exclude (e.g. proper nouns)
164 |   -n, --no-words
165 |                         Setting this flag means that the tool does not segment words, so you will not get a count of # of words, # of unique words, or an HSK breakdown. Can lead to a significant speedup, as segmentation takes approx. 1 minute per 1 million characters. Off by default. To set, simply add -n.
166 | ```
167 | 
168 | Finally, you can use the `--known` or `-k` argument to point to a file on your system with newline-separated words that you know. By doing so, and also outputting a file, any words/characters that you don't know will have a star symbol (*) next to them.
169 | 
170 | ### Example
171 | 
172 | *Code*: `python3 analyzer.py -t "samples/books/journey_to_the_west.txt" -o "output.txt"`
173 | *Description*: Analyzes `samples/books/journey_to_the_west.txt` using both word-by-word and character-by-character splitting. Outputs all characters and words to `output.txt`.
174 | 
175 | *Output*
176 | ```
177 | Total Unique Words: 32226
178 | Total Unique Characters: 3572
179 | ```
180 | 
181 | *Content of `output.txt`*
182 | ```
183 | === All Unique Words ===
184 | 的 : 18840
185 | 了 : 15791
186 | 昇 : 10683
187 | 周 : 9155
188 | 余皓 : 8995
189 | 我 : 7512
190 | ...
191 | 
192 | === All Unique Characters ===
193 | 的 : 19664 (4.2%)
194 | 了 : 17135 (5.6%)
195 | 余 : 14223 ...
196 | *皓 : 14103
197 | 一 : 12667
198 | 周 : 11641
199 | 昇 : 10756
200 | 不 : 10217
201 | 我 : 8723
202 | ```
203 | 
204 | Note the *. It is put next to words that are not known, if a knownfile is provided. Likewise, note the percentages: these are the cumulative frequency percentages for the text.
205 | Currently only possible if smart mode is selected.
206 | 
207 | # Vocab Adder
208 | The `vocab_adder` file is extremely simple. It allows you to input a file and your known vocab list, and will append all unknown words in the file to your vocab list.
209 | 
210 | Example:
211 | `python3 vocab_adder.py -t books/Earth_Vernes.pdf -k data/known.txt`
212 | 
213 | You can specify the mode (the default is smart, i.e. word segmentation) with the `-m` flag by typing `--mode simple`.
214 | 
215 | # FAQ
216 | 
217 | Q: How is punctuation counted?
218 | A: As is the industry standard, punctuation (periods, commas, etc.) *is* included in the total character count. However, it is not included in the unique character count nor in any output files.
219 | 
--------------------------------------------------------------------------------
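
For readers who want to see what that punctuation rule looks like in practice, here is a minimal, self-contained sketch. It is illustrative only — the toy text and the two-mark punctuation string below stand in for the much longer punctuation set the tools actually use — but it mirrors how `comprehension.py` and `analyzer.py` build a `Counter` over the text and then delete the punctuation keys before computing unique counts and writing output:

```python
# Illustrative sketch of the punctuation rule described in the FAQ.
# The toy text and two punctuation marks are placeholders; the real scripts
# use LAC segmentation and a far longer punctuation string.
from collections import Counter

text = "我爱你。你好!"                      # 7 characters, 2 of them punctuation
counted = Counter(text)                      # per-character frequency counts

total_characters = sum(counted.values())     # punctuation included -> 7

# Mirror comprehension.py / analyzer.py: drop punctuation keys from the Counter
for mark in "。!":
    del counted[mark]

unique_characters = len(counted)             # punctuation excluded -> 4
print(total_characters, unique_characters)   # prints: 7 4
```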