33 | """
34 | logger = logging.getLogger(__file__)
35 | logging_level = getattr(logging, verbosity_level)
36 | logger.setLevel(logging_level)
37 | formatter = logging.Formatter(
38 | fmt='%(asctime)s %(levelname)s %(filename)s: %(message)s')
39 | stdout_handler = logging.StreamHandler(sys.stdout)
40 | stdout_handler.setLevel(logging_level)
41 | stdout_handler.setFormatter(formatter)
42 | logger.addHandler(stdout_handler)
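    # With use_error_log, WARNING-and-above records are additionally
    # emitted on stderr (they still go to stdout via the handler above).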
    if use_error_log:
        stderr_handler = logging.StreamHandler(sys.stderr)
        stderr_handler.setLevel(logging.WARNING)
        stderr_handler.setFormatter(formatter)
        logger.addHandler(stderr_handler)
    logger.propagate = False
    return logger


LOGGER = get_logger(VERBOSITY_LEVEL)


def _here(*args):
    """Helper function for getting the current directory of the script."""
    here_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.abspath(join(here_dir, *args))


def _get_solution(solution_dir):
    """Get the solution array from the solution directory."""
    solution_file = join(solution_dir, SOLUTION_FILE)
    solution = pd.read_csv(solution_file, sep='\t')
    return solution


def _get_prediction(prediction_dir):
    """Read the 'label' column of the predictions file."""
    pred_file = join(prediction_dir, 'predictions')
    return pd.read_csv(pred_file)['label']


def _get_score(solution_dir, prediction_dir):
    """Compute the accuracy of the predictions against the solution."""
    LOGGER.info('===== get solution')
    solution = _get_solution(solution_dir)['label']
    LOGGER.info('===== read prediction')
    prediction = _get_prediction(prediction_dir)
    if solution.shape != prediction.shape:
        raise ValueError(f"Bad prediction shape: {prediction.shape}. "
                         f"Expected shape: {solution.shape}")

    LOGGER.info('===== calculate score')
    LOGGER.debug(f'solution shape = {solution.shape}')
    LOGGER.debug(f'prediction shape = {prediction.shape}')
    score = accuracy_score(solution, prediction)

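    # Build a per-label diagnostic table (label counts/ratios and error
    # counts/ratios); it is only logged at DEBUG level and does not
    # affect the returned score.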
    def get_df(counter, name):
        """Turn a counter into a one-column DataFrame indexed by sorted keys."""
        counter = dict(sorted(counter.items()))
        return pd.DataFrame({name: list(counter.values())},
                            index=list(counter.keys()))

    labels_count = Counter(solution)
    length = len(solution)
    labels = get_df(labels_count, "Label num")
    labels_ratio = get_df({k: labels_count[k] / length for k in labels_count},
                          "Label ratio")
    errors_count = Counter(solution[solution != prediction])
    errors = get_df(errors_count, "Error")
    errors_ratio = get_df({k: errors_count[k] / labels_count[k]
                           for k in errors_count}, "Error ratio")
    desc = labels.join(labels_ratio).join(errors).join(errors_ratio)
    LOGGER.debug(f"Desc:\n{desc}")

    return score


def _update_score(args, duration):
    """Compute the score and update scores.txt and the learning curve page."""
    score = _get_score(solution_dir=args.solution_dir,
                       prediction_dir=args.prediction_dir)
    # Update learning curve page (detailed_results.html)
    _write_scores_html(args.score_dir)
    # Write score
    LOGGER.info('===== write score')
    write_score(args.score_dir, score, duration)
    LOGGER.info(f"accuracy: {score:.4}")
    return score


def _init_scores_html(detailed_results_filepath):
    """Create detailed_results.html with a placeholder message."""
    # The page auto-refreshes every 5 seconds while ingestion runs.
    html_head = ('<html><head> <meta http-equiv="refresh" content="5"> '
                 '</head><body><pre>')
    html_end = '</pre></body></html>'
    with open(detailed_results_filepath, 'a') as html_file:
        html_file.write(html_head)
        html_file.write("Starting training process... <br> Please be patient. "
                        "Learning curves will be generated when first "
                        "predictions are made.")
        html_file.write(html_end)


def _write_scores_html(score_dir, auto_refresh=True, append=False):
    """(Re)write detailed_results.html in score_dir."""
    filename = 'detailed_results.html'
    if auto_refresh:
        html_head = ('<html><head> <meta http-equiv="refresh" content="5"> '
                     '</head><body><pre>')
    else:
        html_head = '<html><body><pre>'
    html_end = '</pre></body></html>'
    mode = 'a' if append else 'w'
    filepath = join(score_dir, filename)
    with open(filepath, mode) as html_file:
        html_file.write(html_head)
        html_file.write(html_end)
    LOGGER.debug(f"Wrote learning curve page to {filepath}")


def write_score(score_dir, score, duration):
    """Write score and duration to score_dir/scores.txt

    The file has two lines, e.g.:
        score: 0.9123
        Duration: 42.0
    """
    score_filename = join(score_dir, 'scores.txt')
    with open(score_filename, 'w') as ftmp:
        ftmp.write(f'score: {score}\n')
        ftmp.write(f'Duration: {duration}\n')
    LOGGER.debug(f"Wrote to score_filename={score_filename} with "
                 f"score={score}, duration={duration}")


class IngestionError(Exception):
    """Ingestion error."""


class ScoringError(Exception):
    """Scoring error."""


def get_ingestion_info(prediction_dir):
    """Read the ingestion information (e.g. ingestion_duration) from end.yaml."""
    ingestion_info = None
    endfile_path = os.path.join(prediction_dir, 'end.yaml')

    if not os.path.isfile(endfile_path):
        raise IngestionError("[-] No end.yaml exists, ingestion failed")

    LOGGER.info('===== Detected end.yaml file, get ingestion information')
    with open(endfile_path, 'r') as ftmp:
        ingestion_info = yaml.safe_load(ftmp)

    return ingestion_info


def get_ingestion_pid(prediction_dir):
    """Get the pid of the ingestion process."""
    # Wait up to 60 seconds for ingestion to start and write 'start.txt';
    # otherwise, raise an exception.
    wait_time = 60
    startfile = os.path.join(prediction_dir, 'start.txt')
    lockfile = os.path.join(prediction_dir, 'start.txt.lock')

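    # 'start.txt' is written by the ingestion program and contains its
    # pid; the lock file guards against reading the file mid-write.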
    for i in range(wait_time):
        if os.path.exists(startfile):
            with FileLock(lockfile):
                with open(startfile, 'r') as ftmp:
                    ingestion_pid = ftmp.read()
            LOGGER.info(
                f'Detected the start of ingestion after {i} seconds.')
            return int(ingestion_pid)
        else:
            time.sleep(1)
    raise IngestionError(f"[-] Failed: scoring didn't detect the start of "
                         f"ingestion after {wait_time} seconds.")


def is_process_alive(ingestion_pid):
    """Check whether the ingestion process is still alive."""
    try:
        os.kill(ingestion_pid, 0)
    except OSError:
        return False
    else:
        return True


def _parse_args():
    # Default I/O directories:
    root_dir = _here(os.pardir)
    default_solution_dir = join(root_dir, "sample_data")
    default_prediction_dir = join(root_dir, "sample_result_submission")
    default_score_dir = join(root_dir, "scoring_output")
    parser = argparse.ArgumentParser()
    parser.add_argument('--solution_dir', type=str,
                        default=default_solution_dir,
                        help=("Directory storing the solution with true "
                              "labels, e.g. adult.solution."))
    parser.add_argument('--prediction_dir', type=str,
                        default=default_prediction_dir,
                        help=("Directory storing the predictions. It should "
                              "contain e.g. [start.txt, adult.predict_0, "
                              "adult.predict_1, ..., end.yaml]."))
    parser.add_argument('--score_dir', type=str,
                        default=default_score_dir,
                        help=("Directory storing the scoring output, e.g. "
                              "`scores.txt` and `detailed_results.html`."))
    args = parser.parse_args()
    LOGGER.debug(f"Parsed args are: {args}")
    LOGGER.debug("-" * 50)
    LOGGER.debug(f"Using solution_dir: {args.solution_dir}")
    LOGGER.debug(f"Using prediction_dir: {args.prediction_dir}")
    LOGGER.debug(f"Using score_dir: {args.score_dir}")
    return args


def _init(args):
    if not os.path.isdir(args.score_dir):
        os.mkdir(args.score_dir)
    detailed_results_filepath = join(
        args.score_dir, 'detailed_results.html')
    # Initialize detailed_results.html
    _init_scores_html(detailed_results_filepath)


def _finalize(score, scoring_start):
    """Finalize the scoring and log the final score."""
    duration = time.time() - scoring_start
    LOGGER.info(
        "[+] Successfully finished scoring! "
        f"Scoring duration: {duration:.2} sec. "
        f"The score of your algorithm on the task is: {score:.6}.")

    LOGGER.info("[Scoring terminated]")


def main():
    """Main entry of the scoring program."""
    scoring_start = time.time()
    LOGGER.info('===== init scoring program')
    args = _parse_args()
    _init(args)
    score = DEFAULT_SCORE

    ingestion_pid = get_ingestion_pid(args.prediction_dir)

    LOGGER.info("===== wait for the exit of ingestion.")
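    # Poll the ingestion process once per second until it terminates.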
    while is_process_alive(ingestion_pid):
        time.sleep(1)

    # Compute/write score. The 'end.yaml' file written by the ingestion
    # program signals its end and carries the ingestion duration.
    ingestion_info = get_ingestion_info(args.prediction_dir)
    duration = ingestion_info['ingestion_duration']
    score = _update_score(args, duration)

    _finalize(score, scoring_start)


if __name__ == "__main__":
    main()