├── .gitignore
├── README.md
├── analyzer.py
├── archive.py
├── config.yaml
├── interactive.py
├── job.py
├── manager.py
├── plot_util.py
├── plotman.py
└── reporting.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
config.yaml
*.bak

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# `windows plotman`: an attempt to get plotman to work on Windows

THIS IS A BETA. Not ready for production use just yet; almost, but not quite there.

This is a tool for managing Chia plotting, based on Plotman (https://github.com/ericaltendorf/plotman).

This is a work in progress. Please see the original Linux version for additional info:

https://github.com/ericaltendorf/plotman

Please contact me at chia@ifhya.com, or as Wolfrage on Discord in the #chia channel (https://discord.gg/JESmva9R) - pop in and say hi!

## Known issues:

- Archiving hasn't been touched
- IO stats do not work on Windows
- Resizing the terminal messes up the curses display
- Common dir prefix abbreviation does not work, so the whole path is shown for temp and dest drives
- Various other issues

## Installation

Copy your `chia.exe` file to `chia2.exe` and use that for now! This avoids conflicts with plots started from the GUI or the command line; it will be adjusted once testing is complete.

This program requires `psutil`, `pyfakefs`, `texttable`, `windows-curses`, `pyreadline`, `pyyaml`, and `pypsutil`.

First, edit `manager.py` to hardcode the location of your `chia2.exe` (sorry, manual for now). Second, edit `config.yaml` for your settings. Third, provide feedback and help me with this, because I am soooo not a Python guy. I'm really, really, really not. Fourthly, send me pizza.

Run this command in Windows PowerShell: `python plotman.py interactive`

![image](https://user-images.githubusercontent.com/75458290/113492313-8c0ad680-94a4-11eb-93da-e93521dddde3.png)
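The dependencies can be installed with pip. A plausible one-liner (untested; this assumes the `pypsutil` entry above is covered by the `psutil` package):

```
pip install psutil pyfakefs texttable windows-curses pyreadline pyyaml
```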
--------------------------------------------------------------------------------
/analyzer.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import os
import re
import shutil
import statistics
import sys
import texttable as tt

import plot_util

class LogAnalyzer:
    # Map from key (e.g. logdir or the like) to (map from measurement name to list of values)
    all_measures = ['phase 1', 'phase 2', 'phase 3', 'phase 4', 'total time']

    def analyze(self, logfilenames):
        data = {}
        for logfilename in logfilenames:
            with open(logfilename, 'r') as f:
                key = 'x'  # TODO
                for line in f:
                    #
                    # Aggregation specification
                    #

                    # Starting phase 1/4: Forward Propagation into tmp files... Sun Nov 15 00:35:57 2020
                    # TODO: this only buckets by hour of day!!!
                    m = re.search(r'^Starting phase 1/4.*files.*\d\d (\d\d):\d\d:\d\d \d\d\d\d', line)
                    if m:
                        bucketsize = 2
                        hour = int(m.group(1))
                        hourbucket = int(hour / bucketsize)
                        # key += '-%02d-%02d' % (hourbucket * bucketsize, (hourbucket + 1) * bucketsize)

                    # Starting plotting progress into temporary dirs: /mnt/tmp/01 and /mnt/tmp/a
                    m = re.search(r'^Starting plotting.*dirs: (.*) and (.*)', line)
                    if False and m:
                        tmpdir = m.group(1)
                        # Hack to split data for backing hardware
                        tmpdir_idx = tmpdir[-2:]
                        if tmpdir_idx in ['00', '01']:
                            key += '-wd-raid'
                        if tmpdir_idx in ['02', '03', '04', '05']:
                            key += '-samsung'

                    #
                    # Data collection
                    #

                    # Time for phase 1 = 22796.7 seconds. CPU (98%) Tue Sep 29 17:57:19 2020
                    for phase in ['1', '2', '3', '4']:
                        m = re.search(r'^Time for phase ' + phase + r' = (\d+\.\d+) seconds\..*', line)
                        if m:
                            data.setdefault(key, {}).setdefault('phase ' + phase, []).append(float(m.group(1)))

                    # Total time = 49487.1 seconds. CPU (97.26%) Wed Sep 30 01:22:10 2020
                    m = re.search(r'^Total time = (\d+\.\d+) seconds.*', line)
                    if m:
                        data.setdefault(key, {}).setdefault('total time', []).append(float(m.group(1)))

        # Prepare report
        tab = tt.Texttable()
        headings = ['Key'] + self.all_measures
        tab.header(headings)

        # for logdir in logdirs:
        for key in data.keys():
            row = [key]
            for measure in self.all_measures:
                values = data.get(key, {}).get(measure, [])
                if len(values) > 1:
                    row.append('μ=%s σ=%s' % (
                        plot_util.human_format(statistics.mean(values), 1),
                        plot_util.human_format(statistics.stdev(values), 0)
                        ))
                elif len(values) == 1:
                    row.append(plot_util.human_format(values[0], 1))
                else:
                    row.append('N/A')
            tab.add_row(row)

        # os.popen('stty size') is not available on Windows; use shutil instead.
        (columns, rows) = shutil.get_terminal_size()
        tab.set_max_width(int(columns))
        s = tab.draw()
        print(s)

--------------------------------------------------------------------------------
/archive.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

from datetime import datetime
import subprocess
import argparse
import math
import os
import psutil
import re
import random
import sys

import texttable as tt

import manager
import plot_util

# TODO: write-protect and delete-protect archived plots

def compute_priority(phase, gb_free, n_plots):
    # All these values are designed around dst buffer dirs of about
    # ~2TB size and containing k32 plots.  TODO: generalize, and
    # rewrite as a sort function.

    priority = 50

    # To avoid concurrent IO, we should not touch drives that
    # are about to receive a new plot
    if (phase == (3, 4)):
        priority -= 4
    elif (phase == (3, 5)):
        priority -= 8
    elif (phase == (3, 6)):
        priority -= 16
    elif (phase >= (3, 7)):
        priority -= 32

    # If a drive is getting full, we should prioritize it
    if (gb_free < 1000):
        priority += 1 + int((1000 - gb_free) / 100)
    if (gb_free < 500):
        priority += 1 + int((500 - gb_free) / 100)

    # Finally, least importantly, pick drives with more plots
    # over those with fewer.
    priority += n_plots

    return priority

def get_archdir_freebytes(arch_cfg):
    archdir_freebytes = {}
    df_cmd = ('ssh %s@%s df -BK | grep " %s/"' %
              (arch_cfg['rsyncd_user'], arch_cfg['rsyncd_host'], arch_cfg['rsyncd_path']))
    with subprocess.Popen(df_cmd, shell=True, stdout=subprocess.PIPE) as proc:
        for line in proc.stdout.readlines():
            fields = line.split()
            freebytes = int(fields[3][:-1]) * 1024  # Strip the final 'K'
            archdir = (fields[5]).decode('ascii')
            archdir_freebytes[archdir] = freebytes
    return archdir_freebytes

def rsync_dest(arch_cfg, arch_dir):
    rsync_path = arch_dir.replace(arch_cfg['rsyncd_path'], arch_cfg['rsyncd_module'])
    if rsync_path.startswith('/'):
        rsync_path = rsync_path[1:]  # Avoid dup slashes.  TODO: use path join?
    rsync_url = 'rsync://%s@%s:12000/%s' % (
        arch_cfg['rsyncd_user'], arch_cfg['rsyncd_host'], rsync_path)
    return rsync_url

# TODO: maybe consolidate with similar code in job.py?
def get_running_archive_jobs(arch_cfg):
    '''Look for running rsync jobs that seem to match the pattern we use for
       archiving them.  Return a list of PIDs of matching jobs.'''
    jobs = []
    dest = rsync_dest(arch_cfg, '/')
    for proc in psutil.process_iter(['pid', 'name']):
        if proc.name() == 'rsync':
            args = proc.cmdline()
            for arg in args:
                if arg.startswith(dest):
                    jobs.append(proc.pid)
    return jobs

def archive(dir_cfg, all_jobs):
    '''Configure one archive job.  Needs to know all jobs so it can avoid IO
       contention on the plotting dstdir drives.  Returns either (False, <reason>)
       if we should not execute an archive job or (True, <cmd>) with the archive
       command if we should.'''

    dstdirs = dir_cfg['dst']
    arch_cfg = dir_cfg['archive']

    dir2ph = manager.dstdirs_to_furthest_phase(all_jobs)
    best_priority = -100000000
    chosen_plot = None

    for d in dstdirs:
        ph = dir2ph.get(d, (0, 0))
        dir_plots = plot_util.list_k32_plots(d)
        gb_free = plot_util.df_b(d) / plot_util.GB
        n_plots = len(dir_plots)
        priority = compute_priority(ph, gb_free, n_plots)
        if priority >= best_priority and dir_plots:
            best_priority = priority
            chosen_plot = dir_plots[0]

    if not chosen_plot:
        return (False, 'No plots found')

    # TODO: sanity check that archive machine is available
    # TODO: filter drives mounted RO

    #
    # Pick first archive dir with sufficient space
    #
    archdir_freebytes = get_archdir_freebytes(arch_cfg)
    if not archdir_freebytes:
        return (False, 'No free archive dirs found.')

    archdir = ''
    for (d, space) in sorted(archdir_freebytes.items()):
        # TODO: make buffer configurable
        if space > 1.2 * plot_util.get_k32_plotsize():  # Leave a little buffer
            archdir = d
            freespace = space
            break

    if not archdir:
        return (False, 'No archive directories found with enough free space')

    msg = 'Found %s with ~%d GB free' % (archdir, freespace / plot_util.GB)

    bwlimit = arch_cfg['rsyncd_bwlimit']
    throttle_arg = ('--bwlimit=%d' % bwlimit) if bwlimit else ''
    cmd = ('rsync %s --remove-source-files -P %s %s' %
           (throttle_arg, chosen_plot, rsync_dest(arch_cfg, archdir)))

    return (True, cmd)

--------------------------------------------------------------------------------
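A quick sketch of how compute_priority() in archive.py above ranks dst drives, run from a Python shell in the plotman directory (all numbers are made up for illustration):

    import archive

    # Drive A: a plot job is in phase 3:6 (about to land a plot here),
    # 800 GB free, 10 plots already present.
    a = archive.compute_priority((3, 6), 800, 10)    # 50 - 16 + 3 + 10 = 47
    # Drive B: no inbound job, 1500 GB free, 4 plots present.
    b = archive.compute_priority((0, 0), 1500, 4)    # 50 + 4 = 54
    assert b > a    # drive B gets archived from first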
/config.yaml:
--------------------------------------------------------------------------------
# Where to plot and log.
directories:
  # One directory in which to store all plot job logs (the STDOUT/
  # STDERR of all plot jobs).  In order to monitor progress, plotman
  # reads these logs on a regular basis, so using a fast drive is
  # recommended.
  log: C:\chia

  # One or more directories to use as tmp dirs for plotting.  The
  # scheduler will use all of them and distribute jobs among them.
  # It assumes that IO is independent for each one (i.e., that each
  # one is on a different physical device).
  #
  # If multiple directories share a common prefix, reports will
  # abbreviate and show just the uniquely identifying suffix.
  tmp:
    - e:\1
    - h:\1
    - e:\2
    - i:\1
    - e:\3
    - h:\2
    - i:\2
    - e:\4

  # Optional: tmp2 directory.  If specified, will be passed to
  # chia plots create as -2.  Only one tmp2 directory is supported.
  # tmp2: /mnt/tmp/a

  # One or more directories; the scheduler will use all of them.
  # These again are presumed to be on independent physical devices,
  # so writes (plot jobs) and reads (archivals) can be scheduled
  # to minimize IO contention.
  dst:
    - z:\
    - f:\

# Archival configuration.  Optional; if you do not wish to run the
# archiving operation, comment this section out.
#
# Currently archival depends on an rsync daemon running on the remote
# host, and that the module is configured to match the local path.
# See code for details.
# archive:
#   rsyncd_module: plots
#   rsyncd_path: /plots
#   rsyncd_bwlimit: 80000  # Bandwidth limit in KB/s
#   rsyncd_host: myfarmer
#   rsyncd_user: chia

# Plotting scheduling parameters
scheduling:
  # Don't run a job on a particular temp dir until all existing jobs
  # have progressed at least this far.  Phase major corresponds to the
  # plot phase, phase minor corresponds to the table or table pair
  # in sequence.
  tmpdir_stagger_phase_major: 2
  tmpdir_stagger_phase_minor: 1

  # Don't run more than this many jobs at a time on a single temp dir.
  tmpdir_max_jobs: 1

  # Don't run any jobs (across all temp dirs) more often than this,
  # in minutes.
  global_stagger_m: 35

  # How often the daemon wakes to consider starting a new plot job,
  # in seconds.
  polling_time_s: 5

# Plotting parameters.  These are pass-through parameters to chia plots create.
# See documentation at
# https://github.com/Chia-Network/chia-blockchain/wiki/CLI-Commands-Reference#create
plotting:
  k: 32
  e: False          # Use -e plotting option
  n_threads: 3      # Threads per job
  n_buckets: 128    # Number of buckets to split data into
  job_buffer: 6500  # Per job memory, in MiB

--------------------------------------------------------------------------------
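A minimal sketch of how these settings are consumed (this mirrors the loading code in plotman.py and manager.py):

    import yaml

    with open('config.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    sched_cfg = cfg['scheduling']
    stagger_s = int(sched_cfg['global_stagger_m']) * 60  # 35 min -> 2100 s between job starts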
/interactive.py:
--------------------------------------------------------------------------------
import curses
import datetime
import locale
import os
import subprocess
import threading
import yaml

#from unicurses import *

from job import Job
import archive
import manager
import reporting

class Log:
    def __init__(self):
        self.entries = []
        self.cur_pos = 0

    # TODO: store timestamp as actual timestamp indexing the messages
    def log(self, msg):
        '''Log the message and scroll to the end of the log'''
        ts = datetime.datetime.now().strftime('%m-%d %H:%M:%S')
        self.entries.append(ts + ' ' + msg)
        self.cur_pos = len(self.entries)

    def tail(self, num_entries):
        '''Return the entries at the end of the log.  Consider cur_slice() instead.'''
        return self.entries[-num_entries:]

    def shift_slice(self, offset):
        '''Positive shifts towards end, negative shifts towards beginning'''
        self.cur_pos = max(0, min(len(self.entries), self.cur_pos + offset))

    def shift_slice_to_end(self):
        self.cur_pos = len(self.entries)

    def get_cur_pos(self):
        return self.cur_pos

    def cur_slice(self, num_entries):
        '''Return num_entries log entries up to the current slice position'''
        return self.entries[max(0, self.cur_pos - num_entries) : self.cur_pos]

    def fill_log(self):
        '''Add a bunch of stuff to the log.  Useful for testing.'''
        for i in range(100):
            self.log('Log line %d' % i)

def plotting_status_msg(active, status):
    if active:
        return '(active) ' + status
    else:
        return '(inactive) ' + status

def archiving_status_msg(configured, active, status):
    if configured:
        if active:
            return '(active) ' + status
        else:
            return '(inactive) ' + status
    else:
        return '(not configured)'

def curses_main(stdscr):
    # TODO: figure out how to pass the configs in from plotman.py instead of
    # duplicating the code here.
    with open('config.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    dir_cfg = cfg['directories']
    sched_cfg = cfg['scheduling']
    plotting_cfg = cfg['plotting']

    log = Log()

    plotting_active = True
    archiving_configured = 'archive' in dir_cfg
    archiving_active = archiving_configured

    (n_rows, n_cols) = map(int, stdscr.getmaxyx())

    # Page layout.  Currently requires at least ~40 rows.
    # TODO: make everything dynamically figure to best use available space
    header_height = 3
    jobs_height = 10
    dirs_height = 14
    logscreen_height = n_rows - (header_height + jobs_height + dirs_height)

    header_pos = 0
    jobs_pos = header_pos + header_height
    dirs_pos = jobs_pos + jobs_height
    logscreen_pos = dirs_pos + dirs_height

    plotting_status = ''   # TODO: rename these to _msg?
    archiving_status = ''

    refresh_period = int(sched_cfg['polling_time_s'])

    stdscr.nodelay(True)  # make getch() non-blocking
    stdscr.timeout(2000)

    header_win = curses.newwin(header_height, n_cols, header_pos, 0)
    log_win = curses.newwin(logscreen_height, n_cols, logscreen_pos, 0)
    jobs_win = curses.newwin(jobs_height, n_cols, jobs_pos, 0)
    dirs_win = curses.newwin(dirs_height, n_cols, dirs_pos, 0)

    jobs = Job.get_running_jobs(dir_cfg['log'])
    last_refresh = datetime.datetime.now()

    pressed_key = ''  # For debugging

    while True:

        # TODO: handle resizing.  Need to (1) figure out how to reliably get
        # the terminal size -- the recommended method doesn't seem to work:
        #     (n_rows, n_cols) = [int(v) for v in stdscr.getmaxyx()]
        # Consider instead:
        #     ...[int(v) for v in os.popen('stty size', 'r').read().split()]
        # and then (2) implement the logic to resize all the subwindows as above

        # stdscr.clear()
        linecap = n_cols - 1
        logscreen_height = n_rows - (header_height + jobs_height + dirs_height)

        elapsed = (datetime.datetime.now() - last_refresh).total_seconds()

        # A full refresh scans for and reads info for running jobs from
        # scratch (i.e., reread their logfiles).  Otherwise we'll only
        # initialize new jobs, and mostly rely on cached info.
        do_full_refresh = elapsed >= refresh_period

        if not do_full_refresh:
            jobs = Job.get_running_jobs(dir_cfg['log'], cached_jobs=jobs)
        else:
            last_refresh = datetime.datetime.now()
            jobs = Job.get_running_jobs(dir_cfg['log'])

            if plotting_active:
                (started, msg) = manager.maybe_start_new_plot(dir_cfg, sched_cfg, plotting_cfg)
                if started:
                    log.log(msg)
                    plotting_status = ''
                    jobs = Job.get_running_jobs(dir_cfg['log'], cached_jobs=jobs)
                else:
                    plotting_status = msg

            if archiving_configured and archiving_active:
                # Look for running archive jobs.  Be robust to finding more than one
                # even though the scheduler should only run one at a time.
                arch_jobs = archive.get_running_archive_jobs(dir_cfg['archive'])
                if arch_jobs:
                    archiving_status = 'pid: ' + ', '.join(map(str, arch_jobs))
                else:
                    (should_start, status_or_cmd) = archive.archive(dir_cfg, jobs)
                    if not should_start:
                        archiving_status = status_or_cmd
                    else:
                        cmd = status_or_cmd
                        log.log('Starting archive: ' + cmd)

                        # TODO: do something useful with output instead of DEVNULL
                        p = subprocess.Popen(cmd,
                                shell=True,
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.STDOUT,
                                start_new_session=True)

        # Directory prefixes, for abbreviation
        tmp_prefix = ''  # os.path.commonpath(dir_cfg['tmp'])
        dst_prefix = ''  # os.path.commonpath(dir_cfg['dst'])
        if archiving_configured:
            arch_prefix = dir_cfg['archive']['rsyncd_path']

        # Header
        header_win.addnstr(0, 0, 'Plotman', linecap, curses.A_BOLD)
        timestamp = datetime.datetime.now().strftime("%H:%M:%S")
        refresh_msg = "now" if do_full_refresh else f"{int(elapsed)}s/{refresh_period}"
        header_win.addnstr(f" {timestamp} (refresh {refresh_msg})", linecap)

        header_win.addnstr(' |  <P>lotting: ', linecap, curses.A_BOLD)
        header_win.addnstr(
                plotting_status_msg(plotting_active, plotting_status), linecap)
        header_win.addnstr(' <A>rchival: ', linecap, curses.A_BOLD)
        header_win.addnstr(
                archiving_status_msg(archiving_configured,
                    archiving_active, archiving_status), linecap)

        # Oneliner progress display
        header_win.addnstr(1, 0, 'Jobs (%d): ' % len(jobs), linecap)
        header_win.addnstr('[' + reporting.job_viz(jobs) + ']', linecap)

        # These are useful for debugging.
        # header_win.addnstr(' term size: (%d, %d)' % (n_rows, n_cols), linecap)
        # if pressed_key:
        #     header_win.addnstr(' (keypress %s)' % str(pressed_key), linecap)

        header_win.addnstr(2, 0, 'Prefixes:', linecap, curses.A_BOLD)
        header_win.addnstr(' tmp=', linecap, curses.A_BOLD)
        header_win.addnstr(tmp_prefix, linecap)
        header_win.addnstr(' dst=', linecap, curses.A_BOLD)
        header_win.addnstr(dst_prefix, linecap)
        if archiving_configured:
            header_win.addnstr(' archive=', linecap, curses.A_BOLD)
            header_win.addnstr(arch_prefix, linecap)
            header_win.addnstr(' (remote)', linecap)

        # Jobs
        jobs_win.addstr(0, 0, reporting.status_report(jobs, n_cols, jobs_height,
                tmp_prefix, dst_prefix))
        jobs_win.chgat(0, 0, curses.A_REVERSE)

        # Dirs.  Collect reports as strings, then lay out.
        n_tmpdirs = len(dir_cfg['tmp'])
        n_tmpdirs_half = int(n_tmpdirs / 2)
        tmp_report_1 = reporting.tmp_dir_report(
            jobs, dir_cfg['tmp'], sched_cfg, n_cols, 0, n_tmpdirs_half, tmp_prefix)
        tmp_report_2 = reporting.tmp_dir_report(
            jobs, dir_cfg['tmp'], sched_cfg, n_cols, n_tmpdirs_half, n_tmpdirs, tmp_prefix)

        dst_report = reporting.dst_dir_report(
            jobs, dir_cfg['dst'], n_cols, dst_prefix)

        if archiving_configured:
            arch_report = reporting.arch_dir_report(
                archive.get_archdir_freebytes(dir_cfg['archive']), n_cols, arch_prefix)
            if not arch_report:
                arch_report = ''
        else:
            arch_report = ''

        tmp_h = max(len(tmp_report_1.splitlines()),
                    len(tmp_report_2.splitlines()))
        tmp_w = len(max(tmp_report_1.splitlines() +
                        tmp_report_2.splitlines(), key=len)) + 1
        dst_h = len(dst_report.splitlines())
        dst_w = len(max(dst_report.splitlines(), key=len)) + 1
        arch_h = len(arch_report.splitlines()) + 1
        arch_w = n_cols

        tmpwin_12_gutter = 3
        tmpwin_dstwin_gutter = 6

        maxtd_h = max([tmp_h, dst_h])

        tmpwin_1 = curses.newwin(
                tmp_h, tmp_w,
                dirs_pos + int((maxtd_h - tmp_h) / 2), 0)
        tmpwin_1.addstr(tmp_report_1)

        tmpwin_2 = curses.newwin(
                tmp_h, tmp_w,
                dirs_pos + int((maxtd_h - tmp_h) / 2),
                tmp_w + tmpwin_12_gutter)
        tmpwin_2.addstr(tmp_report_2)

        tmpwin_1.chgat(0, 0, curses.A_REVERSE)
        tmpwin_2.chgat(0, 0, curses.A_REVERSE)

        dstwin = curses.newwin(
                dst_h, dst_w,
                dirs_pos + int((maxtd_h - dst_h) / 2),
                2 * tmp_w + tmpwin_12_gutter + tmpwin_dstwin_gutter)
        dstwin.addstr(dst_report)
        dstwin.chgat(0, 0, curses.A_REVERSE)

        #archwin = curses.newwin(arch_h, arch_w, dirs_pos + maxtd_h, 0)
        #archwin.addstr(0, 0, 'Archive dirs free space', curses.A_REVERSE)
        #archwin.addstr(1, 0, arch_report)

        # Log.  Could use a pad here instead of managing scrolling ourselves, but
        # this seems easier.
        log_win.addnstr(0, 0, ('Log: %d (<up>/<down> to scroll)\n' % log.get_cur_pos()),
                linecap, curses.A_REVERSE)
        for i, logline in enumerate(log.cur_slice(logscreen_height - 1)):
            log_win.addnstr(i + 1, 0, logline, linecap)

        stdscr.noutrefresh()
        header_win.noutrefresh()
        jobs_win.noutrefresh()
        tmpwin_1.noutrefresh()
        tmpwin_2.noutrefresh()
        dstwin.noutrefresh()
        #archwin.noutrefresh()
        log_win.noutrefresh()
        curses.doupdate()

        key = stdscr.getch()
        if key == curses.KEY_UP:
            log.shift_slice(-1)
            pressed_key = 'up'
        elif key == curses.KEY_DOWN:
            log.shift_slice(1)
            pressed_key = 'dwn'
        elif key == curses.KEY_END:
            log.shift_slice_to_end()
            pressed_key = 'end'
        elif key == ord('p'):
            plotting_active = not plotting_active
            pressed_key = 'p'
        elif key == ord('a'):
            archiving_active = not archiving_active
            pressed_key = 'a'
        elif key == ord('q'):
            break
        else:
            pressed_key = key


def run_interactive():
    locale.setlocale(locale.LC_ALL, '')
    code = locale.getpreferredencoding()
    # Then use code as the encoding for str.encode() calls.

    curses.wrapper(curses_main)

--------------------------------------------------------------------------------
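One possible answer to the resize TODO in curses_main() above: shutil.get_terminal_size() (already used by plotman.py) works on Windows, unlike the stty approach suggested in the comment. A minimal sketch, not wired in:

    import shutil

    (n_cols, n_rows) = shutil.get_terminal_size()  # columns first, then lines
    # ...then recreate the subwindows with the new n_rows/n_cols as above.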
/job.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

# TODO do we use all these?
from datetime import datetime
from enum import Enum, auto
from subprocess import call
import argparse

import contextlib
import logging
import os
import re
import threading
import time
import psutil
import random
import sys

class UnmatchedJobError(Exception):
    pass

def job_phases_for_tmpdir(d, all_jobs):
    '''Return phase 2-tuples for jobs running on tmpdir d'''
    return sorted([j.progress() for j in all_jobs if j.tmpdir == d])

def job_phases_for_dstdir(d, all_jobs):
    '''Return phase 2-tuples for jobs outputting to dstdir d'''
    return sorted([j.progress() for j in all_jobs if j.dstdir == d])

def is_plotting_cmdline(cmdline):
    return (
        len(cmdline) >= 4
        #and 'python' in cmdline[0]
        and 'chia2.exe' in cmdline[0]
        and 'plots' == cmdline[1]
        and 'create' == cmdline[2]
    )

# TODO: be more principled and explicit about what we cache vs. what we look up
# dynamically from the logfile
class Job:
    'Represents a plotter job'

    # These are constants, not updated during a run.
    k = 0
    r = 0
    u = 0
    b = 0
    n = 0  # probably not used
    tmpdir = ''
    tmp2dir = ''
    dstdir = ''
    logfile = ''
    jobfile = ''
    job_id = 0
    plot_id = '--------'
    proc = None   # will get a psutil.Process

    # These are dynamic, cached, and need to be updated periodically
    phase = (None, None)   # Phase/subphase

    def get_running_jobs(logroot, cached_jobs=()):
        '''Return a list of running plot jobs.  If a cache of preexisting jobs is
           provided, reuse those previous jobs without updating their information.
           Always look for new jobs not already in the cache.'''
        jobs = []
        cached_jobs_by_pid = { j.proc.pid: j for j in cached_jobs }

        for proc in psutil.process_iter(['pid', 'cmdline']):
            try:
                # Ignore processes which most likely have terminated between the time of
                # iteration and data access.
                with contextlib.suppress(psutil.NoSuchProcess):
                    if is_plotting_cmdline(proc.cmdline()):
                        if proc.pid in cached_jobs_by_pid.keys():
                            jobs.append(cached_jobs_by_pid[proc.pid])  # Copy from cache
                        else:
                            with contextlib.suppress(UnmatchedJobError):
                                jobs.append(Job(proc, logroot))
            except (PermissionError, psutil.AccessDenied):
                # Permission error or access denied on the process; skip it.
                pass
        return jobs


    def __init__(self, proc, logroot):
        '''Initialize from an existing psutil.Process object.  Must know logroot
           in order to understand open files.'''
        self.proc = proc

        with self.proc.oneshot():
            # Parse command line args
            args = self.proc.cmdline()
            assert len(args) > 4
            assert 'chia2.exe' in args[0]
            assert 'plots' == args[1]
            assert 'create' == args[2]
            args_iter = iter(args[3:])
            for arg in args_iter:
                val = None if arg in ['-e'] else next(args_iter)
                if arg == '-k':
                    self.k = val
                elif arg == '-r':
                    self.r = val
                elif arg == '-b':
                    self.b = val
                elif arg == '-u':
                    self.u = val
                elif arg == '-t':
                    self.tmpdir = val
                elif arg == '-2':
                    self.tmp2dir = val
                elif arg == '-d':
                    self.dstdir = val
                elif arg == '-n':
                    self.n = val
                elif arg == '-e':
                    pass
                    # TODO: keep track of -e
                else:
                    print('Warning: unrecognized args: %s %s' % (arg, val))

            # Find logfile (whatever file is open under the log root).  The
            # file may be open more than once, e.g. for STDOUT and STDERR.
            for f in self.proc.open_files():
                if logroot in f.path:
                    if self.logfile:
                        assert self.logfile == f.path
                    else:
                        self.logfile = f.path
                    break

            # Initialize data that needs to be loaded from the logfile
            self.init_from_logfile()


    def init_from_logfile(self):
        '''Read plot ID and job start time from the logfile.  Raises
           UnmatchedJobError if the job has no logfile; if the log is not fully
           written yet, falls back to the logfile ctime for the start time.'''
        if not self.logfile:
            raise UnmatchedJobError()
        # Try reading for a while; it can take a while for the job to get started as it scans
        # existing plot dirs (especially if they are NFS).
        found_id = False
        found_log = False
        for attempt_number in range(3):
            with open(self.logfile, 'r') as f:
                for line in f:
                    m = re.match('^ID: ([0-9a-f]*)', line)
                    if m:
                        self.plot_id = m.group(1)
                        found_id = True
                    m = re.match(r'^Starting phase 1/4:.*\.\.\. (.*)', line)
                    if m:
                        # Mon Nov 2 08:39:53 2020
                        self.start_time = datetime.strptime(m.group(1), '%a %b %d %H:%M:%S %Y')
                        found_log = True
                        break  # Stop reading lines in file

            if found_id and found_log:
                break  # Stop trying
            else:
                time.sleep(1)  # Sleep and try again

        # If we couldn't find the line in the logfile, the job is probably just getting started
        # (and being slow about it).  In this case, use the last metadata change as the start time.
        # TODO: we never come back to this; e.g. plot_id may remain uninitialized.
        if not found_log:
            self.start_time = datetime.fromtimestamp(os.path.getctime(self.logfile))

        # Load things from logfile that are dynamic
        self.update_from_logfile()

    def update_from_logfile(self):
        self.set_phase_from_logfile()

    def set_phase_from_logfile(self):
        assert self.logfile

        # Map from phase number to subphase number reached in that phase.
        # Phase 1 subphases are <started>, table1, table2, ...
        # Phase 2 subphases are <started>, table7, table6, ...
        # Phase 3 subphases are <started>, tables1&2, tables2&3, ...
        # Phase 4 subphases are <started>
        phase_subphases = {}

        with open(self.logfile, 'r') as f:
            for line in f:
                # "Starting phase 1/4: Forward Propagation into tmp files... Sat Oct 31 11:27:04 2020"
                m = re.match(r'^Starting phase (\d).*', line)
                if m:
                    phase = int(m.group(1))
                    phase_subphases[phase] = 0

                # Phase 1: "Computing table 2"
                m = re.match(r'^Computing table (\d).*', line)
                if m:
                    phase_subphases[1] = max(phase_subphases[1], int(m.group(1)))

                # Phase 2: "Backpropagating on table 2"
                m = re.match(r'^Backpropagating on table (\d).*', line)
                if m:
                    phase_subphases[2] = max(phase_subphases[2], 7 - int(m.group(1)))

                # Phase 3: "Compressing tables 4 and 5"
                m = re.match(r'^Compressing tables (\d) and (\d).*', line)
                if m:
                    phase_subphases[3] = max(phase_subphases[3], int(m.group(1)))

                # TODO also collect timing info:

                # "Time for phase 1 = 22796.7 seconds. CPU (98%) Tue Sep 29 17:57:19 2020"
                # for phase in ['1', '2', '3', '4']:
                #     m = re.match(r'^Time for phase ' + phase + ' = (\d+.\d+) seconds..*', line)
                #     data.setdefault....

                # "Total time = 49487.1 seconds. CPU (97.26%) Wed Sep 30 01:22:10 2020"
                # m = re.match(r'^Total time = (\d+.\d+) seconds.*', line)
                # if m:
                #     data.setdefault(key, {}).setdefault('total time', []).append(float(m.group(1)))

        if phase_subphases:
            phase = max(phase_subphases.keys())
            self.phase = (phase, phase_subphases[phase])
        else:
            self.phase = (0, 0)

    def progress(self):
        '''Return a 2-tuple with the job phase and subphase (by reading the logfile)'''
        return self.phase

    def plot_id_prefix(self):
        return self.plot_id[:8]

    # TODO: make this more useful and complete, and/or make it configurable
    def status_str_long(self):
        return '{plot_id}\nk={k} r={r} b={b} u={u}\npid:{pid}\ntmp:{tmp}\ntmp2:{tmp2}\ndst:{dst}\nlogfile:{logfile}'.format(
            plot_id = self.plot_id,
            k = self.k,
            r = self.r,
            b = self.b,
            u = self.u,
            pid = self.proc.pid,
            tmp = self.tmpdir,
            tmp2 = self.tmp2dir,
            dst = self.dstdir,
            logfile = self.logfile
            )

    def get_mem_usage(self):
        return self.proc.memory_info().vms  # Total, inc swapped

    def get_tmp_usage(self):
        total_bytes = 0
        with os.scandir(self.tmpdir) as it:
            for entry in it:
                if self.plot_id in entry.name:
                    try:
                        total_bytes += entry.stat().st_size
                    except FileNotFoundError:
                        # The file might disappear; this being an estimate we don't care
                        pass
        return total_bytes

    def get_run_status(self):
        '''Running, suspended, etc.'''
        status = self.proc.status()
        if status == psutil.STATUS_RUNNING:
            return 'RUN'
        elif status == psutil.STATUS_SLEEPING:
            return 'SLP'
        elif status == psutil.STATUS_DISK_SLEEP:
            return 'DSK'
        elif status == psutil.STATUS_STOPPED:
            return 'STP'
        else:
            return self.proc.status()

    def get_time_wall(self):
        return int((datetime.now() - self.start_time).total_seconds())

    def get_time_user(self):
        return int(self.proc.cpu_times().user)

    def get_time_sys(self):
        return int(self.proc.cpu_times().system)

    def get_time_iowait(self):
        # psutil does not report iowait on Windows, so just report zero.
        return 0
        # return int(self.proc.cpu_times().iowait)

    def suspend(self, reason=''):
        self.proc.suspend()
        self.status_note = reason

    def resume(self):
        self.proc.resume()

    def get_temp_files(self):
        temp_files = []
        for f in self.proc.open_files():
            if self.tmpdir in f.path or self.tmp2dir in f.path or self.dstdir in f.path:
                temp_files.append(f.path)
        return temp_files

    def cancel(self):
        'Cancel an already running job'
        # We typically suspend the job as the first action in killing it, so it
        # doesn't create more tmp files during death.  However, terminate() won't
        # complete if the job is suspended, so we also need to resume it.
        # TODO: check that this is best practice for killing a job.
        self.proc.resume()
        self.proc.terminate()

    def check_status(self, expected_status):
        if self.get_run_status() == expected_status:
            return 1
        else:
            print('Expected status %s, actual %s' % (expected_status, self.get_run_status()))
            return 0

--------------------------------------------------------------------------------
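Note that phases are plain (major, minor) tuples throughout, so ordinary Python tuple ordering drives every comparison the scheduler and archiver make. A quick illustration:

    assert (2, 1) < (3, 6) < (3, 7) < (4, 0)
    assert sorted([(3, 6), (1, 2), (2, 1)])[0] == (1, 2)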
/manager.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

from datetime import datetime

import logging
import operator
import os
import re
import threading
import time
import psutil
import random
import readline  # For nice CLI
import subprocess
import sys
from pathlib import Path, PureWindowsPath

# Plotman libraries
import job
import plot_util
import archive  # for get_archdir_freebytes().  TODO: move to avoid import loop

# Constants
MIN = 60    # Seconds
HR = 3600   # Seconds

MAX_AGE = 1000_000_000  # Arbitrary large number of seconds


def dstdirs_to_furthest_phase(all_jobs):
    '''Return a map from dst dir to a phase tuple for the most progressed job
       that is emitting to that dst dir.'''
    result = {}
    for j in all_jobs:
        if not j.dstdir in result.keys() or result[j.dstdir] < j.progress():
            result[j.dstdir] = j.progress()
    return result

def dstdirs_to_youngest_phase(all_jobs):
    '''Return a map from dst dir to a phase tuple for the least progressed job
       that is emitting to that dst dir.'''
    result = {}
    for j in all_jobs:
        if not j.dstdir in result.keys() or result[j.dstdir] > j.progress():
            result[j.dstdir] = j.progress()
    return result

def phases_permit_new_job(phases, sched_cfg):
    '''Scheduling logic: return True if it's OK to start a new job on a tmp dir
       with existing jobs in the provided phases.'''
    if len(phases) == 0:
        return True

    milestone_1 = ( sched_cfg['tmpdir_stagger_phase_major'],
                    sched_cfg['tmpdir_stagger_phase_minor'] )
    # milestone_2 = (4, 0)

    if len([p for p in phases if p < milestone_1]) > 0:
        return False

    # if len([p for p in phases if milestone_1 <= p and p < milestone_2]) > 1:
    #     return False

    # No more than tmpdir_max_jobs jobs total on the tmpdir
    if len(phases) >= sched_cfg['tmpdir_max_jobs']:
        return False

    return True

def maybe_start_new_plot(dir_cfg, sched_cfg, plotting_cfg):
    '''Returns (True, logmsg) if a plot job was started, or (False, wait_reason)
       if not.'''
    jobs = job.Job.get_running_jobs(dir_cfg['log'])

    wait_reason = None  # If we don't start a job this iteration, this says why.

    youngest_job_age = min(jobs, key=job.Job.get_time_wall).get_time_wall() if jobs else MAX_AGE
    global_stagger = int(sched_cfg['global_stagger_m'] * MIN)
    if (youngest_job_age < global_stagger):
        wait_reason = 'stagger (%ds/%ds)' % (
                youngest_job_age, global_stagger)
    else:
        tmp_to_all_phases = [ (d, job.job_phases_for_tmpdir(d, jobs))
                for d in dir_cfg['tmp'] ]
        eligible = [ (d, phases) for (d, phases) in tmp_to_all_phases
                if phases_permit_new_job(phases, sched_cfg) ]
        rankable = [ (d, phases[0]) if phases else (d, (999, 999))
                for (d, phases) in eligible ]

        if not eligible:
            wait_reason = 'no eligible tempdirs'
        else:
            # Plot to the most-progressed (i.e. oldest) eligible tmpdir.
            tmpdir = max(rankable, key=operator.itemgetter(1))[0]

            # Select the dst dir least recently selected
            dir2ph = dstdirs_to_youngest_phase(jobs)
            unused_dirs = [d for d in dir_cfg['dst'] if d not in dir2ph.keys()]
            dstdir = ''
            if unused_dirs:
                dstdir = random.choice(unused_dirs)
            else:
                dstdir = max(dir2ph, key=dir2ph.get)

            logpath = Path(dir_cfg['log'])
            logfile = PureWindowsPath(logpath / datetime.now().strftime('%Y-%m-%d-%H.%M.%S.log'))
            #logfile = os.path.join(Path(dir_cfg['log']),)
            print(logfile)

            # TODO: read the chia2.exe location from config.yaml instead of
            # hardcoding it here (see README).
            plot_args = [r'C:\Users\Wofl\AppData\Local\Chia-Blockchain\app-1.0.3\resources\app.asar.unpacked\daemon\chia2.exe',
                    'plots', 'create',
                    '-k', str(plotting_cfg['k']),
                    '-r', str(plotting_cfg['n_threads']),
                    '-u', str(plotting_cfg['n_buckets']),
                    '-b', str(plotting_cfg['job_buffer']),
                    '-t', tmpdir,
                    '-d', dstdir ]
            if 'e' in plotting_cfg and plotting_cfg['e']:
                plot_args.append('-e')
            if 'tmp2' in dir_cfg:
                plot_args.append('-2')
                plot_args.append(dir_cfg['tmp2'])

            # logfile = repr(logfile)
            logmsg = ('Starting plot job: %s ; logging to %s' % (' '.join(plot_args), logfile))

            #print(logfile)
            #print(logmsg)
            # start_new_session to make the job independent of this controlling tty.
            p = subprocess.Popen(plot_args,
                    stdout=open(logfile, 'w'),
                    stderr=subprocess.STDOUT,
                    start_new_session=True)
            psutil.Process(p.pid)  #.nice(ABOVE_NORMAL_PRIORITY_CLASS)
            # print(psutil.Process(p.pid).cmdline())
            # x = psutil.Process(p.pid)
            # x.nice(15)
            return (True, logmsg)

    return (False, wait_reason)

def select_jobs_by_partial_id(jobs, partial_id):
    selected = []
    for j in jobs:
        if j.plot_id.startswith(partial_id):
            selected.append(j)
    return selected

--------------------------------------------------------------------------------
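A small illustration of the tmpdir staggering rule in phases_permit_new_job() above, using made-up phases and a hypothetical config that allows 3 jobs per tmpdir:

    import manager

    sched_cfg = {'tmpdir_stagger_phase_major': 2,
                 'tmpdir_stagger_phase_minor': 1,
                 'tmpdir_max_jobs': 3}
    # A job still at phase 1:5 blocks the tmpdir...
    assert not manager.phases_permit_new_job([(1, 5), (3, 2)], sched_cfg)
    # ...but once every job has reached 2:1, a new one may start.
    assert manager.phases_permit_new_job([(2, 1), (3, 2)], sched_cfg)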
/plot_util.py:
--------------------------------------------------------------------------------
import math
import os
import re

import shutil

GB = 1_000_000_000

def df_b2(d):
    'Return free space for directory (in bytes)'
    # os.statvfs is POSIX-only, so this does not work on Windows; kept for reference.
    stat = os.statvfs(d)
    return stat.f_frsize * stat.f_bfree

def df_b(d):
    'Return free space for directory (in bytes)'
    stat = shutil.disk_usage(d)
    return stat[2]  # the "free" field


def get_k32_plotsize():
    return 108 * GB

def human_format(num, precision):
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return (('%.' + str(precision) + 'f%s') %
            (num, ['', 'K', 'M', 'G', 'T', 'P'][magnitude]))

def time_format(sec):
    if sec < 60:
        return '%ds' % sec
    else:
        return '%d:%02d' % (int(sec / 3600), int((sec % 3600) / 60))

def tmpdir_phases_str(tmpdir_phases_pair):
    tmpdir = tmpdir_phases_pair[0]
    phases = tmpdir_phases_pair[1]
    phase_str = ', '.join(['%d:%d' % ph_subph for ph_subph in sorted(phases)])
    return ('%s (%s)' % (tmpdir, phase_str))

def split_path_prefix(items):
    if not items:
        return ('', [])

    prefix = ''  # os.path.commonpath(items) -- commonpath doesn't work for us on Windows
    if prefix == '/':
        return ('', items)
    else:
        remainders = [ os.path.relpath(i, prefix) for i in items ]
        return (prefix, remainders)

def list_k32_plots(d):
    'List completed k32 plots in a directory (not recursive)'
    plots = []
    for plot in os.listdir(d):
        if re.match(r'^plot-k32-.*plot$', plot):
            plot = os.path.join(d, plot)
            try:
                if os.stat(plot).st_size > (0.95 * get_k32_plotsize()):
                    plots.append(plot)
            except Exception:
                pass  # file may have disappeared; skip it

    return plots

def column_wrap(items, n_cols, filler=None):
    '''Take items, distribute among n_cols columns, and return a set
       of rows containing the slices of those columns.'''
    rows = []
    n_rows = math.ceil(len(items) / n_cols)
    for row in range(n_rows):
        row_items = items[row : : n_rows]
        # Pad and truncate
        rows.append( (row_items + ([filler] * n_cols))[:n_cols] )
    return rows

--------------------------------------------------------------------------------
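A few examples of the formatting helpers above (input values taken from the sample log lines quoted in analyzer.py):

    import plot_util

    plot_util.human_format(22796.7, 1)             # -> '22.8K'
    plot_util.human_format(108 * plot_util.GB, 0)  # -> '108G'
    plot_util.time_format(49487)                   # -> '13:44' (hours:minutes)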
/plotman.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

from datetime import datetime
from subprocess import call

import argparse
import os
import re
import threading
import random
import readline  # For nice CLI
import sys
import time
import yaml
import shutil

# Plotman libraries
from job import Job
import analyzer
import archive
import interactive
import manager
import plot_util
import reporting

class PlotmanArgParser:
    def add_idprefix_arg(self, subparser):
        subparser.add_argument(
                'idprefix',
                type=str,
                nargs='+',
                help='disambiguating prefix of plot ID')

    def parse_args(self):
        parser = argparse.ArgumentParser(description='Chia plotting manager.')
        sp = parser.add_subparsers(dest='cmd')

        p_status = sp.add_parser('status', help='show current plotting status')

        p_dirs = sp.add_parser('dirs', help='show directories info')

        p_interactive = sp.add_parser('interactive', help='run interactive control/monitoring mode')

        p_dst_sch = sp.add_parser('dsched', help='print destination dir schedule')

        p_plot = sp.add_parser('plot', help='run plotting loop')

        p_archive = sp.add_parser('archive',
                help='move completed plots to farming location')

        p_details = sp.add_parser('details', help='show details for job')
        self.add_idprefix_arg(p_details)

        p_files = sp.add_parser('files', help='show temp files associated with job')
        self.add_idprefix_arg(p_files)

        p_kill = sp.add_parser('kill', help='kill job (and cleanup temp files)')
        self.add_idprefix_arg(p_kill)

        p_suspend = sp.add_parser('suspend', help='suspend job')
        self.add_idprefix_arg(p_suspend)

        p_resume = sp.add_parser('resume', help='resume suspended job')
        self.add_idprefix_arg(p_resume)

        p_analyze = sp.add_parser('analyze',
                help='analyze timing stats of completed jobs')
        p_analyze.add_argument('logfile', type=str, nargs='+',
                help='logfile(s) to analyze')

        args = parser.parse_args()
        return args


if __name__ == "__main__":
    random.seed()

    pm_parser = PlotmanArgParser()
    args = pm_parser.parse_args()

    with open('config.yaml', 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    dir_cfg = cfg['directories']
    sched_cfg = cfg['scheduling']
    plotting_cfg = cfg['plotting']

    #
    # Stay alive, spawning plot jobs
    #
    if args.cmd == 'plot':
        print('...starting plot loop')
        while True:
            # maybe_start_new_plot() returns a (started, msg) tuple.
            (started, wait_reason) = manager.maybe_start_new_plot(dir_cfg, sched_cfg, plotting_cfg)

            # TODO: report this via a channel that can be polled on demand, so we don't spam the console
            sleep_s = int(sched_cfg['polling_time_s'])
            if not started:
                print('...sleeping %d s: %s' % (sleep_s, wait_reason))

            time.sleep(sleep_s)

    #
    # Analysis of completed jobs
    #
    elif args.cmd == 'analyze':
        analyzer = analyzer.LogAnalyzer()
        analyzer.analyze(args.logfile)

    else:
        # print('...scanning process tables')
        jobs = Job.get_running_jobs(dir_cfg['log'])

        # Status report
        if args.cmd == 'status':
            (columns, rows) = shutil.get_terminal_size()
            print(reporting.status_report(jobs, int(columns)))

        # Directories report
        elif args.cmd == 'dirs':
            (columns, rows) = shutil.get_terminal_size()
            print(reporting.dirs_report(jobs, dir_cfg, sched_cfg, int(columns)))

        elif args.cmd == 'interactive':
            interactive.run_interactive()

        # Start running archival
        elif args.cmd == 'archive':
            print('...starting archive loop')
            firstit = True
            while True:
                if not firstit:
                    print('Sleeping 60s until next iteration...')
                    time.sleep(60)
                jobs = Job.get_running_jobs(dir_cfg['log'])
                firstit = False
                archive.archive(dir_cfg, jobs)

        # Debugging: show the destination drive usage schedule
        elif args.cmd == 'dsched':
            dstdirs = dir_cfg['dst']
            for (d, ph) in manager.dstdirs_to_furthest_phase(jobs).items():
                print('  %s : %s' % (d, str(ph)))

        #
        # Job control commands
        #
        elif args.cmd in [ 'details', 'files', 'kill', 'suspend', 'resume' ]:
            print(args)

            selected = []

            # TODO: clean up treatment of wildcard
            if args.idprefix[0] == 'all':
                selected = jobs
            else:
                # TODO: allow multiple idprefixes, not just take the first
                selected = manager.select_jobs_by_partial_id(jobs, args.idprefix[0])
                if (len(selected) == 0):
                    print('Error: %s matched no jobs.' % args.idprefix[0])
                elif len(selected) > 1:
                    print('Error: "%s" matched multiple jobs:' % args.idprefix[0])
                    for j in selected:
                        print('  %s' % j.plot_id)
                    selected = []

            for job in selected:
                if args.cmd == 'details':
                    print(job.status_str_long())

                elif args.cmd == 'files':
                    temp_files = job.get_temp_files()
                    for f in temp_files:
                        print('  %s' % f)

                elif args.cmd == 'kill':
                    # First suspend so job doesn't create new files
                    print('Pausing PID %d, plot id %s' % (job.proc.pid, job.plot_id))
                    job.suspend()

                    temp_files = job.get_temp_files()
                    print('Will kill pid %d, plot id %s' % (job.proc.pid, job.plot_id))
                    print('Will delete %d temp files' % len(temp_files))
                    conf = input('Are you sure? ("y" to confirm): ')
                    if (conf != 'y'):
                        print('Canceled.  If you wish to resume the job, do so manually.')
                    else:
                        print('killing...')
                        job.cancel()
                        print('cleaning up temp files...')
                        for f in temp_files:
                            os.remove(f)

                elif args.cmd == 'suspend':
                    print('Suspending ' + job.plot_id)
                    job.suspend()
                elif args.cmd == 'resume':
                    print('Resuming ' + job.plot_id)
                    job.resume()

--------------------------------------------------------------------------------
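Besides `interactive`, the subcommands defined above can be run directly from PowerShell, for example (the log filename and the plot-ID prefix below are hypothetical examples):

    python plotman.py status
    python plotman.py dirs
    python plotman.py analyze C:\chia\2021-04-05-12.00.00.log
    python plotman.py kill 7a3b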
/reporting.py:
--------------------------------------------------------------------------------
import texttable as tt  # from somewhere?
import os
import psutil

import archive
import job
import manager
import math
import plot_util

def abbr_path(path, putative_prefix):
    if putative_prefix and path.startswith(putative_prefix):
        return os.path.relpath(path, putative_prefix)
    else:
        return path

def phases_str(phases, max_num=None):
    '''Take a list of phase-subphase pairs and return them as a compact string'''
    if not max_num or len(phases) <= max_num:
        return ' '.join(['%d:%d' % pair for pair in phases])
    else:
        n_first = math.floor(max_num / 2)
        n_last = max_num - n_first
        n_elided = len(phases) - (n_first + n_last)
        first = ' '.join(['%d:%d' % pair for pair in phases[:n_first]])
        elided = " [+%d] " % n_elided
        last = ' '.join(['%d:%d' % pair for pair in phases[n_first + n_elided:]])
        return first + elided + last

def n_at_ph(jobs, ph):
    return sum([1 for j in jobs if j.progress() == ph])

def n_to_char(n):
    n_to_char_map = dict(enumerate(" .:;!"))

    if n < 0:
        return 'X'  # Should never be negative
    elif n >= len(n_to_char_map):
        n = len(n_to_char_map) - 1

    return n_to_char_map[n]

def job_viz(jobs):
    # TODO: rewrite this in a way that ensures we count every job
    # even if the reported phases don't line up with expectations.
    result = ''
    result += '1'
    for i in range(0, 8):
        result += n_to_char(n_at_ph(jobs, (1, i)))
    result += '2'
    for i in range(0, 8):
        result += n_to_char(n_at_ph(jobs, (2, i)))
    result += '3'
    for i in range(0, 7):
        result += n_to_char(n_at_ph(jobs, (3, i)))
    result += '4'
    result += n_to_char(n_at_ph(jobs, (4, 0)))
    return result


def status_report(jobs, width, height=None, tmp_prefix='', dst_prefix=''):
    '''height, if provided, will limit the number of rows in the table,
       showing first and last rows, row numbers and an ellipsis in the middle.'''
    abbreviate_jobs_list = False
    n_begin_rows = 0
    n_end_rows = 0
    if height and height < len(jobs) + 1:  # One row for header
        abbreviate_jobs_list = True

    if abbreviate_jobs_list:
        n_rows = height - 2  # One for header, one for ellipsis
        n_begin_rows = int(n_rows / 2)
        n_end_rows = n_rows - n_begin_rows

    tab = tt.Texttable()
    headings = ['plot id', 'k', 'tmp', 'dst', 'wall', 'phase', 'tmp',
                'pid', 'stat', 'mem', 'user', 'sys', 'io']
    if height:
        headings.insert(0, '#')
    tab.header(headings)
    tab.set_cols_dtype('t' * len(headings))
    tab.set_cols_align('r' * len(headings))
    tab.set_header_align('r' * len(headings))
    for i, j in enumerate(sorted(jobs, key=job.Job.get_time_wall)):
        # Ellipsis row (12 filler cells; the '#' cell is inserted below)
        if abbreviate_jobs_list and i == n_begin_rows:
            row = ['...'] + ([''] * 12)
        # Omitted row
        elif abbreviate_jobs_list and i > n_begin_rows and i < (len(jobs) - n_end_rows):
            continue
        # Regular row
        else:
            try:
                row = [j.plot_id[:8] + '...',
                       j.k,
                       abbr_path(j.tmpdir, tmp_prefix),
                       abbr_path(j.dstdir, dst_prefix),
                       plot_util.time_format(j.get_time_wall()),
                       '%d:%d' % j.progress(),
                       plot_util.human_format(j.get_tmp_usage(), 0),
                       j.proc.pid,
                       j.get_run_status(),
                       plot_util.human_format(j.get_mem_usage(), 1),
                       plot_util.time_format(j.get_time_user()),
                       plot_util.time_format(j.get_time_sys()),
                       plot_util.time_format(j.get_time_iowait())
                       ]
            except psutil.NoSuchProcess:
                # In case the job has disappeared
                row = [j.plot_id[:8] + '...'] + (['--'] * 12)

        if height:
            row.insert(0, '%3d' % i)

        tab.add_row(row)

    tab.set_max_width(width)
    tab.set_deco(0)  # No borders
    # return ('tmp dir prefix: %s ; dst dir prefix: %s\n' % (tmp_prefix, dst_prefix))
    return tab.draw()

def tmp_dir_report(jobs, tmpdirs, sched_cfg, width, start_row=None, end_row=None, prefix=''):
    '''start_row, end_row let you split the table up if you want'''
    tab = tt.Texttable()
    headings = ['tmp', 'ready', 'phases']
    tab.header(headings)
    tab.set_cols_dtype('t' * len(headings))
    tab.set_cols_align('r' * (len(headings) - 1) + 'l')
    for i, d in enumerate(sorted(tmpdirs)):
        if (start_row and i < start_row) or (end_row and i >= end_row):
            continue
        phases = sorted(job.job_phases_for_tmpdir(d, jobs))
        ready = manager.phases_permit_new_job(phases, sched_cfg)
        row = [abbr_path(d, prefix), 'OK' if ready else '--', phases_str(phases)]
        tab.add_row(row)

    tab.set_max_width(width)
    tab.set_deco(tt.Texttable.BORDER | tt.Texttable.HEADER)
    tab.set_deco(0)  # No borders
    return tab.draw()

def dst_dir_report(jobs, dstdirs, width, prefix=''):
    tab = tt.Texttable()
    dir2oldphase = manager.dstdirs_to_furthest_phase(jobs)
    dir2newphase = manager.dstdirs_to_youngest_phase(jobs)
    headings = ['dst', 'plots', 'GBfree', 'inbnd phases', 'pri']
    tab.header(headings)
    tab.set_cols_dtype('t' * len(headings))

    for d in sorted(dstdirs):
        # TODO: this logic is replicated in archive.py's priority computation;
        # maybe fix by moving more of the logic into directory.py
        eldest_ph = dir2oldphase.get(d, (0, 0))
        phases = job.job_phases_for_dstdir(d, jobs)

        dir_plots = plot_util.list_k32_plots(d)

        gb_free = int(plot_util.df_b(d) / plot_util.GB)

        n_plots = len(dir_plots)
        priority = archive.compute_priority(eldest_ph, gb_free, n_plots)
        row = [abbr_path(d, prefix), n_plots, gb_free,
               phases_str(phases, 5), priority]
        tab.add_row(row)
    tab.set_max_width(width)
    tab.set_deco(tt.Texttable.BORDER | tt.Texttable.HEADER)
    tab.set_deco(0)  # No borders
    return tab.draw()

def arch_dir_report(archdir_freebytes, width, prefix=''):
    cells = ['%s:%5dGB' % (abbr_path(d, prefix), int(int(space) / plot_util.GB))
             for (d, space) in sorted(archdir_freebytes.items())]
    if not cells:
        return ''

    n_columns = int(width / (len(max(cells, key=len)) + 3))
    tab = tt.Texttable()
    tab.set_max_width(width)
    for row in plot_util.column_wrap(cells, n_columns, filler=''):
        tab.add_row(row)
    tab.set_cols_align('r' * (n_columns))
    tab.set_deco(tt.Texttable.VLINES)
    return tab.draw()

# TODO: remove this
def dirs_report(jobs, dir_cfg, sched_cfg, width):
    tmpdirs = dir_cfg['tmp']
    dstdirs = dir_cfg['dst']
    report = (tmp_dir_report(jobs, tmpdirs, sched_cfg, width) + '\n' +
              dst_dir_report(jobs, dstdirs, width) + '\n')
    # The archive section is optional in config.yaml; skip it if absent.
    if 'archive' in dir_cfg:
        report += ('archive dirs free space:\n' +
                   arch_dir_report(archive.get_archdir_freebytes(dir_cfg['archive']), width) + '\n')
    return report

--------------------------------------------------------------------------------
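For reference, job_viz() above encodes the number of jobs at each phase:subphase as a single character via n_to_char(): zero jobs is a space, then '.', ':', ';', and '!' for four or more. A quick check:

    import reporting

    assert [reporting.n_to_char(n) for n in range(5)] == [' ', '.', ':', ';', '!']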