├── .gitignore ├── README.md ├── data.zip ├── getting_started.pdf └── sg ├── __init__.py ├── __init__.pyc ├── data ├── __init__.py ├── __init__.pyc ├── bchydro │ ├── __init__.py │ ├── __init__.pyc │ ├── bcholidays.txt │ ├── bchydro.py │ ├── bchydro.pyc │ ├── data_for_testing.csv │ ├── demo_bchydro.py │ ├── holiday_parser.py │ ├── readme.txt │ ├── test_bchydro.py │ └── test_holiday_parser.py ├── dataset.py ├── dataset.pyc ├── demo_dataset.py ├── eklima │ ├── __init__.py │ ├── __init__.pyc │ ├── parse_eklima_xml.py │ └── parse_eklima_xml.pyc ├── eunite │ ├── __init__.py │ ├── eunite.py │ ├── import_csv_to_sqlite.py │ ├── readme.txt │ └── test_eunite.py ├── sintef │ ├── Makefile │ ├── README.txt │ ├── __init__.py │ ├── __init__.pyc │ ├── anonymize_gs2.cpp │ ├── convert_scikits_to_pandas_hdf5.py │ ├── create_full_temp_data.py │ ├── create_full_temp_data.pyc │ ├── data_for_eirik.py │ ├── eb_userloads.py │ ├── find_binary.cpp │ ├── gs2-do.sh │ ├── gs2-grep.sh │ ├── gs2.txt │ ├── gs2_for_prediction.txt │ ├── gs2_short.txt │ ├── make-list-of-gs2-files.sh │ ├── map_EIA_to_anonymous.py │ ├── parse_gs2.py │ ├── parse_gs2.pyc │ ├── plot_temp.py │ ├── plot_temp_misc.py │ ├── preprocess_gs2.py │ ├── select_meters.py │ ├── test_parse_gs2.py │ ├── test_userloads.py │ ├── test_userloads.pyc │ ├── testfile.gs2 │ ├── testfile_short.gs2 │ ├── unique.py │ ├── userloads.py │ └── userloads.pyc ├── test_dataset.py └── yr.no │ ├── README.txt │ ├── crontab.txt │ └── get-forecasts.sh ├── globals.py ├── globals.pyc ├── models ├── __init__.py ├── __init__.pyc ├── arima.py ├── bfgs.py ├── demo_cleansing.py ├── error_functions.py ├── esn.py ├── esn.pyc ├── exp_cleaning.py ├── filter-R-messages.py ├── ga.py ├── ga.pyc ├── gaussian_process_intro.py ├── genome_evaluator.py ├── gridopt_load_prediction.py ├── gui.py ├── gui.pyc ├── lib_atlas │ ├── BsplineAnalyticSmoother.cpp │ ├── BsplineAnalyticSmoother.h │ └── Makefile ├── lib_mkl │ ├── BsplineAnalyticSmoother.cpp │ ├── BsplineAnalyticSmoother.h │ └── Makefile ├── linear.py ├── load_cleansing.py ├── load_cleansing.pyc ├── load_prediction.py ├── load_prediction.py.orig ├── load_prediction.pyc ├── load_prediction.py~ ├── load_prediction_CBR.py ├── load_prediction_ar.py ├── load_prediction_ar24.py ├── load_prediction_arima.py ├── load_prediction_averagedaily.py ├── load_prediction_averagehourly.py ├── load_prediction_dshw.py ├── load_prediction_esn.py ├── load_prediction_esn24.py ├── load_prediction_identity.py ├── load_prediction_regul_ar.py ├── load_prediction_taohong.py ├── load_prediction_wavelet.py ├── load_prediction_wavelet.py.orig ├── load_prediction_wavelet24.py ├── manual_load_prediction_gridsearch.py ├── mixture_of_experts.py ├── model.py ├── model.pyc ├── onemax_mpi.py ├── pattern_eliminators.py ├── regul_ar.py ├── regul_ar_grid_search.py ├── roughness.tex ├── run_experiments.py ├── run_experiments_params.py ├── spclean.py ├── spclean.pyc ├── spclean_wrapper.py ├── splines.py ├── splines.pyc ├── static.py ├── static.pyc ├── subset_runs │ └── make-runfiles.sh ├── taohong.py ├── test_arima.py ├── test_esn.py ├── test_sequence_scan.py ├── test_spclean.py ├── test_splines.py ├── test_wavelet.py ├── test_wavelet_retrieve.py ├── wavelet.py └── wavelet.pyc ├── requirements.txt └── utils ├── __init__.py ├── __init__.pyc ├── _test_template.py ├── analyze_gefcom_temp_genes.py ├── cache.py ├── genemapper.py ├── genemapper.pyc ├── output.py ├── output.pyc ├── plot_fitnesses.py ├── pyevolve_mpi.py ├── pyevolve_utils.py ├── pyevolve_utils.py~ ├── queue_jobs.py ├── scripts ├── 
best-genomes-found.sh ├── list-finished-jobs.sh ├── parse-logs-into-csv.sh ├── resubmit-jobs.sh ├── split-test-validate.sh └── summarize-simulation-results.sh ├── test_cache.py ├── test_genemapper.py ├── test_output.py ├── test_pyevolve_utils.py ├── test_timer.py ├── test_utils.py ├── testutils.py ├── testutils.pyc ├── timer.py ├── timer.pyc ├── utils.py ├── utils.py.orig ├── utils.pyc └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pickle 2 | *.db 3 | .hg* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | load_forecasting 2 | ================ 3 | 4 | A framework for short-term load forecasting in which seasonal cycles are removed from the input signal. A genetic algorithm is used to tune the parameters of the various forecasting models. 5 | 6 | If you use this software, please cite: 7 | 8 | ``` 9 | @Article{hoverstadr:_three_stage_approac_load_forec, 10 | author = {Boye Annfelt Høverstad and Axel Tidemann and Helge Langseth and Pinar {\"O}zt{\"u}rk}, 11 | title = {Short term load forecasting with seasonal decomposition using evolution for parameter tuning}, 12 | journal = {IEEE Transactions on Smart Grid}, 13 | year = 2015} 14 | ``` 15 | 16 | Because of data ownership restrictions, we can only publish two datasets: GEFCOM 2012 and BCHydro. Both are provided in data.zip. You will have to adjust the paths to these data files in the source code accordingly. For installation instructions, see getting_started.pdf. 17 | 18 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/data.zip -------------------------------------------------------------------------------- /getting_started.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/getting_started.pdf -------------------------------------------------------------------------------- /sg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/__init__.py -------------------------------------------------------------------------------- /sg/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/__init__.pyc -------------------------------------------------------------------------------- /sg/data/__init__.py: -------------------------------------------------------------------------------- 1 | from dataset import * 2 | -------------------------------------------------------------------------------- /sg/data/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/__init__.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/__init__.py: -------------------------------------------------------------------------------- 1 | from bchydro import * 2 | 
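# A minimal usage sketch (see demo_bchydro.py below for the full demo). The
# star import above re-exports load() and Dataset from bchydro.py, so the
# hourly BC Hydro load series can be read as, e.g.:
#
#   from datetime import timedelta as dt
#   import sg.data.bchydro as bc
#   timeseries = bc.load()  # the entire load series
#   dataset = bc.Dataset(period=dt(days=30), step_length=dt(days=7))
#   one_month = dataset.get_random_period()  # a random 30-day slice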
-------------------------------------------------------------------------------- /sg/data/bchydro/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/bchydro/__init__.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/bcholidays.txt: -------------------------------------------------------------------------------- 1 | Holiday Day Observed Notes 2 | New Year's Day January 1 3 | Good Friday Friday before Easter Sunday 4 | Easter Monday Monday after Easter Sunday Not a statutory holiday, but bank holiday and federally regulated employers give the day off. 5 | Victoria Day Monday before May 25 6 | Canada Day July 1 or July 2 if July 1 is a Sunday 7 | Civic Holiday First Monday of August A.k.a. British Columbia Day in BC 8 | Labour Day First Monday of September 9 | Thanksgiving Second Monday of October 10 | Remembrance Day November 11 11 | Christmas Day December 25 12 | Family Day Second Monday of February Introduced in 2013 13 | -------------------------------------------------------------------------------- /sg/data/bchydro/bchydro.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/bchydro/bchydro.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/data_for_testing.csv: -------------------------------------------------------------------------------- 1 | January 1, 2004; 1; 7160 2 | January 1, 2004; 2; 6853 3 | January 1, 2004; 3; 6634 4 | January 1, 2004; 4; 6468 5 | January 1, 2004; 5; 6430 6 | January 1, 2004; 6; 6503 7 | January 1, 2004; 7; 6608 8 | January 1, 2004; 8; 6812 9 | January 1, 2004; 9; 6976 10 | January 1, 2004; 10; 7225 11 | January 1, 2004; 11; 7461 12 | January 1, 2004; 12; 7585 13 | January 1, 2004; 13; 7634 14 | January 1, 2004; 14; 7564 15 | January 1, 2004; 15; 7504 16 | January 1, 2004; 16; 7644 17 | January 1, 2004; 17; 8238 18 | January 1, 2004; 18; 8734 19 | January 1, 2004; 19; 8647 20 | January 1, 2004; 20; 8485 21 | January 1, 2004; 21; 8333 22 | January 1, 2004; 22; 8116 23 | January 1, 2004; 23; 7674 24 | January 1, 2004; 24; 7173 25 | January 2, 2004; 1; 6797 26 | January 2, 2004; 2; 6604 27 | January 2, 2004; 3; 6535 28 | January 2, 2004; 4; 6537 29 | January 2, 2004; 5; 6538 30 | January 2, 2004; 6; 6785 31 | January 2, 2004; 7; 7166 32 | January 2, 2004; 8; 7749 33 | January 2, 2004; 9; 8160 34 | January 2, 2004; 10; 8433 35 | January 2, 2004; 11; 8514 36 | January 2, 2004; 12; 8445 37 | January 2, 2004; 13; 8406 38 | January 2, 2004; 14; 8256 39 | January 2, 2004; 15; 8125 40 | January 2, 2004; 16; 8281 41 | January 2, 2004; 17; 8789 42 | January 2, 2004; 18; 9238 43 | January 2, 2004; 19; 9176 44 | January 2, 2004; 20; 8999 45 | January 2, 2004; 21; 8845 46 | January 2, 2004; 22; 8641 47 | January 2, 2004; 23; 8203 48 | January 2, 2004; 24; 7729 49 | January 3, 2004; 1; 7293 50 | January 3, 2004; 2; 7045 51 | January 3, 2004; 3; 6957 52 | January 3, 2004; 4; 6958 53 | January 3, 2004; 5; 6985 54 | January 3, 2004; 6; 7105 55 | January 3, 2004; 7; 7382 56 | January 3, 2004; 8; 7809 57 | January 3, 2004; 9; 8238 58 | January 3, 2004; 10; 8639 59 | January 3, 2004; 11; 8873 60 | January 3, 2004; 12; 8862 61 | January 3, 2004; 13; 8768 62 | January 3, 2004; 14; 
8617 63 | January 3, 2004; 15; 8550 64 | January 3, 2004; 16; 8625 65 | January 3, 2004; 17; 9178 66 | January 3, 2004; 18; 9758 67 | January 3, 2004; 19; 9677 68 | January 3, 2004; 20; 9437 69 | January 3, 2004; 21; 9241 70 | January 3, 2004; 22; 9041 71 | January 3, 2004; 23; 8700 72 | January 3, 2004; 24; 8207 73 | January 4, 2004; 1; 7814 74 | January 4, 2004; 2; 7597 75 | January 4, 2004; 3; 7482 76 | January 4, 2004; 4; 7474 77 | January 4, 2004; 5; 7559 78 | January 4, 2004; 6; 7682 79 | January 4, 2004; 7; 7872 80 | January 4, 2004; 8; 8245 81 | January 4, 2004; 9; 8608 82 | January 4, 2004; 10; 8823 83 | January 4, 2004; 11; 8905 84 | January 4, 2004; 12; 8865 85 | January 4, 2004; 13; 8762 86 | January 4, 2004; 14; 8657 87 | January 4, 2004; 15; 8622 88 | January 4, 2004; 16; 8762 89 | January 4, 2004; 17; 9428 90 | January 4, 2004; 18; 10030 91 | January 4, 2004; 19; 10024 92 | January 4, 2004; 20; 9841 93 | January 4, 2004; 21; 9627 94 | January 4, 2004; 22; 9367 95 | January 4, 2004; 23; 8777 96 | January 4, 2004; 24; 8297 97 | January 5, 2004; 1; 7938 98 | January 5, 2004; 2; 7798 99 | January 5, 2004; 3; 7786 100 | January 5, 2004; 4; 7816 101 | -------------------------------------------------------------------------------- /sg/data/bchydro/demo_bchydro.py: -------------------------------------------------------------------------------- 1 | # Short demonstration of the utilities to load BCHydro data 2 | import sys 3 | import os 4 | from datetime import timedelta as dt 5 | 6 | import matplotlib.pyplot as plt 7 | 8 | import sg.data.bchydro as bc 9 | 10 | if __name__ == "__main__": 11 | # Option 1: load the entire dataset as a timeseries 12 | timeseries = bc.load() 13 | filtered = [x if x > 10 else 4000 for x in timeseries] 14 | plt.plot(filtered, '-') 15 | plt.title("The entire BC Hydro dataset") 16 | # Option 2: load the using the Dataset class 17 | dataset = bc.Dataset(period=dt(days=30), step_length=dt(days=7)) 18 | plt.figure() 19 | plt.plot(dataset.get_random_period(), '-') 20 | plt.title("A randomly selected 30-day period from the BC Hydro dataset") 21 | plt.show() 22 | -------------------------------------------------------------------------------- /sg/data/bchydro/readme.txt: -------------------------------------------------------------------------------- 1 | Transmission load data downloaded from 2 | 3 | http://transmission.bchydro.com/transmission_system/balancing_authority_load_data/historical_transmission_data.htm 4 | 5 | Original Excel spreadsheets stored in directory 'originals'. 6 | 7 | First sheet of each file exported to csv. 8 | 9 | -------------------------------------------------------------------------------- /sg/data/dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from datetime import timedelta as dt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import copy 7 | 8 | class Dataset(object): 9 | def __init__(self, series, period_length, step_length=None): 10 | """Initialize the dataset with the entire timeseries, and a 11 | datetime.timedelta indicating the length of each period to be 12 | extracted. 13 | 14 | If step_length is provided, this should be a datetime.timedelta that 15 | indicates the step length between each period that may be selected. 
For 16 | instance, a step_length of 1 day indicates that all the selected 17 | periods will start at the same hour of day, even if the dataset has 18 | higher frequency.""" 19 | self._series = series 20 | self._period_length = \ 21 | self._convert_timedelta_to_timeseries(period_length) 22 | if step_length is None: 23 | self._step_length = 1 24 | else: 25 | self._step_length = \ 26 | self._convert_timedelta_to_timeseries(step_length) 27 | self._num_periods = (len(series) - self._period_length + 1) / \ 28 | self._step_length 29 | 30 | def _get_start_and_end_times(self): 31 | """Return the start and end times of the time series. End time is the 32 | start time of the last entry in the series, not the end time, i.e. the 33 | duration of the last timestep is not included.""" 34 | start_time = self._series.first_valid_index() 35 | end_time = self._series.last_valid_index() 36 | if isinstance(start_time, pd.Period): 37 | # start_time and end_time for a Period seem to be equivalent 38 | start_time = start_time.start_time 39 | end_time = end_time.start_time 40 | return (start_time, end_time) 41 | 42 | def _convert_timedelta_to_timeseries(self, period_length): 43 | """Return the length of the period (a timedelta) represented as an 44 | integer, based on the frequency of the dataset.""" 45 | # Calculating this cannot be done using the timeseries frequency, as 46 | # that falls apart when the frequency is undefined. This method should 47 | # work for all frequencies, as long as the time step is constant 48 | # between data points. 49 | start_time, end_time = self._get_start_and_end_times() 50 | dt_series = (end_time - start_time) / (len(self._series) - 1) 51 | if dt_series >= period_length: 52 | return 1 53 | else: 54 | # datetime.timedelta doesn't support division, so count the steps 55 | # incrementally. 56 | dt_acc = dt_series 57 | steps = 1 58 | while dt_acc < period_length: 59 | steps += 1 60 | dt_acc += dt_series 61 | if dt_acc > period_length: 62 | msg = "Could not create dataset, failed to convert time " \ 63 | "period length to a number steps in the time series array. " \ 64 | "The selected period length (%s) is not a multiple of " \ 65 | "the time step of the original data set (%s)." % \ 66 | (period_length, dt_series) 67 | raise RuntimeError(msg) 68 | return steps 69 | 70 | @property 71 | def num_periods(self): 72 | """The number of selectable periods. This is a read-only property.""" 73 | return self._num_periods 74 | 75 | @property 76 | def series(self): 77 | """The entire time series from which dataset periods are selected. This 78 | is a read-only property.""" 79 | return self._series 80 | 81 | def index_of(self, period_number): 82 | """Return index in entire time series of period number 83 | period_number.""" 84 | return period_number * self._step_length 85 | 86 | def get_period(self, period_number): 87 | """Return period number period_number.""" 88 | first = self.index_of(period_number) 89 | last = first + self._period_length 90 | return self._series[first:last] 91 | 92 | def get_random_period(self, return_period_number=False): 93 | """Select a random period of the predefined length. If 94 | return_period_number, return a tuple consisting of a random period and 95 | the period number the selected period. 
Otherwise return only the data.""" 96 | number = np.random.randint(0, self.num_periods) 97 | data = self.get_period(number) 98 | if return_period_number: 99 | return (data, number) 100 | else: 101 | return data 102 | 103 | def split(self, ratio=0.5): 104 | """Splits the current dataset into two datasets defined by the ratio.""" 105 | first = copy.copy(self) 106 | first._series = first._series[:int(len(first._series)*ratio)] 107 | last = copy.copy(self) 108 | last._series = last._series[int(len(last._series)*ratio):] 109 | return first, last 110 | 111 | def remove_outlier_set_previous(dataset, outlier_val=0): 112 | """Set all 'outlier'-valued elements in the dataset to be the value at the 113 | position before. This routine does not copy the dataset before cleaning. 114 | 115 | If there are several consecutive outliers, they will all be set to the 116 | preceding non-outlier value.""" 117 | outliers = np.where(dataset[1:] == outlier_val) 118 | for outlier in outliers[0]: 119 | dataset[outlier + 1] = dataset[outlier] 120 | return dataset 121 | 122 | if __name__ == "__main__": 123 | from unittest import main 124 | main(module="test_" + __file__[:-3]) 125 | -------------------------------------------------------------------------------- /sg/data/dataset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/dataset.pyc -------------------------------------------------------------------------------- /sg/data/demo_dataset.py: -------------------------------------------------------------------------------- 1 | # Code demonstrating the use of Dataset (actually data.bchydro.Dataset). 2 | 3 | from datetime import timedelta as dt 4 | 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | import sg.data.bchydro as bc 9 | import sg.src.spclean as cln 10 | 11 | # 7-day periods, selected with one day overlap (step length 6 days) 12 | duration = 7 13 | step = 6 14 | 15 | # Create the dataset, specifying period and step length as datetime.timedelta 16 | dataset = bc.Dataset(period=dt(days=duration), step_length=dt(days=step)) 17 | 18 | # Plot the first 5 periods sequentially with overlap 19 | for period in (0, 1, 2, 3, 4): 20 | period_start_hrs = period * step * 24 21 | period_end_hrs = period_start_hrs + duration * 24 22 | x = np.arange(period_start_hrs, period_end_hrs) 23 | y = dataset.get_period(period) 24 | plt.plot(x, y) 25 | plt.title("A sequence of 7-day periods selected with 1 day overlap.") 26 | 27 | # Plot the same sequence using the original time series directly 28 | plt.figure() 29 | plt.plot(dataset.series[0:4*step*24+duration*24]) 30 | plt.title("Same data plotted by manually selecting a slice from the time series") 31 | 32 | # Plot a random sequence 33 | plt.figure() 34 | (data, period_number) = dataset.get_random_period(True) 35 | plt.plot(data) 36 | plt.title("Period number %d (randomly selected).\n\nThis period starts at " \ 37 | "index %d in the original time series." 
% \ 38 | (period_number, dataset.index_of(period_number))) 39 | 40 | # Show all figures 41 | plt.show() 42 | -------------------------------------------------------------------------------- /sg/data/eklima/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/__init__.py -------------------------------------------------------------------------------- /sg/data/eklima/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/__init__.pyc -------------------------------------------------------------------------------- /sg/data/eklima/parse_eklima_xml.py: -------------------------------------------------------------------------------- 1 | import xml.etree.cElementTree as et 2 | import sys, pdb 3 | import re 4 | import pandas as pd 5 | from datetime import datetime 6 | import calendar 7 | import sg.utils 8 | 9 | def parse(file): 10 | cal = dict((v,k) for k,v in enumerate(calendar.month_name)) 11 | xml = et.parse(file) 12 | root = xml.getroot() 13 | station_name = root.findall('table/Stnr/Name')[0].text 14 | TS = [] 15 | 16 | for table in root.findall('table'): 17 | if station_name in table.attrib['name']: 18 | month, year = table.attrib['name'].split(station_name)[-1].split() 19 | for date in table.findall('Date'): 20 | try: 21 | day = int(date.attrib['id']) 22 | data = [ float(ele.text) for ele in date.getchildren() if re.search('TA_\d*', ele.tag) ] 23 | hours = [ int(ele.tag.split('_')[-1]) for ele in date.getchildren() if re.search('TA_\d*', ele.tag) ] 24 | dates = [ datetime(year=int(year), month=cal[month], day=day, hour=hour) for hour in hours ] 25 | TS.append(pd.Series(data=data, index=dates)) 26 | except ValueError: 27 | pass 28 | 29 | return pd.concat((TS)) 30 | 31 | if __name__ == "__main__": 32 | sg.utils.plot_time_series([parse(sys.argv[1])], ['-'], ['Dummy station']) 33 | -------------------------------------------------------------------------------- /sg/data/eklima/parse_eklima_xml.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/parse_eklima_xml.pyc -------------------------------------------------------------------------------- /sg/data/eunite/__init__.py: -------------------------------------------------------------------------------- 1 | from eunite import * 2 | -------------------------------------------------------------------------------- /sg/data/eunite/eunite.py: -------------------------------------------------------------------------------- 1 | """Import EUNITE dataset. Concatenates 1997, 1998 and January 1999 data. The 2 | competition used Jan 1999 as test set.""" 3 | 4 | import os 5 | import sys 6 | import sqlite3 7 | import datetime 8 | import numpy as np 9 | 10 | import pandas as pd 11 | 12 | from sg.globals import SG_DATA_PATH 13 | import sg.data 14 | 15 | PATH_TO_EUNITE_DB = os.path.join(SG_DATA_PATH, "eunite", "eunite.db") 16 | 17 | class Dataset(sg.data.Dataset): 18 | def __init__(self, period, step_length=None): 19 | """Loads the EUNITE time series and sets up for extraction of random 20 | slices of length 'period', 'step_length' apart. 
See class Dataset for 21 | more info.""" 22 | sg.data.Dataset.__init__(self, load(), period, step_length) 23 | 24 | def load(dbpath=PATH_TO_EUNITE_DB): 25 | """Read the load data from the given database. Return a pandas.DataFrame 26 | containing the data.""" 27 | with sqlite3.connect(dbpath, detect_types=sqlite3.PARSE_DECLTYPES| 28 | sqlite3.PARSE_COLNAMES) as conn: 29 | crs = conn.cursor() 30 | sel_stmt = "SELECT Timestamp as 'stamp [timestamp]', "\ 31 | "Deg_C as 'temp [float]', "\ 32 | "MWh as 'load [float]' "\ 33 | "FROM " 34 | crs.execute(sel_stmt + "training" + \ 35 | " UNION " + \ 36 | sel_stmt + "testing" + \ 37 | " ORDER BY Timestamp ASC") 38 | stamps, temps, loads = zip(*crs.fetchall()) 39 | return pd.DataFrame({'Temperature' : np.array(temps, dtype=float), 40 | 'Load' : np.array(loads, dtype=float)}, 41 | index=stamps) 42 | 43 | if __name__ == '__main__': 44 | from unittest import main 45 | main(module='test_'+__file__[:-3]) 46 | -------------------------------------------------------------------------------- /sg/data/eunite/import_csv_to_sqlite.py: -------------------------------------------------------------------------------- 1 | """Import load, temperature and holiday data from csv files into sqlite. 2 | 3 | May not work without modification, after separating data from code. The code in 4 | this file assumes the csv files are in the working directory of the 5 | interpreter. 6 | 7 | """ 8 | 9 | import csv 10 | import datetime 11 | import os 12 | import sqlite3 13 | 14 | from sg.data.eunite import PATH_TO_EUNITE_DB 15 | 16 | def import_data(load_path, temp_path, cursor, table_name): 17 | load_reader = csv.reader(open(load_path), delimiter=';') 18 | temp_reader = csv.reader(open(temp_path), delimiter=';') 19 | loads = [l for l in load_reader] 20 | temperatures = [t for t in temp_reader] 21 | assert(len(temperatures) == len(loads)) 22 | for (temp, load) in zip(temperatures, loads): 23 | load = [int(l) for l in load] 24 | ldate = datetime.datetime(year=load[0], month=load[1], day=load[2]) 25 | tdate = datetime.datetime.strptime(temp[0], "%Y-%m-%d") 26 | assert(ldate == tdate) 27 | deg_c = float(temp[1]) 28 | for half_hour in range(len(load)-3): 29 | stamp = ldate + datetime.timedelta(hours=float(half_hour) / 2) 30 | cursor.execute("INSERT INTO %s VALUES (?, ?, ?)" % table_name, 31 | (stamp, deg_c, load[half_hour + 3])) 32 | 33 | def import_holidays(cursor): 34 | with open("holidays.csv") as f: 35 | for l in f: 36 | date = datetime.datetime.strptime(l[:-1], "%Y-%m-%d") 37 | cursor.execute('INSERT INTO holidays VALUES (?)', (date,)) 38 | 39 | def _reformat_date_jan_1999(): 40 | """Run this function only once, to transform the date format of 41 | temperature_1999.csv into ISO.""" 42 | reader = csv.reader(open("temperatures_1999.csv"), delimiter=";") 43 | for (day, month, temp) in reader: 44 | date = datetime.datetime.strptime("-".join(["1999", month, day]), 45 | "%Y-%m-%d") 46 | print "%s; %s" % (date.strftime("%Y-%m-%d"), temp) 47 | 48 | def clear_db(cursor): 49 | try: 50 | cursor.execute("DROP TABLE training") 51 | except: 52 | pass 53 | try: 54 | cursor.execute("DROP TABLE testing") 55 | except: 56 | pass 57 | try: 58 | cursor.execute("DROP TABLE holidays") 59 | except: 60 | pass 61 | 62 | def setup_db(cursor): 63 | cursor.execute('CREATE TABLE holidays ' \ 64 | '("Timestamp" datetime unique not null primary key)') 65 | for table in ("training", "testing"): 66 | cursor.execute('CREATE TABLE %s ' \ 67 | '("Timestamp" datetime unique not null primary key, ' \ 68 | '"Deg_C" float, 
"MWh" float)' % table) 69 | 70 | if __name__ == "__main__": 71 | with sqlite3.connect(PATH_TO_EUNITE_DB, 72 | detect_types=sqlite3.PARSE_DECLTYPES) as conn: 73 | cursor = conn.cursor() 74 | clear_db(cursor) 75 | setup_db(cursor) 76 | import_holidays(cursor) 77 | import_data("loads.csv", "temperatures.csv", cursor, "training") 78 | import_data("loads_1999.csv", "temperatures_1999.csv", cursor, 79 | "testing") 80 | -------------------------------------------------------------------------------- /sg/data/eunite/readme.txt: -------------------------------------------------------------------------------- 1 | The data in this folder are those used in the EUNITE 2001 load forecasting competition. These have subsequently also been applied by other forecasting studies (e.g. T. Rashid and T. Kechadi, A Practical Approach for Electricity Load Forecasting, World Academy of Science, Engineering and Technology 5 2005). In the competition, the 1997 and 1998 data were used as training sets, while the competition used data from January 1999. 2 | 3 | See web page for more info: 4 | http://neuron.tuke.sk/competition/index.php 5 | 6 | * Web page of competition winners: 7 | http://www.csie.ntu.edu.tw/~cjlin/papers.html 8 | * Chang, Chen & Lin. EUNITE Network Competition: Electricity Load Forecasting: 9 | http://www.csie.ntu.edu.tw/~cjlin/papers/euniteelf.ps.gz 10 | Also saved in this directory as winner_model_article.pdf. 11 | 12 | The data were preprocessed as follows: 13 | 14 | * In Excel, all dates were formatted as ISO (YYYY-MM-DD). 15 | 16 | * All line endings were changed using mac2unix. 17 | 18 | * The data in Holidays.xls was manually transformed so all dates were in a row. 19 | 20 | * Temperature data for 1997 and 1998 (from "competition" folder) were manually opened in Excel, the two years were concatenated, the date format was set to ISO YYYY-MM-DD, and the file was saved as temperatures.csv. 21 | 22 | * temperature.csv and temperature_1999.csv were modified replacing ',' with '.' as decimal separator. 23 | 24 | * Load data for 1997 and 1998 (from "competition" folder) were manually concatenated and saves as loads.csv. 25 | 26 | * Dates for temperature 1999 were reformatted using the function _reformat_date_jan_1999 in import_csv_to_sqlite.py. 
27 | 28 | -------------------------------------------------------------------------------- /sg/data/eunite/test_eunite.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import timedelta as dt 3 | 4 | import sg.utils.testutils as testutils 5 | from sg.data.eunite.eunite import * 6 | 7 | class TestEuniteDataset(testutils.ArrayTestCase): 8 | def setUp(self): 9 | self.data = Dataset(period=dt(days=2), step_length=dt(days=1)) 10 | 11 | def _test_two_days_correct(self, period, temps, loads): 12 | self.assertEqual(len(period), 2 * 48) 13 | temps = [temps[0] for i in range(48)] + [temps[1] for i in range(48)] 14 | fasit = np.array([[t, l] for (t, l) in zip(temps, loads)]) 15 | self.assertArraysEqual(period.data, fasit) 16 | 17 | def test_first_correct(self): 18 | day_1_to_2 = self.data.get_period(0) 19 | temps = [-7.6, -6.3] 20 | loads = [797, 794, 784, 787, 763, 749, 745, 730, 707, 706, 720, 657, 21 | 633, 595, 560, 540, 519, 601, 631, 621, 640, 643, 654, 653, 22 | 688, 688, 690, 690, 684, 679, 674, 677, 644, 660, 654, 683, 23 | 688, 698, 719, 733, 700, 671, 692, 685, 717, 694, 692, 686, 24 | 704, 697, 704, 676, 664, 668, 668, 662, 665, 666, 703, 677, 25 | 669, 660, 650, 672, 648, 682, 692, 724, 727, 739, 739, 733, 26 | 741, 754, 767, 768, 738, 734, 747, 733, 751, 746, 737, 750, 27 | 759, 776, 777, 777, 746, 724, 697, 708, 745, 705, 702, 722] 28 | self._test_two_days_correct(day_1_to_2, temps, loads) 29 | 30 | def test_feb17_18_1998_correct(self): 31 | days = self.data.get_period(365 + 31 + 16) 32 | temps = [4.1, 1.8] 33 | loads = [655, 621, 612, 611, 602, 621, 598, 608, 601, 595, 602, 632, 34 | 662, 699, 715, 671, 685, 723, 745, 711, 725, 734, 690, 708, 35 | 721, 729, 726, 695, 717, 725, 697, 681, 710, 678, 746, 744, 36 | 749, 770, 761, 759, 734, 715, 675, 658, 647, 686, 656, 671, 37 | 702, 698, 672, 659, 665, 655, 630, 637, 633, 672, 674, 715, 38 | 708, 747, 709, 711, 725, 719, 738, 742, 725, 729, 707, 715, 39 | 738, 746, 750, 712, 728, 709, 709, 698, 711, 720, 734, 751, 40 | 759, 782, 760, 773, 729, 707, 647, 660, 659, 643, 648, 658] 41 | self._test_two_days_correct(days, temps, loads) 42 | 43 | def test_last_correct(self): 44 | last_2_days = self.data.get_period(2 * 365 + 31 - 2) 45 | temps = [-7.8, -6.0] 46 | loads = [716, 714, 697, 686, 680, 686, 641, 658, 658, 645, 673, 640, 47 | 630, 604, 615, 628, 634, 660, 699, 696, 702, 732, 726, 717, 48 | 740, 753, 749, 734, 743, 718, 705, 708, 711, 727, 736, 747, 49 | 744, 740, 751, 763, 741, 714, 698, 701, 710, 697, 687, 703, 50 | 712, 720, 694, 698, 679, 648, 665, 656, 677, 651, 623, 604, 51 | 595, 578, 576, 598, 620, 644, 691, 666, 691, 700, 717, 700, 52 | 694, 714, 724, 702, 696, 691, 682, 677, 677, 688, 687, 713, 53 | 708, 735, 734, 743, 711, 717, 702, 698, 694, 691, 691, 704] 54 | self._test_two_days_correct(last_2_days, temps, loads) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /sg/data/sintef/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CXX = g++ 3 | 4 | OPTS = -O3 5 | 6 | .SUFFIXES: 7 | 8 | clean: 9 | rm -f *~ *.o a.out 10 | 11 | %.o: %.cpp force 12 | $(CXX) $(OPTS) $(INCLUDE) -c $< 13 | 14 | %: %.cpp force 15 | $(CXX) $(OPTS) -o $(subst _,-,$@) $< 16 | force: ; 17 | -------------------------------------------------------------------------------- /sg/data/sintef/__init__.py: 
-------------------------------------------------------------------------------- 1 | from userloads import * 2 | -------------------------------------------------------------------------------- /sg/data/sintef/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/__init__.pyc -------------------------------------------------------------------------------- /sg/data/sintef/convert_scikits_to_pandas_hdf5.py: -------------------------------------------------------------------------------- 1 | """Convert SINTEF load data in HDF5 files from scikits.timeseries to 2 | pandas.DataFrames.""" 3 | 4 | import datetime 5 | 6 | import tables as h5 7 | import pandas as pd 8 | import scikits.timeseries.lib.tstables 9 | import scikits.timeseries as ts 10 | 11 | from preprocess_gs2 import PandasH5Storer 12 | 13 | class Converter(PandasH5Storer): 14 | def __init__(self, path_ts_in, path_pd_out): 15 | PandasH5Storer.__init__(self, path_pd_out) 16 | self._h5file_ts = h5.openFile(path_ts_in, "r") 17 | 18 | def __del__(self): 19 | PandasH5Storer.__del__(self) 20 | self._h5file_ts.close() 21 | 22 | def _load_ts_user(self, user_id): 23 | return self._h5file_ts.getNode("/loads/id_" + str(user_id)).read() 24 | 25 | def _convert_dates(self, series_ts): 26 | return [date.datetime for date in series_ts.dates] 27 | 28 | def _make_pd_series_from_scikits_series(self, series_ts): 29 | data = {'Load' : series_ts[:,0], 30 | 'Status Code' : series_ts[:,1]} 31 | dates = self._convert_dates(series_ts) 32 | return pd.DataFrame(data, index=dates) 33 | 34 | def _convert_user_id_lists(self): 35 | """The list of experiment users was stored in the original file. This 36 | must be carried over as a Series in the Pandas file.""" 37 | user_ids = self._h5file_ts.root.loads.cln_pred_exp_ids.read() 38 | self.store_list('user_ids_cln_pred_exp', user_ids) 39 | user_ids = self._h5file_ts.root.loads.user_ids.read() 40 | self.store_list('user_ids', user_ids) 41 | 42 | def _convert_users(self): 43 | user_ids = self._h5file_ts.root.loads.user_ids.read() 44 | for user_id in user_ids: 45 | series_ts = self._load_ts_user(user_id) 46 | series_pd = self._make_pd_series_from_scikits_series(series_ts) 47 | self.store_pd_user(user_id, series_pd) 48 | 49 | def convert(self): 50 | self._convert_user_id_lists() 51 | self._convert_users() 52 | 53 | def _get_targets_from_base_paths(paths): 54 | from os.path import split, join 55 | targets = [] 56 | for path in paths: 57 | dir, base = split(path) 58 | targets.append(join(dir, "pandas_" + base)) 59 | return targets 60 | 61 | def _get_sintef_paths(): 62 | import userloads as ul 63 | bases = (ul.DATA_WITH_DUPES_PATH, ul.DATA_WITHOUT_DUPES_PATH) 64 | targets = _get_targets_from_base_paths(bases) 65 | return zip(bases, targets) 66 | 67 | def convert_sintef_files(interactive=False): 68 | paths = _get_sintef_paths() 69 | print "This script will convert scikits.timeseries to pandas in the " \ 70 | "following files:" 71 | for (path_ts, path_pd) in paths: 72 | print "\n\t%s\nto\n\t%s" % (path_ts, path_pd) 73 | while True: 74 | response = raw_input("\nContinue (y/n)? ") 75 | if response == 'y': 76 | break 77 | elif response == 'n': 78 | return 79 | for (path_ts, path_pd) in paths: 80 | print "Converting %s to %s." % (path_ts, path_pd) 81 | Converter(path_ts, path_pd).convert() 82 | print "Done." 
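# A stand-alone illustration of the per-series conversion performed by
# Converter above: given one two-column scikits.timeseries series (load in
# column 0, status code in column 1), build the equivalent pandas DataFrame.
# The function is illustrative only and is not called anywhere; it relies on
# the pandas import at the top of this module.
def convert_one_series_example(series_ts):
    """Return a DataFrame with 'Load' and 'Status Code' columns."""
    dates = [date.datetime for date in series_ts.dates]
    return pd.DataFrame({'Load': series_ts[:, 0],
                         'Status Code': series_ts[:, 1]},
                        index=dates)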
83 | 84 | if __name__ == "__main__": 85 | convert_sintef_files(interactive=True) 86 | 87 | 88 | -------------------------------------------------------------------------------- /sg/data/sintef/create_full_temp_data.py: -------------------------------------------------------------------------------- 1 | """6.5% of the Porsgrunn temperature readings from the SINTEF files 2 | are missing. They are concatenated with eklima.met.no data from 3 | Gvarv-Nes, and interpolated. Furthermore, two periods have obvious 4 | erroneous data readings, look up the periods 2004-11-11 14:00 -> 5 | 2004-11-22 23:00 and 2005-02-08 08:00 -> 2005-02-27 23:00. These two 6 | periods are replaced with data from eklima. The final stage is 7 | interpolation, so the dataset has hourly readings (eklima only reads 8 | data 4 times a day). Note: the following command must be issued 9 | beforehand, since it stores all the timeseries in a file that is loaded. 10 | 11 | ./gs2-grep.sh -l Grader | python plot_temp.py 12 | """ 13 | 14 | import os 15 | 16 | import numpy.ma as ma 17 | import numpy as np 18 | import pandas as pd 19 | 20 | import sg.data.eklima.parse_eklima_xml as xml 21 | import sg.utils 22 | from sg.globals import SG_DATA_PATH 23 | 24 | _TEMP_DATA = os.path.join(SG_DATA_PATH, "eklima", "Telemark", 25 | "Gvarv-Nes2004-2006.xml") 26 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | def data(): 29 | temp = pd.read_pickle(os.path.join(_PATH_TO_HERE, 'temp_data.pickle')) 30 | temp = temp.sort_index().asfreq("H") 31 | # Extended periods with failed readings, replace with Gvarv 32 | temp['2004-11-11 14:00':'2004-11-21 23:00'] = np.nan 33 | temp['2005-02-08 08:00':'2005-02-27 23:00'] = np.nan 34 | # Shorter periods with failed readings, that we may leave to the cleansing 35 | # to take care of? 36 | # temp['2005-09-07 08:00':'2005-09-08 04:00'] = np.nan 37 | # temp['2006-02-28 05:00':'2006-02-28 04:00'] = np.nan 38 | # temp['2006-06-17 11:00':'2006-06-18 08:00'] = np.nan 39 | # temp['2006-12-19 06:00':'2006-12-21 03:00'] = np.nan 40 | gvarv = xml.parse(_TEMP_DATA)[temp.index[0]:].asfreq("H") 41 | gvarv_aligned = temp.align(gvarv, join="left")[1] 42 | # np.where returned a Pandas Timeseries with old Numpy, but now 43 | # returns an ndarray. Therefore we need to reassign to temp. 44 | temp[:] = np.where(np.isnan(temp), gvarv_aligned, temp) 45 | temp = temp.interpolate() 46 | temp.name = "Temperature" 47 | # Interpolate away a couple of outliers and zero-recordings, or leave to 48 | # cleansing? 
49 | # temp['2004-11-29 08:00'] = np.nan 50 | # temp['2005-11-30 00:00':'2005-11-30 02:00'] = np.nan 51 | # temp['2006-10-27 09:00'] = np.nan 52 | # temp = temp.interpolate() 53 | return temp 54 | 55 | if __name__ == "__main__": 56 | data = data() 57 | sg.utils.plot_time_series([data], ['b.'], ['Porsgrunn + Gvarv temperature']) 58 | -------------------------------------------------------------------------------- /sg/data/sintef/create_full_temp_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/create_full_temp_data.pyc -------------------------------------------------------------------------------- /sg/data/sintef/data_for_eirik.py: -------------------------------------------------------------------------------- 1 | import sg.data.sintef.userloads as ul 2 | import sys 3 | 4 | tf = ul.tempfeeder_nodup() 5 | user_ids = tf.user_ids 6 | for user in user_ids: 7 | loads = tf[user][:,0] 8 | idx = 0 9 | while loads.dates[idx].hour != 0: 10 | idx += 1 11 | while idx < len(loads) - 48: 12 | sys.stdout.write("%d %s False " % (user, loads.dates[idx].strftime("%Y-%M-%d"))) 13 | for i in range(24): 14 | sys.stdout.write("%f " % loads[idx]) 15 | idx += 1 16 | print "" 17 | -------------------------------------------------------------------------------- /sg/data/sintef/find_binary.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * find_binary.cpp 3 | * Created on Tue Feb 07 2012 by Boye A. Hoeverstad. 4 | * 5 | * Given a list of files, as arguments on the command line and/or as input to 6 | * stdin, classify the files as binary or text. 7 | *******************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | 19 | string program_name; 20 | 21 | bool verbose = false; 22 | vector files; 23 | const int chunk_size = 1024*1024*500; 24 | vector buffer(chunk_size); 25 | set characters; 26 | 27 | void 28 | setup_text_character_set() 29 | { 30 | characters.insert(0x0a); // LF 31 | characters.insert(0x0d); // CR 32 | characters.insert(0xe6); // ae 33 | characters.insert(0xf8); // oe 34 | characters.insert(0xe5); // aa 35 | characters.insert(0xc6); // AE 36 | characters.insert(0xd8); // OE 37 | characters.insert(0xc5); // AA 38 | } 39 | 40 | bool 41 | is_binary(istream &stream, string path) 42 | { 43 | stream.read(&buffer[0], chunk_size); 44 | int num_read = stream.gcount(); 45 | for (int n = 0; n < num_read; n++) 46 | { 47 | unsigned char c = static_cast(buffer[n]); 48 | if ((c < 32 || c > 127) && characters.find(c) == characters.end()) 49 | { 50 | if (verbose) 51 | { 52 | cout << "Binary character: " << ios::hex << static_cast(c) 53 | << " at position " << n << " (probably) of file " 54 | << path << ". Context:\n"; 55 | copy(&buffer[max(n-10, 0)], &buffer[min(n+10, num_read)], ostream_iterator(cout, "")); 56 | cout << "\n" << flush; 57 | } 58 | return true; 59 | } 60 | } 61 | return false; 62 | } 63 | 64 | void 65 | exit_with_usage() 66 | { 67 | cerr << "Usage: " << program_name << " inputfile [more inputfiles]\n" 68 | << "Get input files from command line and/or standard input. 
" 69 | << "Output an indication of which files are binary and which are text.\n"; 70 | exit(1); 71 | } 72 | 73 | void 74 | parse_cmdline_arguments(int argc, char *argv[]) 75 | { 76 | if (argc == 2 && (!strcmp("-?", argv[1]) || !strcmp("--help", argv[1]))) 77 | exit_with_usage(); 78 | int next_arg = 1; 79 | if (argc >= 2 && (!strcmp("-v", argv[1]))) 80 | { 81 | next_arg++; 82 | verbose = true; 83 | } 84 | for (; next_arg < argc; next_arg++) 85 | files.push_back(argv[next_arg]); 86 | } 87 | 88 | void 89 | get_stdin_arguments() 90 | { 91 | if (isatty(fileno(stdin))) 92 | return; 93 | string path; 94 | while (getline(cin, path)) 95 | files.push_back(path); 96 | } 97 | 98 | void 99 | get_input_files(int argc, char *argv[]) 100 | { 101 | parse_cmdline_arguments(argc, argv); 102 | get_stdin_arguments(); 103 | if (files.size() == 0) 104 | exit_with_usage(); 105 | } 106 | 107 | int 108 | main(int argc, char *argv[]) 109 | { 110 | program_name = argv[0]; 111 | setup_text_character_set(); 112 | get_input_files(argc, argv); 113 | 114 | set binary_files, text_files; 115 | 116 | for (vector::const_iterator fit = files.begin(); fit != files.end(); fit++) 117 | { 118 | ifstream file(fit->c_str(), ios::binary); 119 | if (!file) 120 | { 121 | cerr << "Failed to open " << *fit << "!\n"; 122 | return 1; 123 | } 124 | cout << "Checking file " << *fit << "...\n" << flush; 125 | bool binary = false; 126 | while (!file.eof()) 127 | if (is_binary(file, *fit)) 128 | binary = true; 129 | if (binary) 130 | binary_files.insert(*fit); 131 | else 132 | text_files.insert(*fit); 133 | } 134 | 135 | cout << "Done.\n\nText files:\n"; 136 | copy(text_files.begin(), text_files.end(), ostream_iterator(cout, "\n")); 137 | cout << "\nBinary files:\n"; 138 | copy(binary_files.begin(), binary_files.end(), ostream_iterator(cout, "\n")); 139 | return 0; 140 | } 141 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2-do.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform a command on each file in gs2.txt 4 | # Example usage: 5 | # ./gs2-do.sh sed -n -e'/Istad Nett/p' 6 | cat gs2.txt | while read path; do "$@" "$path"; done 7 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2-grep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform a grep on each file in gs2.txt 4 | 5 | cat gs2.txt | while read path; do grep $@ "$path"; done 6 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2_short.txt: -------------------------------------------------------------------------------- 1 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/133_4_2_2006010200_2006100123_20061129202638000_154753.exp 2 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/134_4_2_2006010200_2006100123_20061129132145000_154749.exp 3 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006010200_2006010900_20060110203859000_144086.exp 4 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006010900_2006011600_20060117153217000_144312.exp 5 | 
/Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006011600_2006012300_20060124154601000_144524.exp 6 | -------------------------------------------------------------------------------- /sg/data/sintef/make-list-of-gs2-files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -n "Creating a list of all the GS2 files and storing it in ./gs2.txt.. " 4 | find "`pwd`/../../../../data/sintef/raw" -iname '*.exp' -or -iname '*.gs2' >gs2.txt 5 | echo "Done." 6 | 7 | echo -n "Creating a list of a few small GS2 files and storing it in ./gs2_short.txt.. " 8 | find "`pwd`/../../../../data/sintef/raw" -iname '*.exp' -or -iname '*.gs2' -size -3MB |head -n 5 >gs2_short.txt 9 | echo "Done." 10 | -------------------------------------------------------------------------------- /sg/data/sintef/map_EIA_to_anonymous.py: -------------------------------------------------------------------------------- 1 | """This script was written to create a mapping from actual installation IDs to 2 | the anonymized ones, after the anonymization process has been performed. The 3 | motivation for this is that the anonymous IDs have already been used 4 | extensively, but for Buskerud we need to select readings based on actual IDs, 5 | in order to build a load profile from all meters below a certain substation or 6 | other grid connection point. 7 | 8 | The script expects the output of a 'diff' between original and anonymized files 9 | (in that order!) as input. If the files differ in any other way than in IDs, 10 | the script will fail by design.""" 11 | 12 | # A diff output in "normal format" consists of hunks of differences. Each hunk 13 | # has four parts: the change command, the text from the left input file, a 14 | # separator, and the text from the right input file (ref 15 | # http://www.chemie.fu-berlin.de/chemnet/use/info/diff/diff_3.html). 16 | 17 | # Usage: 18 | # cat gs2_buskerud.txt| while read line; do diff "$line" "`echo $line | sed -e's*/unanom**'`"; done |python map_EIA_to_anonymous.py 19 | # 20 | # Note that "for line in $(c or 40 | c. Returns the number of lines changed, or 0 41 | on end of file.""" 42 | line = self._stream.readline() 43 | if len(line) == 0: 44 | return 0 45 | self._lineno += 1 46 | left, right = line[:-1].split('c') 47 | if left != right: 48 | raise RuntimeError("Error in change command, line mismatch between files: %d vs %d." % (left, right)) 49 | try: 50 | (d1_1, d1_2) = [int(d) for d in left.split(',')] 51 | (d2_1, d2_2) = [int(d) for d in right.split(',')] 52 | return d1_2 - d1_1 + 1 53 | except: 54 | pass 55 | try: 56 | d1, d2 = (int(left), int(right)) 57 | return 1 58 | except: 59 | raise RuntimeError("Failed to parse change command.") 60 | 61 | def _next_line(self, desc): 62 | line = self._stream.readline() 63 | if len(line) == 0: 64 | raise RuntimeError("Error while parsing %s: Unexpected end of file." % desc) 65 | self._lineno += 1 66 | return line[:-1] 67 | 68 | def _parse_contents(self, prefix): 69 | line = self._next_line("diff contents") 70 | if len(prefix) > len(line) or line[:len(prefix)] != prefix: 71 | raise RuntimeError("Error while parsing diff contents: Expected '%s', got '%s'." \ 72 | % (prefix, line[:len(prefix)])) 73 | (tag, value) = line[len(prefix):].split(self._id_sep) 74 | if not tag in self._ids: 75 | raise RuntimeError("Error while parsing diff contents: Not a recognized identifier: '%s'." 
% tag) 76 | return value 77 | 78 | def _parse_separator(self): 79 | line = self._next_line("separator") 80 | sep = "---" 81 | if sep != line: 82 | raise RuntimeError("Error while parsing separator: Expected '%s', got '%s'." % (sep, line)) 83 | 84 | def _parse_hunk(self): 85 | num_changes = self._parse_change_command() 86 | if num_changes == 0: 87 | return False 88 | keys, values = [], [] 89 | for _ in range(num_changes): 90 | keys.append(self._parse_contents("< ")) 91 | self._parse_separator() 92 | for key in keys: 93 | self._map[key] = self._parse_contents("> ") 94 | return True 95 | 96 | def parse(self, input_stream): 97 | self._reset(input_stream) 98 | try: 99 | while self._parse_hunk(): 100 | pass 101 | except Exception as e: 102 | print >>sys.stderr, "Error while parsing, probably on line %d." % self._lineno 103 | print >>sys.stderr, "Exception message: " 104 | print >>sys.stderr, e 105 | return 106 | return self._map 107 | 108 | def _interactive(pickle_path): 109 | if ask_user("Input parsed. Save map to %s" % pickle_path, None): 110 | import cPickle as pickle 111 | with open(pickle_path, "wb") as f: 112 | pickle.save(f, id_map) 113 | print "Done." 114 | else: 115 | for (key, value) in id_map.iteritems(): 116 | print key, ":", value 117 | 118 | def _main(): 119 | id_map = DiffParser().parse(sys.stdin) 120 | if id_map is None: 121 | print >>sys.stderr, "Parsing failed." 122 | else: 123 | pickle_path = "id_map.pickle" 124 | import cPickle as pickle 125 | with open(pickle_path, "wb") as f: 126 | pickle.dump(id_map, f) 127 | print "Done, mapping saved to", pickle_path 128 | 129 | if __name__ == "__main__": 130 | _main() 131 | -------------------------------------------------------------------------------- /sg/data/sintef/parse_gs2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | 5 | from sg.utils.timer import SimpleTimer 6 | 7 | _KEYVAL_SEPARATOR = "=" 8 | _VALUE_OPEN = "<" 9 | _VALUE_CLOSE = ">" 10 | _ENTRY_INDICATOR = "#" 11 | _HEADING_INDICATOR = "##" 12 | 13 | def _is_element_type(line, element_indicator): 14 | return len(line) >= len(element_indicator) and \ 15 | line[:len(element_indicator)] == element_indicator 16 | 17 | def _is_entry(line): 18 | return _is_element_type(line, _ENTRY_INDICATOR) 19 | 20 | def _is_heading(line): 21 | return _is_element_type(line, _HEADING_INDICATOR) 22 | 23 | def _make_section(heading_line): 24 | return (heading_line[2:-1], dict()) 25 | 26 | def _close_value(stream): 27 | line = "" 28 | for next_line in stream: 29 | line += next_line 30 | if _VALUE_CLOSE in next_line: 31 | return line 32 | 33 | def _find_value(line, stream): 34 | if _VALUE_OPEN in line: 35 | if not _VALUE_CLOSE in line: 36 | line += _close_value(stream) 37 | contents = line[line.index(_VALUE_OPEN) + 1:line.index(_VALUE_CLOSE)] 38 | return contents.split() 39 | return [line[:-1]] 40 | 41 | def _split_line(line, stream): 42 | try: 43 | separator_idx = line.index(_KEYVAL_SEPARATOR) 44 | key = line[1:separator_idx] 45 | value = _find_value(line[separator_idx + 1:], stream) 46 | return (key, value) 47 | except ValueError: 48 | raise ValueError("Error while splitting a line into key and value " 49 | "constituents! 
Key/value separator sign (" + 50 | _KEYVAL_SEPARATOR + ") not found in input line:\n\t" + 51 | line) 52 | 53 | def _add_key_value(section, line, stream): 54 | key, value = _split_line(line, stream) 55 | contents = section[1] 56 | if key in contents: 57 | raise ValueError("Duplicate key '" + key + 58 | "' in section '" + section[0]) 59 | contents[key] = value 60 | 61 | def _find_first_heading(stream): 62 | for line in stream: 63 | if _is_heading(line): 64 | return line 65 | 66 | def section_generator(stream): 67 | """Iterator generator. Each call to the generated iterator will return the 68 | next section of a GS2 file. A section is represented as a two-element 69 | tuple, where the first element is the section heading and the second 70 | element is a dictionary. The dictionary holds the entries as key/value 71 | pairs. Each value is a list.""" 72 | line = _find_first_heading(stream) 73 | if line is None: 74 | return 75 | section = _make_section(line) 76 | for line in stream: 77 | if _is_heading(line): 78 | yield section 79 | section = _make_section(line) 80 | elif _is_entry(line): 81 | _add_key_value(section, line, stream) 82 | yield section 83 | 84 | def parse_file(path): 85 | """Parse a GS2 file and return a list where each element is a section in 86 | the GS2 file. See section_generator for info on the format of each 87 | section.""" 88 | with open(path, "r") as f: 89 | return [section for section in section_generator(f)] 90 | 91 | def parse_all_generator(pathfile): 92 | """Given a file containing a list of GS2 files, parse all the GS2 files one 93 | after the other.""" 94 | with open(pathfile) as paths: 95 | for path in paths: 96 | path = path[:-1] 97 | yield (path, parse_file(path)) 98 | 99 | 100 | if __name__ == '__main__': 101 | from unittest import main 102 | main(module='test_'+__file__[:-3]) 103 | -------------------------------------------------------------------------------- /sg/data/sintef/parse_gs2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/parse_gs2.pyc -------------------------------------------------------------------------------- /sg/data/sintef/plot_temp.py: -------------------------------------------------------------------------------- 1 | """Takes as input a list of gs2-files that has temperature data, reads 2 | them and plots them. 3 | Example: grep -l Grader *.exp | python path/to/plot_temp.py """ 4 | 5 | import string 6 | import sys 7 | import datetime 8 | 9 | import pandas as pd 10 | 11 | import sg.utils 12 | import sg.data.sintef.parse_gs2 as parse 13 | 14 | def collect_and_plot(files): 15 | TS = [] 16 | location = [] 17 | for f in files: 18 | temperatures = [ section[1] for section in parse.parse_file(f)[1:-1] if section[1]['Plant'] == ['tmp'] ] 19 | for t in temperatures: 20 | if t['Step'][0] != '0000-00-00.01:00:00': 21 | print 'Not hourly readings of temperature. Abort.' 22 | break 23 | start_time = datetime.datetime.strptime(t['Start'][0], "%Y-%m-%d.%H:%M:%S") 24 | dates = pd.date_range(start=start_time, periods=len(t['Value']), 25 | freq='H') 26 | data = [ float(value.rsplit('/')[0]) for value in t['Value'] ] 27 | TS.append(pd.Series(data=data, index=dates)) 28 | if location and t['Installation'][0] != location: 29 | print 'Location changed during reading of gs2 files. Probably some bad grouping of gs2 files.' 
30 | location = t['Installation'][0] 31 | if TS: 32 | all_series = pd.concat(TS).sort_index() 33 | all_series_no_duplicates = all_series.groupby(level=0).first() 34 | all_series_no_duplicates.dump('temp_data.pickle') 35 | sg.utils.plot_time_series([all_series_no_duplicates], ['b-'], [location]) 36 | else: 37 | print 'No temperature data.' 38 | 39 | if __name__ == "__main__": 40 | if not sys.stdin.isatty(): 41 | collect_and_plot([ s.rstrip('\n') for s in sys.stdin.readlines() ]) 42 | 43 | -------------------------------------------------------------------------------- /sg/data/sintef/plot_temp_misc.py: -------------------------------------------------------------------------------- 1 | """Takes as input a list of gs2-files that has temperature data, reads 2 | them and plots them. 3 | Example: grep -l Grader *.exp | python path/to/plot_temp.py """ 4 | 5 | import string 6 | import matplotlib.pyplot as plt 7 | import sg.data.sintef.parse_gs2 as parse 8 | import sys, os 9 | import scikits.timeseries as ts 10 | import scikits.timeseries.lib.plotlib as tpl 11 | import sg.utils 12 | import sg.data.eklima.parse_eklima_xml as xml 13 | 14 | def _collect_and_plot(files): 15 | TS = [] 16 | location = [] 17 | for f in files: 18 | temperatures = [ section[1] for section in parse.parse_file(f)[1:-1] if section[1]['Plant'] == ['tmp'] ] 19 | for t in temperatures: 20 | if t['Step'][0] != '0000-00-00.01:00:00': 21 | print 'Not hourly readings of temperature. Abort.' 22 | break 23 | dates = ts.date_array(start_date=ts.Date('H', t['Start'][0]), length=len(t['Value'])) 24 | data = [ float(value.rsplit('/')[0]) for value in t['Value'] ] 25 | TS.append(ts.TimeSeries(data=data, dates=dates)) 26 | if location and t['Installation'][0] != location: 27 | print 'Location changed during reading of gs2 files. Probably some bad grouping of gs2 files.' 28 | location = t['Installation'][0] 29 | if TS: 30 | path = '/Users/tidemann/Documents/NTNU/devel/data/eklima/Telemark/' 31 | for file in os.listdir(path): 32 | try: 33 | series = xml.parse(path + file) 34 | sg.utils.plot_time_series([ts.concatenate((TS)), series], ['b-','r-'], [location, file]) 35 | except: 36 | print file, 'had no data.' 37 | else: 38 | print 'No temperature data.' 
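# A pandas-based sketch of the same GS2 handling (mirrors plot_temp.py rather
# than the scikits.timeseries code above): turn every Time-series section of a
# GS2 file into a pandas Series. It assumes hourly steps
# (Step=0000-00-00.01:00:00) and 'value//status' readings, as in the sample
# files in this directory; gs2_time_series is an illustrative name only, and
# 'parse' is the parse_gs2 module imported at the top of this file.
import datetime

import pandas as pd

def gs2_time_series(path):
    """Return one pandas Series per Time-series section in a GS2 file."""
    series = []
    for heading, entries in parse.parse_file(path):
        if heading != "Time-series":
            continue
        start = datetime.datetime.strptime(entries['Start'][0],
                                           "%Y-%m-%d.%H:%M:%S")
        index = pd.date_range(start=start, periods=len(entries['Value']),
                              freq='H')
        values = [float(v.rsplit('/')[0]) for v in entries['Value']]
        series.append(pd.Series(values, index=index))
    return series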
39 | 40 | if __name__ == "__main__": 41 | if not sys.stdin.isatty(): 42 | _collect_and_plot([ s.rstrip('\n') for s in sys.stdin.readlines() ]) 43 | 44 | -------------------------------------------------------------------------------- /sg/data/sintef/test_parse_gs2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import StringIO 4 | 5 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 6 | _PATH_TO_GS2_TESTDATA = os.path.join(_PATH_TO_HERE, "testfile.gs2") 7 | _PATH_TO_GS2_TESTDATA_SHORT = os.path.join(_PATH_TO_HERE, "testfile_short.gs2") 8 | 9 | import parse_gs2 as gs2 10 | 11 | class TestGS2Parser(unittest.TestCase): 12 | def test__is_entry(self): 13 | self.assertTrue(gs2._is_entry("#Sum=0\n")) 14 | self.assertFalse(gs2._is_entry("")) 15 | self.assertFalse(gs2._is_entry(" something is wrong")) 16 | 17 | def test__is_heading(self): 18 | self.assertTrue(gs2._is_heading("##Heading")) 19 | self.assertFalse(gs2._is_heading("#Entry")) 20 | self.assertFalse(gs2._is_heading("Neither")) 21 | 22 | def _set_up_section(self, heading): 23 | line = "##%s\n" % heading 24 | return gs2._make_section(line) 25 | 26 | def test_make_section(self): 27 | heading = "Time-series" 28 | section = self._set_up_section(heading) 29 | self.assertEqual(section[0], heading) 30 | self.assertIs(type(section[1]), dict) 31 | 32 | def test_split_single_line(self): 33 | key, value = gs2._split_line("#No-of-values=168\n", None) 34 | self.assertEqual(key, "No-of-values") 35 | self.assertEqual(value, ["168"]) 36 | 37 | def test_split_multi_line(self): 38 | line = "#Value=<\n" 39 | stream = StringIO.StringIO(" 2.2//0\n 2.5//0\n 2.8//0\n>\n") 40 | key, value = gs2._split_line(line, stream) 41 | self.assertEqual(key, "Value") 42 | self.assertEqual(value, ["2.2//0", "2.5//0", "2.8//0"]) 43 | 44 | def test_split_raises(self): 45 | self.assertRaises(ValueError, gs2._split_line, 46 | "#No-of-values:168\n", None) 47 | 48 | def _set_up_section_generator(self, path): 49 | stream = open(path, "r") 50 | return [section for section in gs2.section_generator(stream)] 51 | 52 | def test_section_generator(self): 53 | parsed = self._set_up_section_generator(_PATH_TO_GS2_TESTDATA_SHORT) 54 | self.assertEqual(len(parsed), 4) 55 | self.assertEqual(parsed[0][0], "Start-message") 56 | self.assertEqual(parsed[1][0], "Time-series") 57 | self.assertEqual(parsed[2][0], "Time-series") 58 | self.assertEqual(parsed[3][0], "End-message") 59 | self.assertEqual(parsed[1][1]["Value"], 60 | ["0//0", "1.285//0", "0//0", "1.285//0", "0//0", 61 | "1.285//0", "0//0"]) 62 | 63 | def test_parse_gs2(self): 64 | parsed_manual = self._set_up_section_generator(_PATH_TO_GS2_TESTDATA) 65 | parsed_auto = gs2.parse_file(_PATH_TO_GS2_TESTDATA) 66 | self.assertEqual(parsed_manual, parsed_auto) 67 | self.assertEqual(len(parsed_auto), 29) 68 | self.assertEqual(parsed_auto[0][0], "Start-message") 69 | for i in range(1, 28): 70 | self.assertEqual(parsed_auto[i][0], "Time-series") 71 | self.assertEqual(parsed_auto[-1][0], "End-message") 72 | 73 | def test_find_first_heading(self): 74 | stream = StringIO.StringIO("##Start-message\n#Id=PD-gs2exp\n") 75 | line = gs2._find_first_heading(stream) 76 | self.assertEqual(line, "##Start-message\n") 77 | 78 | def test_find_first_heading_no_heading(self): 79 | stream = StringIO.StringIO("#Start-message\n#Id=PD-gs2exp\n") 80 | line = gs2._find_first_heading(stream) 81 | self.assertIs(line, None) 82 | 83 | def test_find_first_heading_empty_file(self): 84 | stream = 
StringIO.StringIO("") 85 | line = gs2._find_first_heading(stream) 86 | self.assertIs(line, None) 87 | 88 | if __name__ == '__main__': 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /sg/data/sintef/test_userloads.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import sg.utils.testutils as testutils 8 | 9 | from userloads import * 10 | 11 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | class TestUserLoads(testutils.ArrayTestCase): 14 | def setUp(self): 15 | pass 16 | 17 | def tearDown(self): 18 | pass 19 | 20 | def test_invalid_user(self): 21 | self.assertRaises(KeyError, tempfeeder_dup().__getitem__, -1) 22 | 23 | def test_users_equal(self): 24 | users_dup = tempfeeder_dup().user_ids 25 | users_nodup = tempfeeder_nodup().user_ids 26 | self.assertEqual(users_dup, users_nodup) 27 | 28 | def test_num_users(self): 29 | users_dup = tempfeeder_dup().user_ids 30 | self.assertEqual(len(users_dup), 2416) 31 | 32 | def test_getitem(self): 33 | user_id = 83169400 34 | ul = UserLoads(tempfeeder_dup().path) 35 | self.assertNotIn(user_id, ul.loads) 36 | user_loads = ul[user_id] 37 | self.assertIn(user_id, ul.loads) 38 | self.assertEqual(len(user_loads), 36602) 39 | self.assertIs(type(user_loads), pd.DataFrame) 40 | self.assertNaNArraysEqual(user_loads.ix[14077], np.array([1., np.nan])) 41 | self.assertArraysEqual(user_loads.ix[-1], np.array([1., 0.])) 42 | 43 | def test_get_set_get(self): 44 | user_id = 29605779 45 | idx = 15689 46 | ul = UserLoads(tempfeeder_nodup().path) 47 | user_loads = ul[user_id] 48 | user_loads.ix[idx] = np.array([123, 12]) 49 | user_loads.ix[-1] = np.array([124, 14]) 50 | self.assertArraysEqual(ul[user_id].ix[idx], np.array([123, 12])) 51 | self.assertArraysEqual(ul[user_id].ix[-1], np.array([124, 14])) 52 | ul.read(user_id) 53 | self.assertNaNArraysEqual(user_loads.ix[14077], np.array([0., np.nan])) 54 | self.assertArraysEqual(ul[user_id].ix[-1], np.array([3., 0.])) 55 | 56 | def test_pop(self): 57 | user_id = 448601 58 | ul = tempfeeder_dup() 59 | self.assertNotIn(user_id, ul.loads) 60 | loads = ul[user_id] 61 | self.assertIn(user_id, ul.loads) 62 | ul.pop(user_id) 63 | self.assertNotIn(user_id, ul.loads) 64 | 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | 69 | -------------------------------------------------------------------------------- /sg/data/sintef/test_userloads.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/test_userloads.pyc -------------------------------------------------------------------------------- /sg/data/sintef/testfile_short.gs2: -------------------------------------------------------------------------------- 1 | ##Start-message 2 | #Id=PD-gs2exp -> LogNo: 2724 3 | #Message-type=Settlement-data 4 | #Version=1.2 5 | #Time=2006-01-10.20:46:36 6 | #To=9999 7 | #From=981915550 8 | #GMT-reference=1 9 | #Description=Verdier fra 'MVS Buskerud Kraft Nett' 10 | 11 | 12 | ##Time-series 13 | #Start=2006-01-02.00:00:00 14 | #Stop=2006-01-08.24:00:00 15 | #Step=0000-00-00.01:00:00 16 | #Unit=kWh 17 | #Installation=16807 18 | #Plant=3 19 | #Meter-location=1 20 | #Value=< 21 | 0//0 22 | 1.285//0 23 | 0//0 24 | 1.285//0 25 | 0//0 26 | 1.285//0 27 | 0//0 28 | > 29 | #No-of-values=168 30 | 
#Sum=204.321 31 | #Description=4041 "ENERGI" 32 | 33 | ##Time-series 34 | #Start=2006-01-02.00:00:00 35 | #Stop=2006-01-08.24:00:00 36 | #Step=0000-00-00.01:00:00 37 | #Unit=kWh 38 | #Installation=282475249 39 | #Plant=3 40 | #Meter-location=1 41 | #Value=< 42 | 2.2//0 43 | 2.5//0 44 | 2.8//0 45 | 3.9//0 46 | > 47 | #No-of-values=168 48 | #Sum=464.5 49 | #Description=4041 "ENERGI" 50 | 51 | ##End-message 52 | #Id=PD-gs2exp -> LogNo: 2724 53 | 54 | -------------------------------------------------------------------------------- /sg/data/sintef/unique.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import collections 3 | import pprint 4 | 5 | tags = collections.defaultdict(int) 6 | for line in sys.stdin: 7 | tags[line[:-1]] += 1 8 | 9 | for key in tags: 10 | print key, ":", tags[key] 11 | 12 | -------------------------------------------------------------------------------- /sg/data/sintef/userloads.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/userloads.pyc -------------------------------------------------------------------------------- /sg/data/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import sg.utils.testutils as testutils 9 | import sg.data.bchydro as bchydro 10 | from dataset import * 11 | 12 | class DatasetTester(testutils.ArrayTestCase): 13 | def setUp(self): 14 | month_index = pd.period_range(start='2005-01-01', periods=12, freq='M') 15 | day_index = pd.period_range(start='2005-01-01', periods=365, freq='D') 16 | hour_index = pd.period_range(start='2005-01-01', periods=365*24, freq='H') 17 | self.months = pd.Series(np.arange(12), index=month_index) 18 | self.days = pd.Series(np.arange(365), index=day_index) 19 | self.hours = pd.Series(np.arange(365*24), index=hour_index) 20 | self.period = datetime.timedelta(days = 9) 21 | self.month_data = Dataset(self.months, self.period) 22 | self.day_data = Dataset(self.days, self.period) 23 | self.hour_data = Dataset(self.hours, self.period, 24 | datetime.timedelta(hours = 12)) 25 | 26 | def test_calculate_period(self): 27 | self.assertEqual(self.month_data._period_length, 1) 28 | self.assertEqual(self.day_data._period_length, 9) 29 | self.assertEqual(self.hour_data._period_length, 9 * 24) 30 | 31 | def test_number_of_periods(self): 32 | self.assertEqual(self.month_data.num_periods, 12) 33 | self.assertEqual(self.day_data.num_periods, 357) 34 | self.assertEqual(self.hour_data.num_periods, 356 * 2) 35 | 36 | def test_get_last_period(self): 37 | last_month = self.month_data.get_period(self.month_data.num_periods - 1) 38 | self.assertEqual(len(last_month), 1) 39 | self.assertEqual(last_month[0], 11) 40 | last_days = self.day_data.get_period(self.day_data.num_periods - 1) 41 | self.assertEqual(len(last_days), 9) 42 | self.assertArraysEqual(last_days, self.days[-9:]) 43 | hour_periods = self.hour_data.num_periods 44 | last_hours = self.hour_data.get_period(hour_periods - 1) 45 | self.assertEqual(len(last_hours), 9 * 24) 46 | self.assertArraysEqual(last_hours, self.hours[-9*24-12:-12]) 47 | 48 | def test_get_first_period(self): 49 | self.assertArraysEqual(self.month_data.get_period(0), self.months[0:1]) 50 | self.assertArraysEqual(self.day_data.get_period(0), self.days[0:9]) 51 | 
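        # Sanity note: the hour-based dataset uses a 9-day window stepped by 12
        # hours, so period i covers self.hours[i*12 : i*12 + 9*24]; with 365
        # days of hourly data that gives (365 - 9)*2 = 712 periods, matching
        # test_number_of_periods above and test_get_random_period below.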
self.assertArraysEqual(self.hour_data.get_period(0), self.hours[0:9*24]) 52 | 53 | def test_get_second_period(self): 54 | self.assertArraysEqual(self.month_data.get_period(1), self.months[1:2]) 55 | self.assertArraysEqual(self.day_data.get_period(1), self.days[1:10]) 56 | self.assertArraysEqual(self.hour_data.get_period(1), 57 | self.hours[12:9*24+12]) 58 | 59 | def test_get_random_period(self): 60 | for i in range(100): 61 | (ts, number) = self.month_data.get_random_period(True) 62 | self.assertArraysEqual(ts, self.months[number:number+1]) 63 | (ts, number) = self.day_data.get_random_period(True) 64 | self.assertArraysEqual(ts, self.days[number:number+9]) 65 | (ts, number) = self.hour_data.get_random_period(True) 66 | index = self.hour_data.index_of(number) 67 | self.assertEqual(index, number * 12) 68 | self.assertArraysEqual(ts, self.hours[index:index+9*24]) 69 | 70 | class MiscTester(testutils.ArrayTestCase): 71 | def test_remove_one_outlier(self): 72 | dataset = np.array([0, 1, 2, 0, 3, 4, 0, 5]) 73 | remove_outlier_set_previous(dataset, outlier_val=0) 74 | self.assertArraysEqual(dataset, np.array([0, 1, 2, 2, 3, 4, 4, 5])) 75 | 76 | def test_remove_consecutive_outliers(self): 77 | dataset = np.array([0, 1, 0, 0, 0, 4, 0, 5]) 78 | retset = remove_outlier_set_previous(dataset) 79 | self.assertArraysEqual(retset, np.array([0, 1, 1, 1, 1, 4, 4, 5])) 80 | 81 | def test_remove_other_outliers(self): 82 | dataset = np.array([0, 1, 2, 0, 3, 4, 0, 5]) 83 | remove_outlier_set_previous(dataset, outlier_val=2) 84 | self.assertArraysEqual(dataset, np.array([0, 1, 1, 0, 3, 4, 0, 5])) 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /sg/data/yr.no/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains data downloaded in XML and GRIB format from 2 | yr.no, cf. the guidelines at http://om.yr.no/verdata/xml/ and 3 | http://om.yr.no/verdata/grib/. 4 | 5 | ================================================== 6 | 7 | Preparation: 8 | ------------ 9 | 10 | The file noreg.txt was downloaded from yr.no: 11 | $ wget http://fil.nrk.no/yr/viktigestader/noreg.txt 12 | 13 | The file noreg_viktige.txt is a filtered version of noreg.txt in which 14 | all places with priority 99 have been removed, cf. the recommendation on 15 | yr.no: 16 | $ /store/gnu/bin/awk --field-separator='\t' '{ if ($4 != 99) print $0}' noreg.txt >noreg_viktige.txt 17 | 18 | Likewise, noreg_viktige_namn.txt contains only the place names and numbers: 19 | $ /store/gnu/bin/awk --field-separator='\t' '{print $1, $2}' noreg_viktige.txt >noreg_viktige_namn.txt 20 | 21 | The weather forecasts are collected in subdirectories of the directory "steder". 22 | Every directory that is to hold forecasts (in principle all the leaf nodes 23 | of the directory tree) must contain a file called "address.txt". This file 24 | must contain one (and only one) line from noreg.txt, which 25 | specifies the place and the URL of the forecast to be downloaded. 26 | 27 | For the "simple" places (no æøå in the name and no subdirectories), 28 | address.txt was generated automatically (the sed part removes DOS line breaks): 29 | $ for d in Bergen Drammen Oslo Stavanger Steinkjer Trondheim; do /store/gnu/bin/awk --field-separator='\t' "{ if (\$2 == \"$d\") print \$0}" noreg_viktige.txt | sed -e's/forecast\.xml.*$/forecast.xml/' >steder/$d/address.txt; done 30 | 31 | For the other places, address.txt was created manually using 32 | copy-and-paste technology.
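For reference, the only structural assumptions get-forecasts.sh makes about
address.txt are that the fields are tab-separated and that the last field is
the URL of the place's forecast.xml; the script extracts that last field with
awk and rewrites "forecast.xml" into "forecast_hour_by_hour.xml" before
downloading. Schematically, a line therefore has the form

  <number>	<place name>	...	<URL ending in forecast.xml>

where the remaining columns follow the noreg.txt layout and are not used by
the script.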
33 | 34 | ================================================== 35 | 36 | Downloading: 37 | ------------ 38 | 39 | The download itself is done by the script "get-forecasts.sh". 40 | 41 | It looks up all the files called address.txt and extracts the 42 | XML address from them. The address is changed to fetch the hourly forecast 43 | rather than the 6-hour one. The forecast and the wget log are stored in the 44 | respective directories. 45 | 46 | The GRIB file for northern Europe is then downloaded and stored in the 47 | directory "GRIB". 48 | -------------------------------------------------------------------------------- /sg/data/yr.no/crontab.txt: -------------------------------------------------------------------------------- 1 | # Don't redirect error messages -> get mail alert on failure 2 | 0 10 * * * $HOME/sg-shared/src/sg/data/yr.no/get-forecasts.sh >>sg-shared/data/yr.no/forecasts_log.sh 3 | 0 22 * * * $HOME/sg-shared/src/sg/data/yr.no/get-forecasts.sh >>sg-shared/data/yr.no/forecasts_log.sh 4 | 5 | -------------------------------------------------------------------------------- /sg/data/yr.no/get-forecasts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | find=/store/gnu/bin/find 4 | awk=/store/gnu/bin/awk 5 | sed=/store/gnu/bin/sed 6 | date=/store/gnu/bin/date 7 | wget=/opt/pkg/bin/wget 8 | 9 | BASE_DIR="$HOME/sg-shared/data/yr.no" 10 | GRIB_DIR="$BASE_DIR/GRIB" 11 | PLACES_DIR="$BASE_DIR/steder" 12 | ADDRESSES="address.txt" 13 | FORECAST_FILE="forecast_hour_by_hour.xml" 14 | NOW=`$date --iso-8601=hours` 15 | OUTPUT_FILE="${NOW}_forecast_hour_by_hour.xml" 16 | LOG_FILE="wget_log.txt" 17 | 18 | echo -n "Retrieving hourly forecasts for $NOW:" 19 | $find "$PLACES_DIR" -type f -name "$ADDRESSES" | while read ADDRESS_FILE; do 20 | PLACE_DIR=`dirname "$ADDRESS_FILE"` 21 | PLACE=`echo $PLACE_DIR | $awk --field-separator='/' '{print $NF}'` 22 | URL=`$awk --field-separator='\t' '{print $NF}' "$ADDRESS_FILE" | $sed -e"s/forecast.xml\$/$FORECAST_FILE/"` 23 | echo -n " $PLACE: " 24 | OUTPUT_PATH="$PLACE_DIR/$OUTPUT_FILE" 25 | $wget --output-document="$OUTPUT_PATH" "$URL" >>"$PLACE_DIR/$LOG_FILE" 2>&1 26 | if [ "$?" == 0 ]; then 27 | echo -n "Ok." 28 | else 29 | echo -n "FAILED!" 30 | rm -f "$OUTPUT_PATH" 31 | echo "Failed to retrieve forecasts for $PLACE from yr.no" >&2 32 | fi 33 | sleep 1 34 | done 35 | echo "" 36 | 37 | echo -n "Retrieving GRIB forecasts for $NOW..." 38 | OUTPUT_FILE="${NOW}_metno-neurope.grb" 39 | OUTPUT_PATH="$GRIB_DIR/$OUTPUT_FILE" 40 | URL="http://api.met.no/weatherapi/gribfiles/1.0/?area=north_europe;content=weather;content_type=application/octet-stream;" 41 | $wget --no-verbose --output-document="$OUTPUT_PATH" "$URL" >>"$GRIB_DIR/$LOG_FILE" 2>&1 42 | if [ "$?" == 0 ]; then 43 | echo "Ok." 44 | else 45 | echo "FAILED!"
46 | rm -f "$OUTPUT_PATH" 47 | echo "Failed to retrieve GRIB forecast from yr.no" >&2 48 | fi 49 | 50 | -------------------------------------------------------------------------------- /sg/globals.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 4 | SG_DATA_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "data") 5 | SG_PAPERS_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "papers") 6 | SG_SIM_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "simulations") 7 | SG_MODELS_PATH = os.path.join(_PATH_TO_HERE, "models") 8 | -------------------------------------------------------------------------------- /sg/globals.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/globals.pyc -------------------------------------------------------------------------------- /sg/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/__init__.py -------------------------------------------------------------------------------- /sg/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/__init__.pyc -------------------------------------------------------------------------------- /sg/models/demo_cleansing.py: -------------------------------------------------------------------------------- 1 | """Demonstrate the cleansing algorithm on datasets of varying length.""" 2 | 3 | import sys 4 | import time 5 | from datetime import timedelta as dt 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import sg.data.sintef.userloads as ul 12 | import spclean as cln 13 | from sg.utils.timer import SimpleTimer 14 | 15 | def _get_smoother(): 16 | # Set slow_smoother to True in order to see the actual time consumed by the 17 | # B-spline smoothing operation. If set to False, will use the default 18 | # smoother where the roughness matrices are cached. 19 | slow_smoother = True 20 | if slow_smoother: 21 | print "Using slow, analytic, non-caching smoother." 22 | return cln.BSplineAnalyticSmoother 23 | else: 24 | print "Using not quite so slow, caching smoother." 25 | return cln.BSplineSmoother 26 | 27 | # Load a dataset containing power load history. This set is divided into 28 | # training and test data, we only keep the traning part for now. 29 | dataset, _ = ul.total_experiment_load() 30 | 31 | # Set parameters for the B-spline smoother/cleanser 32 | smoothness = 10 33 | zscore = 0.5 34 | # Try smoothing/cleansing different time series lengths 35 | for hindsight_days in [1]: 36 | # Select data 37 | num_hours = 24 * hindsight_days 38 | data = dataset["Load"][-num_hours:].copy() 39 | # Some output and rough timing 40 | print "Cleansing %d hours of data with smoothness %.2f, z-score %.2f..." 
% \ 41 | (num_hours, smoothness, zscore) 42 | sys.stdout.flush() 43 | start_time = time.time() 44 | # This is the part that takes time 45 | smoother = _get_smoother()(data, smoothness) 46 | cleaner = cln.RegressionCleaner(smoother, zscore) 47 | cleaned, _ = cleaner.get_cleaned_data( 48 | method=cln.RegressionCleaner.replace_with_bound) 49 | # Wrap up and plot the result 50 | end_time = time.time() 51 | print "Done in %s." % SimpleTimer.period_to_string(start_time, end_time) 52 | 53 | print cleaned 54 | sys.stdout.flush() 55 | plt.figure() 56 | data.plot(style='r', label='Raw load') 57 | 58 | spline = pd.TimeSeries(data=smoother.splev(range(len(cleaned))), 59 | index=cleaned.index) 60 | spline.plot(style='g', label='Smoothing spline') 61 | 62 | # THE SAUSAGE! 63 | lower, upper = cleaner.get_confidence_interval() 64 | ax = plt.gca() 65 | ax.fill_between(cleaned.index, lower, upper, facecolor='g', alpha=0.1) 66 | 67 | cleaned.plot(style='b', label='Cleaned load') 68 | plt.legend(loc=3) 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /sg/models/esn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/esn.pyc -------------------------------------------------------------------------------- /sg/models/exp_cleaning.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import timedelta as dt 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | import sg.data.bchydro as bc 8 | import spclean as cln 9 | 10 | def clean_all_bc_data(period_days=7, step_days=6, 11 | smoothnesses=(0.1, 1, 3, 6, 10, 100)): 12 | """Clean the entire BC Hydro dataset period by period with the given 13 | smoothnesses. By default clean a week at a time with 1 day overlap (step 14 | length 6 days). 15 | 16 | Returns a dictionary keyed on the smoothness, where the values are lists of 17 | tuples, each tuple consisting of the period number and the outlier indices 18 | for all periods with outliers.""" 19 | 20 | dataset = bc.Dataset(period=dt(days=period_days), 21 | step_length=dt(days=step_days)) 22 | outliers_at = dict() 23 | for smoothness in smoothnesses: 24 | outliers_at[smoothness] = cln.clean_entire_dataset(dataset, smoothness) 25 | print "cleaned with smoothness", smoothness 26 | return (dataset, outliers_at) 27 | 28 | def clean_and_process_bc_data(period_days=7, step_days=6, 29 | smoothnesses=(0.1, 1, 3, 6, 10, 100)): 30 | """Clean BC data using clean_all_bc_data. 
Then plot the data to show the 31 | distribution of outliers per period and smoothness.""" 32 | data, outliers_at = clean_all_bc_data(period_days, step_days, smoothnesses) 33 | x = np.arange(data.num_periods) 34 | y_at = dict() 35 | for (smoothness, outliers) in outliers_at.iteritems(): 36 | y = np.zeros(data.num_periods) 37 | for (period, outlier_indices) in outliers: 38 | y[period] = len(outlier_indices) 39 | y_at[smoothness] = y 40 | plt.figure() 41 | plt.hold(True) 42 | plt.title("Number of cleaned points for various smoothnesses") 43 | axes = plt.gcf().gca() 44 | for (smoothness, y) in y_at.iteritems(): 45 | plt.figure() 46 | plt.plot(x, y, 'x') 47 | plt.title("Number of cleaned points for smoothness %.2f" % smoothness) 48 | axes.plot(x, y, 'x', label="Smoothness %.2f" % smoothness) 49 | plt.figure() 50 | plt.hist(y) 51 | plt.title("Histogram of number of cleaned points for " \ 52 | "smoothness %.2f" % smoothness) 53 | axes.legend() 54 | 55 | def show_max_cleaning(): 56 | week = 264 57 | dataset = bc.Dataset(period=dt(days=7), step_length=dt(days=6)) 58 | period = dataset.get_period(week) 59 | smoother = cln.BSplineSmoother(period, smoothness=3) 60 | cleaner = cln.RegressionCleaner(smoother, zscore=0.67) 61 | (clean_data, outliers) = cleaner.get_cleaned_data( 62 | cln.RegressionCleaner.replace_with_estimate) 63 | plt.figure() 64 | plt.hold(True) 65 | n = len(smoother.dataset) 66 | knots = smoother.knots 67 | t = np.linspace(knots[0], knots[-1], n * 25) 68 | y = smoother.splev(t) 69 | plt.hold(True) 70 | plt.plot(t, y) 71 | x = np.linspace(knots[0], knots[-1], n) 72 | plt.plot(x, smoother.dataset, 'mx') 73 | (lower, upper) = cleaner.get_confidence_interval() 74 | 75 | plt.plot(lower, 'g-') 76 | plt.plot(upper, 'g-') 77 | if len(outliers) > 0: 78 | print "Drawing %d outliers." % len(outliers) 79 | plt.plot(outliers, clean_data[outliers], 'r*', label="Cleaned data") 80 | else: 81 | print "No outliers!" 
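# A minimal sketch of how the dictionary returned by clean_all_bc_data is meant
# to be consumed (clean_and_process_bc_data above does essentially this when it
# builds the per-smoothness plots):
#
#   data, outliers_at = clean_all_bc_data(period_days=7, step_days=6,
#                                         smoothnesses=(1, 10))
#   for period, outlier_indices in outliers_at[10]:
#       print "period", period, "has", len(outlier_indices), "outliers"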
82 | 83 | if __name__ == "__main__": 84 | show_max_cleaning() 85 | print "Done cleaning, showing plot" 86 | plt.show() 87 | -------------------------------------------------------------------------------- /sg/models/filter-R-messages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | 6 | def rfilter(stream): 7 | messages = ["In log\(s2\) : NaNs produced", 8 | "^Warning message[s]*:", 9 | "[Ii]n arima\(x = loads, order = order, xreg = temp_hc\)", 10 | "non-stationary AR part from CSS", 11 | "possible convergence problem: optim gave code=", 12 | "Error in optim\(init\[mask\], armafn", 13 | "non-finite finite-difference value", 14 | "There were [0-9]* warnings \(use warnings\(\) to see them\)"] 15 | re_objs = [re.compile(msg) for msg in messages] 16 | for line in stream: 17 | do_filter = False 18 | for prog in re_objs: 19 | if prog.search(line) is not None: 20 | do_filter = True 21 | if not do_filter: 22 | print line[:-1] 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) == 1: 26 | rfilter(sys.stdin) 27 | else: 28 | for path in sys.argv[1:]: 29 | with open(path, "r") as f: 30 | rfilter(f) 31 | 32 | -------------------------------------------------------------------------------- /sg/models/ga.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/ga.pyc -------------------------------------------------------------------------------- /sg/models/genome_evaluator.py: -------------------------------------------------------------------------------- 1 | """Use this program to evaluate one genome at a time, read from standard 2 | input.""" 3 | 4 | import sys 5 | import ast 6 | import traceback 7 | import random 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | import sg.utils.pyevolve_utils as pu 12 | import sg.utils 13 | import ga 14 | import sg.data.sintef.userloads as ul 15 | import load_prediction as lp 16 | from load_prediction_ar import * 17 | from load_prediction_ar24 import * 18 | from load_prediction_arima import * 19 | from load_prediction_dshw import * 20 | from load_prediction_esn import * 21 | from load_prediction_esn24 import * 22 | try: 23 | from load_prediction_CBR import * 24 | from load_prediction_wavelet import * 25 | from load_prediction_wavelet24 import * 26 | except ImportError: 27 | print >>sys.stderr, "Genome evaluator can't import CBR/wavelet modules, probably some of the dependencies are not installed." 28 | 29 | options = None 30 | def get_options(): 31 | global options 32 | parser = lp.prediction_options() 33 | parser = lp.ga_options(parser) 34 | parser = lp.data_options(parser) 35 | parser.add_option("--model", dest="model", help="The model class that the genomes instantiate", default=None) 36 | parser.add_option("--test-set", dest="test_set", action="store_true", 37 | help="Test the genomes on the test set, rather than on the training set", default=False) 38 | parser.add_option("--plot", dest="plot", action="store_true", 39 | help="Make a plot (in combination with --test-set)", default=False) 40 | (options, args) = parser.parse_args() 41 | lp.options = options 42 | if options.model is None: 43 | print >>sys.stderr, "Model argument is required." 
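        # (Illustration only, not taken from the original source: a typical
        #  call would be something like
        #    python genome_evaluator.py --model ESNModelCreator --test-set
        #  after which genomes are typed on stdin as Python list literals,
        #  cf. read_next_genome_list below; the number and meaning of the loci
        #  depend on the chosen model class.)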
44 | sys.exit(1) 45 | 46 | def read_next_genome_list(): 47 | print "Enter genome to be evaluated: " 48 | line = sys.stdin.readline() 49 | if line == "": 50 | print "End of input, exiting." 51 | sys.exit(0) 52 | return ast.literal_eval(line) 53 | 54 | def next_indiv(): 55 | gl = read_next_genome_list() 56 | genome = pu.AllelesGenome() 57 | genome.setInternalList(gl) 58 | genome.setParams(num_trials=options.num_trials) 59 | return genome 60 | 61 | def gene_test_loop(model): 62 | while sys.stdin: 63 | ga._model = model 64 | indiv = next_indiv() 65 | if options.test_set: 66 | print "Evaluating genome on test set: ", indiv[:] 67 | sys.stdout.flush() 68 | try: 69 | (target, predictions) = lp.parallel_test_genome(indiv, model) if options.parallel else lp.test_genome(indiv, model) 70 | except Exception, e: 71 | print >>sys.stderr, "Exception raised, failed to evaluate genome." 72 | tb = " " + traceback.format_exc(limit=50)[:-1] 73 | print >>sys.stderr, tb.replace("\n", "\n ") 74 | continue 75 | error = sg.utils.concat_and_calc_error(predictions, target, model.error_func) 76 | print "Error on test phase: {}".format(error) 77 | if options.plot: 78 | sg.utils.plot_target_predictions(target, predictions) 79 | plt.show() 80 | else: 81 | print "Evaluating genome on training set: ", indiv[:] 82 | sys.stdout.flush() 83 | fitness = ga._fitness(indiv) 84 | print "Fitness:", fitness 85 | if fitness != 0: 86 | print "Error:", ga._fitness_to_error(fitness) 87 | else: 88 | print "Error not calculated for 0 fitness." 89 | 90 | def run(): 91 | """.""" 92 | get_options() 93 | prev_handler = np.seterrcall(lp.float_err_handler) 94 | prev_err = np.seterr(all='call') 95 | np.seterr(under='ignore') 96 | random.seed(options.seed) 97 | np.random.seed(options.seed) 98 | model_creator = eval(options.model + "(options)") 99 | model = model_creator.get_model() 100 | lp._print_sim_context(model._dataset) 101 | print "Number of training sequences: %d" % options.num_trials 102 | print "Start days of training sequences:", model._dataset.train_periods_desc 103 | gene_test_loop(model) 104 | ul.tempfeeder_exp().close() 105 | 106 | if __name__ == "__main__": 107 | run() 108 | 109 | -------------------------------------------------------------------------------- /sg/models/gridopt_load_prediction.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta as dt 2 | import math 3 | import random 4 | import os 5 | import sys 6 | import cPickle as pickle 7 | 8 | import numpy as np 9 | import Oger 10 | import mdp, mdp.nodes 11 | import matplotlib.pyplot as plt 12 | import scikits.timeseries as ts 13 | 14 | import esn 15 | import sg.utils 16 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 17 | import sg.data.sintef.userloads as ul 18 | import load_prediction 19 | 20 | def optimize(postfix): 21 | # sg.utils.redirect(sys.stdout, "gridopt_output_%s.txt" % postfix) 22 | 23 | user_id = 55864860 24 | (dataset, test) = load_prediction.prepare_datasets(user_id) 25 | 26 | day = 24 27 | freerun = day 28 | today = 4600 29 | 30 | # [len_data, res_size, leak, input, bias, spectral, 31 | # seed, ridge, tmp_sm, load_sm] 32 | train_hours = 336 33 | 34 | datas = \ 35 | [sg.utils.Normalizer(dataset[today-train_hours:today+day-freerun,:], axis=0) 36 | for today in (1000, 2000, 3000, 4000)] 37 | 38 | input_data = [] 39 | for data in datas: 40 | temps, loads = zip(*data.normalized) 41 | input_data.append([np.array((temps[24:], loads[:-24], loads[24:])).T]) 42 | 43 | 
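    # At this point each element of input_data is a one-element list holding an
    # (n, 3) array whose columns, for hour t, are: temperature at t, load at
    # t-24 and load at t. The external_input_range=np.array([0, 1]) passed to
    # the FreerunFlow below presumably marks the first two columns as external
    # inputs, so the last column (the current load) is the signal the flow
    # free-runs on during the final freerun steps.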
reservoir = Oger.nodes.LeakyReservoirNode(output_dim=400, 44 | leak_rate=1, 45 | input_scaling=0.5, 46 | bias_scaling=0.75, 47 | spectral_radius=1, 48 | reset_states=False) 49 | readout = Oger.nodes.RidgeRegressionNode(ridge_param = 0.001) 50 | flow = Oger.nodes.FreerunFlow(reservoir + readout, 51 | freerun_steps = freerun, 52 | external_input_range= \ 53 | np.array([0, 1])) 54 | 55 | # gridsearch_parameters = {reservoir: {'_instance': range(5), 56 | # 'spectral_radius': [0.6, 0.8, 1], 57 | # 'input_scaling': [0.1, 0.5, 0.9], 58 | # 'bias_scaling': [0.1, 0.5, 0.9], 59 | # 'leak_rate': [0.1, 0.5, 0.9]}, 60 | # readout: {'_instance': range(5), 61 | # 'ridge_param': [0.1, 0.5, 0.9]}} 62 | 63 | gridsearch_parameters = {reservoir: {'_instance': range(20)}, 64 | readout: {'ridge_param': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}} 65 | 66 | print "gridsearch_parameters = " + str(gridsearch_parameters) 67 | optimizer = Oger.evaluation.Optimizer(gridsearch_parameters, 68 | Oger.utils.nrmse) 69 | 70 | optimizer.grid_search([[], input_data], flow, 71 | cross_validate_function=Oger.evaluation.leave_one_out) 72 | 73 | return (optimizer, reservoir) 74 | 75 | def store_optimal_flow(optimizer, postfix): 76 | optflow = optimizer.get_optimal_flow(verbose=True) 77 | 78 | with open("gridopt_optimal_flow_%s.pickle" % postfix, "w") as f: 79 | pickle.dump(optflow, f) 80 | 81 | 82 | if __name__ == "__main__": 83 | #postfix = str(os.getpid()) 84 | postfix = "deleteme" 85 | optimizer, reservoir = optimize(postfix) 86 | store_optimal_flow(optimizer, postfix) 87 | optimizer.plot_results([(reservoir, '_instance')]) 88 | 89 | -------------------------------------------------------------------------------- /sg/models/gui.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/gui.pyc -------------------------------------------------------------------------------- /sg/models/lib_atlas/BsplineAnalyticSmoother.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BsplineAnalyticSmoother.h 3 | * 4 | * Created on: Dec 25, 2012 5 | * Author: Hasib 6 | */ 7 | 8 | #ifndef BSPLINEANALYTICSMOOTHER_H_ 9 | #define BSPLINEANALYTICSMOOTHER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #ifdef __cplusplus 15 | extern "C" 16 | { 17 | #endif 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #define N_ALIGN (size_t)64 31 | 32 | class BsplineAnalyticSmoother { 33 | 34 | private: 35 | int degree; 36 | int n_threads; 37 | unsigned int n_data; 38 | unsigned int n_knot; 39 | unsigned int n_coef; 40 | double smoothness; 41 | double zscore; 42 | double *knots; 43 | double *dataset; 44 | double *S; 45 | double *smoothed_data; 46 | double *cleaned_data; 47 | 48 | double bsplinebasis(unsigned int i, int p, double t); 49 | double bsplinebasis_deriv(int i, int p, int n, double t); 50 | double* get_phi(); 51 | double* get_roughness(); 52 | void calc_hatMatrix(); 53 | 54 | public: 55 | BsplineAnalyticSmoother(double *dataset, unsigned int n_data, double *knots, unsigned int n_knot, int degree, double smoothness, double zscore, int n_threads); 56 | virtual ~BsplineAnalyticSmoother(); 57 | void calc_smoothedData(); 58 | double* calc_cleanedData(); 59 | double *get_smoothedData(); 60 | }; 61 | 62 | #endif /* BSPLINEANALYTICSMOOTHER_H_ */ 63 | 
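// Usage sketch (an assumption based only on the public interface above; the
// Makefile builds this class into libspclean.so, which is presumably loaded
// from the Python side, cf. sg/models/spclean.py and spclean_wrapper.py):
//
//   BsplineAnalyticSmoother smoother(dataset, n_data, knots, n_knot,
//                                    degree, smoothness, zscore, n_threads);
//   smoother.calc_smoothedData();
//   double *cleaned = smoother.calc_cleanedData();
//   double *smoothed = smoother.get_smoothedData();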
-------------------------------------------------------------------------------- /sg/models/lib_atlas/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # source files 3 | #SRC = BsplineAnalyticSmoother.cpp demoCleansing.cpp 4 | SRC = BsplineAnalyticSmoother.cpp 5 | 6 | OBJ = $(SRC:.cpp=.o) 7 | 8 | PHYLIB = libspclean.so 9 | 10 | # #include directories 11 | #INCLUDES = -I. -I/usr/local/include -I/usr/local/include/atlas -I/usr/include 12 | #INCLUDES = -I. -I/usr/local/include -I/usr/local/include/atlas -I/home/hasib/software/mathlib/plasma-installer_2.5.0b1/install/include -I$(HOME)/include 13 | INCLUDES = -I. -I/usr/local/include -I/usr/include -I/usr/local/include/atlas -I$(HOME)/include 14 | 15 | #C++ compiler flags 16 | #CFLAGS = -O3 -fPIC -Wall #for lib 17 | CFLAGS = -O3 -fPIC -fopenmp -Wall #for lib 18 | #CFLAGS = -O3 -fopenmp #for executable 19 | #CFLAGS = -g -pg -O0 -Wall -fPIC # for debugging 20 | 21 | #compiler 22 | CC = g++ 23 | 24 | #library paths 25 | hostname := $(shell hostname) 26 | 27 | ifeq ($(hostname), rocks.hpc.ntnu.no) 28 | LIBS = -llapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -lgfortran 29 | else 30 | #LIBS = -L/home/hasib/software/mathlib/plasma-installer_2.5.0b1/install/lib -L/usr/lib64 -L/usr/local/lib64 31 | LIBS = -L/usr/local/lib64 -L$(HOME)/lib/ptlib -lptlapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 32 | #LIBS = -L/usr/local/lib64 -L$(HOME)/lib/ptlib -lptlapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 33 | endif 34 | 35 | default: $(PHYLIB) 36 | 37 | .cpp.o: 38 | $(CC) $(INCLUDES) $(CFLAGS) -c $< -o $@ 39 | 40 | $(PHYLIB):$(OBJ) 41 | $(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) 42 | 43 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcoreblas -lquark -lcblas -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 44 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcblas -lcoreblas -lquark -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lpthread -lgomp 45 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -llapack -lptcblas -lptf77blas -latlas -lpthread -lgomp 46 | #$(CC) -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcoreblas -lquark -lcblas -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lrt -lpthread 47 | #$(CC) -o $(PHYLIB) $(OBJ) $(LIBS) -lpthread -lptcblas -latlas -llapack -llapacke -lquark -ltmg -lcoreblas -lplasma -lm -lrt 48 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lcblas -latlas -lgomp 49 | 50 | clean: 51 | rm -f *.o $(PHYLIB) 52 | 53 | -------------------------------------------------------------------------------- /sg/models/lib_mkl/BsplineAnalyticSmoother.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BsplineAnalyticSmoother.cpp 3 | * 4 | * Created on: Dec 25, 2012 5 | * Last Modified on: Feb 7, 2013 6 | * Feature: parallel BLAS and LAPACK 7 | * Author: Hasib 8 | * 9 | * */ 10 | 11 | #ifndef BSPLINEANALYTICSMOOTHER_H_ 12 | #define BSPLINEANALYTICSMOOTHER_H_ 13 | 14 | #include 15 | 16 | #ifdef __cplusplus 17 | extern "C" 18 | { 19 | #endif 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #define N_ALIGN (size_t)64 33 | 34 | class BsplineAnalyticSmoother { 35 | 36 | private: 37 | int degree; 38 | 
unsigned int n_data; 39 | unsigned int n_knot; 40 | unsigned int n_coef; 41 | double smoothness; 42 | double zscore; 43 | double *knots; 44 | double *dataset; 45 | double *S; 46 | double *smoothed_data; 47 | double *cleaned_data; 48 | 49 | double bsplinebasis(unsigned int i, int p, double t); 50 | double bsplinebasis_deriv(int i, int p, int n, double t); 51 | double* get_phi(); 52 | double* get_roughness(); 53 | void calc_hatMatrix(); 54 | 55 | public: 56 | BsplineAnalyticSmoother(double *dataset, unsigned int n_data, double *knots, unsigned int n_knot, int degree, double smoothness, double zscore); 57 | virtual ~BsplineAnalyticSmoother(); 58 | void calc_smoothedData(); 59 | double* calc_cleanedData(); 60 | void print_cleanedData(); 61 | double *get_smoothedData(); 62 | 63 | }; 64 | 65 | #endif /* BSPLINEANALYTICSMOOTHER_H_ */ 66 | -------------------------------------------------------------------------------- /sg/models/lib_mkl/Makefile: -------------------------------------------------------------------------------- 1 | 2 | #source files 3 | SRC = BsplineAnalyticSmoother.cpp 4 | 5 | OBJ = $(SRC:.cpp=.o) 6 | 7 | PHYLIB = libspclean.so 8 | 9 | #include directories 10 | INCLUDES = -I. -I$(MKLROOT)/include 11 | 12 | #C++ compiler flags 13 | CFLAGS = -O3 -fPIC -openmp 14 | 15 | #compiler 16 | CC = icpc 17 | 18 | #library paths 19 | LIBS = -L$(MKLROOT)/lib/intel64/ 20 | 21 | default: $(PHYLIB) 22 | 23 | .cpp.o: 24 | $(CC) $(INCLUDES) $(CFLAGS) -c $< -o $@ 25 | 26 | #create library 27 | $(PHYLIB):$(OBJ) 28 | $(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -lmkl_avx -lmkl_def 29 | 30 | 31 | clean: 32 | rm -f *.o $(PHYLIB) 33 | -------------------------------------------------------------------------------- /sg/models/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | -------------------------------------------------------------------------------- /sg/models/load_cleansing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from multiprocessing import Lock 3 | import sys 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import sg.models.spclean as cln 10 | from sg.utils.cache import ATimeCache 11 | from sg.utils.timer import SimpleTimer 12 | 13 | _smoother = None 14 | _max_cache_size = 10000 15 | _temp_mutex = Lock() 16 | _load_mutex = Lock() 17 | _temp_cache = ATimeCache(_max_cache_size) 18 | _load_cache = ATimeCache(_max_cache_size) 19 | 20 | def _get_dataset_hash(dataset): 21 | m = hashlib.md5() 22 | m.update(dataset) 23 | # Put the index in the hash, otherwise invalid datasets will be created 24 | # when we have no temperature data (different dates, same data -> 25 | # TimeSeries with lots of NaNs). 26 | m.update(str(dataset.index[0].value)) 27 | m.update(str(dataset.index[-1].value)) 28 | return m.digest() 29 | 30 | def bspline_clean_dataset(dataset, genome, loci, prediction_steps): 31 | """Clean a dataset containing temperatures and loads using cleaning 32 | parameters from the genome. The dataset is expected to contain NaNs in the 33 | last *prediction_steps* elements of the Load series""" 34 | # Having the smoother as a global is not nice, but it speeds up things A 35 | # LOT, because pickling the smoother caches takes a long time for large 36 | # matrices (long time series). 
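    # The module-level caches above are keyed on
    #   (md5 of the series values and of its first/last timestamps,
    #    smoothness gene, z-score gene)
    # -- see _get_dataset_hash -- so a cached cleaning result is only reused
    # when both the exact data window and the cleaning parameters match.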
37 | global _smoother, _temp_cache, _load_cache 38 | if _smoother is None: 39 | _smoother = cln.BSplineSmoother(dataset, smoothness=1) 40 | clean_data = dataset.copy() 41 | key = (_get_dataset_hash(dataset["Temperature"]), 42 | genome[loci.t_smooth], genome[loci.t_zscore]) 43 | try: 44 | _temp_mutex.acquire() 45 | clean_data['Temperature'] = _temp_cache[key].copy() 46 | # print "Got temp from cache: ", key[1], key[2] 47 | # sys.stdout.flush() 48 | except KeyError: 49 | _temp_mutex.release() 50 | # print "Storing temp to cache: ", key[1], key[2] 51 | # sys.stdout.flush() 52 | clean_data['Temperature'] = \ 53 | cln.bspline_clean(dataset['Temperature'], 54 | genome[loci.t_smooth], 55 | genome[loci.t_zscore], _smoother) 56 | _temp_mutex.acquire() 57 | _temp_cache[key] = clean_data['Temperature'].copy() 58 | _temp_mutex.release() 59 | key = (_get_dataset_hash(dataset["Load"]), 60 | genome[loci.l_smooth], genome[loci.l_zscore]) 61 | try: 62 | _load_mutex.acquire() 63 | clean_data['Load'][:-prediction_steps] = _load_cache[key].copy() 64 | # print "Got load from cache: ", key[1], key[2] 65 | # sys.stdout.flush() 66 | except KeyError: 67 | _load_mutex.release() 68 | # print "Storing load to cache: ", key[1], key[2] 69 | # sys.stdout.flush() 70 | clean_data['Load'][:-prediction_steps] = \ 71 | cln.bspline_clean(dataset['Load'][:-prediction_steps], 72 | genome[loci.l_smooth], 73 | genome[loci.l_zscore], _smoother) 74 | _load_mutex.acquire() 75 | _load_cache[key] = clean_data['Load'][:-prediction_steps].copy() 76 | _load_mutex.release() 77 | return clean_data 78 | 79 | def bspline_clean_dataset_no_cache(dataset, genome, loci, prediction_steps): 80 | """Clean a dataset containing temperatures and loads using cleaning 81 | parameters from the genome. The dataset is expected to contain NaNs in the 82 | last *prediction_steps* elements of the Load series""" 83 | # Having the smoother as a global is not nice, but it speeds up things A 84 | # LOT, because pickling the smoother caches takes a long time for large 85 | # matrices (long time series). 86 | global _smoother 87 | if _smoother is None: 88 | _smoother = cln.BSplineSmoother(dataset, smoothness=1) 89 | clean_data = dataset.copy() 90 | clean_data['Temperature'] = cln.bspline_clean(dataset['Temperature'], 91 | genome[loci.t_smooth], 92 | genome[loci.t_zscore], _smoother) 93 | clean_data['Load'][:-prediction_steps] = \ 94 | cln.bspline_clean(dataset['Load'][:-prediction_steps], 95 | genome[loci.l_smooth], 96 | genome[loci.l_zscore], _smoother) 97 | return clean_data 98 | 99 | def bspline_clean_dataset_fast(dataset, genome, loci, prediction_steps): 100 | """Clean a dataset containing temperatures and loads using cleaning 101 | parameters from the genome. 
The dataset is expected to contain NaNs in the 102 | last *prediction_steps* elements of the Load series""" 103 | clean_data = dataset.copy() 104 | clean_data['Temperature'] = cln.bspline_clean_fast( 105 | dataset['Temperature'], genome[loci.t_smooth], genome[loci.t_zscore]) 106 | clean_data['Load'][:-prediction_steps] = \ 107 | cln.bspline_clean_fast( 108 | dataset['Load'][:-prediction_steps], 109 | genome[loci.l_smooth], genome[loci.l_zscore]) 110 | return clean_data 111 | -------------------------------------------------------------------------------- /sg/models/load_cleansing.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/load_cleansing.pyc -------------------------------------------------------------------------------- /sg/models/load_prediction.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/load_prediction.pyc -------------------------------------------------------------------------------- /sg/models/load_prediction.py~: -------------------------------------------------------------------------------- 1 | """Initiates models, runs them through a genetic algorithm to find the 2 | optimal parameters, and tests the models in a production setting.""" 3 | 4 | import random 5 | from datetime import timedelta as dt 6 | import numpy as np 7 | import mdp, Oger, pdb 8 | import matplotlib.pyplot as plt 9 | import scipy 10 | import scikits.timeseries as ts 11 | import itertools as it 12 | 13 | from pyevolve import GAllele 14 | import sg.data.bchydro as bchydro 15 | import sg.utils 16 | from model import Model 17 | from ga import run_GA 18 | import esn 19 | 20 | def _load_prediction(): 21 | """This is where the models are defined. The models are passed to the GA 22 | engine for evolution of the optimal set of parameters. 
Afterwards, 23 | the models are tested, and performance is measured.""" 24 | 25 | dataset = bchydro.Dataset(period=dt(days=7*3)) 26 | train, test = dataset.split() 27 | 28 | alleles = GAllele.GAlleles() 29 | alleles.add(GAllele.GAlleleRange(24,1000)) # Data length 30 | alleles.add(GAllele.GAlleleRange(10, 250)) # Network size 31 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Leak rate 32 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Input scaling 33 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Bias scaling 34 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Spectral radius 35 | alleles.add(GAllele.GAlleleRange(0,100000)) # Seed 36 | 37 | ESN_feedforward = Model(genes = alleles, error_func = Oger.utils.nrmse, 38 | train_and_predict_func = esn.feedforward, 39 | dataset = train) 40 | 41 | for model in [ ESN_feedforward ]: 42 | run_GA(model) 43 | print model.genome 44 | day = model.genome.getParam('day') 45 | i = 1 46 | target = [] 47 | prediction = [] 48 | while day*i+model.genome[0] <= len(test.series): 49 | test_day = sg.utils.scale(test.series[day*(i-1):day*i+model.genome[0]].data) 50 | test_day.shape = [test_day.shape[0], 1] 51 | ytest = model.train_and_predict_func(test_day[:-day], 52 | model.genome) 53 | target.append(test_day[-day:]) 54 | prediction.append(ytest[-day:]) 55 | i += 1 56 | 57 | target = [ e[0] for e in it.chain(*target) ] 58 | prediction = [ e[0] for e in it.chain(*prediction) ] 59 | plt.figure() 60 | plt.title('Prediction through test phase, %i days, error = %3.2f'%\ 61 | (i,model.error_func(np.array(prediction),np.array(target)))) 62 | plt.plot(target, 'b', label='Target') 63 | plt.plot(prediction, 'r', label='Prediction') 64 | plt.legend(loc=3) 65 | plt.show() 66 | 67 | if __name__ == "__main__": 68 | _load_prediction() 69 | 70 | 71 | # Plot the best individual, with a red line showing when the 72 | # prediction started. 73 | # plt.figure() 74 | # plt.title('Target, test, real life plots') 75 | # plt.plot(top_dog.target, 'b', label='Target') 76 | # plt.plot(top_dog.xtest, top_dog.ytest, 77 | # 'g', label='%s, Error=%3.2f'%(top_dog.label,top_dog.score)) 78 | # plt.axvline(top_dog.xtest[0], plt.ylim()[0], plt.ylim()[1], color='r') 79 | # # Plot real life example, magenta line showing real life performance start. 80 | # plt.plot(top_dog.xreal, top_dog.yreal, 81 | # 'c', label='Real life prediction, Error=%3.2f' \ 82 | # %top_dog.getParam('error_func')(top_dog.yreal, top_dog.target[-24:])) 83 | # plt.plot(top_dog.xreal, top_dog.retrain, 84 | # 'k', label='Retrained, Error=%3.2f' \ 85 | # %top_dog.getParam('error_func')(top_dog.retrain, top_dog.target[-24:])) 86 | # plt.axvline(top_dog.xreal[0], plt.ylim()[0], plt.ylim()[1], color='m') 87 | # plt.legend(loc=3) 88 | # # Plot real life performance 89 | # plt.figure() 90 | # plt.title('Real life Error') 91 | # plt.errorbar(np.arange(0,len(real_avg),1), real_avg, yerr=real_std, label='Mean') 92 | # plt.plot(real_max, label='Max') 93 | # plt.plot(real_min, label='Min') 94 | # plt.legend() 95 | # # Scatter plot of tested error vs actual error 96 | # plt.figure() 97 | # plt.scatter([ I.score for I in ga.getPopulation() ], 98 | # [ I.getParam('error_func')(I.yreal, I.target[-24:]) for I in ga.getPopulation() ]) 99 | # plt.xlabel('Raw score') 100 | # plt.ylabel('Real life score') 101 | # plt.show() 102 | 103 | # We store the result for plotting purposes. 
104 | # chromosome.target = load_data 105 | # chromosome.ytest = ytest[-day:] 106 | # chromosome.xtest = scipy.arange(len(load_data)-day*2, len(load_data)-day, 1) 107 | -------------------------------------------------------------------------------- /sg/models/load_prediction_ar.py: -------------------------------------------------------------------------------- 1 | '''Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.''' 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | 13 | class ARModelCreator(load_prediction.ModelCreator): 14 | def _add_transform_genes(self): 15 | '''Sets up for evolution of the ARIMA model.''' 16 | self._alleles.add(pu.make_int_gene(1, 1, 8*24, 5)) 17 | self._alleles.add(pu.make_int_gene(1, 0, 8*24, 5)) 18 | self._loci_list += ['AR_order'] 19 | self._loci_list += ['EXO_order'] 20 | 21 | def _get_transform(self): 22 | return arima.ar_ga 23 | 24 | 25 | class ARBitmapModelCreator(load_prediction.ModelCreator): 26 | def _add_transform_genes(self): 27 | '''Sets up for evolution of the ARIMA model.''' 28 | self._alleles.add(pu.make_bitmap_gene(24*8)) 29 | self._alleles.add(pu.make_bitmap_gene(24*8)) 30 | self._loci_list += ['AR_lags', 'EXO_lags'] 31 | 32 | def _get_transform(self): 33 | return arima.bitmapped_ar_ga 34 | 35 | 36 | if __name__ == '__main__': 37 | load_prediction.run(ARModelCreator) 38 | #load_prediction.run(ARBitmapModelCreator()) 39 | -------------------------------------------------------------------------------- /sg/models/load_prediction_ar24.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | import load_prediction_ar 13 | 14 | class ARHourByHourModelCreator(load_prediction_ar.ARModelCreator): 15 | def _get_transform(self): 16 | return arima.hourbyhour_ar_ga 17 | 18 | 19 | class ARHourByHourBitmapModelCreator(load_prediction_ar.ARBitmapModelCreator): 20 | def _get_transform(self): 21 | return arima.bitmapped_hourbyhour_ar_ga 22 | 23 | 24 | if __name__ == "__main__": 25 | load_prediction.run(ARHourByHourModelCreator) 26 | #load_prediction.run(ARHourByHourBitmapModelCreator()) 27 | -------------------------------------------------------------------------------- /sg/models/load_prediction_arima.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | 13 | class ARIMAModelCreator(load_prediction.ModelCreator): 14 | def _add_transform_genes(self): 15 | """Sets up for evolution of the ARIMA model.""" 16 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # 'AR' backshift (p) 17 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # 'I' backshift (d) 18 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # 'MA' backshift (q) 19 | self._loci_list += ['AR_order', 'I_order', 'MA_order'] 20 | 21 | def 
_get_transform(self): 22 | return arima.arima_with_weather 23 | 24 | 25 | class SeasonalARIMAModelCreator(load_prediction.ModelCreator): 26 | def _add_transform_genes(self): 27 | """Sets up for evolution of a seasonal ARIMA model.""" 28 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # 'AR' backshift (p) 29 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # 'I' backshift (d) 30 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # 'MA' backshift (q) 31 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Seasonal 'AR' backshift (p) 32 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # Seasonal 'I' backshift (d) 33 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # Seasonal 'MA' backshift (q) 34 | self._loci_list += ['AR_order', 'I_order', 'MA_order', 35 | 'ssn_AR_order', 'ssn_I_order', 'ssn_MA_order'] 36 | 37 | def _get_transform(self): 38 | return arima.seasonal_arima_with_weather 39 | 40 | 41 | class AutoARIMAModelCreator(load_prediction.ModelCreator): 42 | def _add_transform_genes(self): 43 | """Sets up for evolution of the ARIMA model.""" 44 | pass 45 | 46 | def _get_transform(self): 47 | return arima.auto_arima_with_weather 48 | 49 | 50 | if __name__ == "__main__": 51 | load_prediction.run(ARIMAModelCreator) 52 | -------------------------------------------------------------------------------- /sg/models/load_prediction_averagedaily.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and predictor as daily or 24-hour averages.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import load_prediction 7 | import load_prediction_averagehourly as lpah 8 | 9 | def daily_average(data, genome, loci, prediction_steps): 10 | start = -prediction_steps - genome[loci.hindsight] 11 | end = -prediction_steps 12 | return pd.TimeSeries(data=data["Load"][start:end].mean(), 13 | index=data.index[-prediction_steps:]) 14 | 15 | 16 | class DailyAverageModelCreator(lpah.HourlyAverageModelCreator): 17 | def _get_transform(self): 18 | return daily_average 19 | 20 | 21 | if __name__ == "__main__": 22 | load_prediction.run(DailyAverageModelCreator) 23 | -------------------------------------------------------------------------------- /sg/models/load_prediction_averagehourly.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and predictor as daily or 24-hour averages.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import load_prediction 12 | 13 | def hourly_average(data, genome, loci, prediction_steps): 14 | assert(prediction_steps == 24) 15 | start = -prediction_steps - genome[loci.hindsight] 16 | end = -prediction_steps 17 | avg_data = pd.DataFrame({"Load": data["Load"][start:end].copy()}) 18 | avg_data["Hour of day"] = [i.hour for i in avg_data.index] 19 | means = avg_data.groupby(["Hour of day"]).mean()["Load"] 20 | return pd.TimeSeries(data=means.values, 21 | index=data.index[-prediction_steps:]) 22 | 23 | class HourlyAverageModelCreator(load_prediction.ModelCreator): 24 | def _add_transform_genes(self): 25 | """Sets up for evolution of the ARIMA model.""" 26 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.1)) # Dummy to make 1D crossover work in Pyevolve 27 | self._loci_list += ['crossover_dummy'] 28 | 29 | def _get_transform(self): 30 | 
return hourly_average 31 | 32 | 33 | if __name__ == "__main__": 34 | load_prediction.run(HourlyAverageModelCreator) 35 | -------------------------------------------------------------------------------- /sg/models/load_prediction_dshw.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and double seasonal Holt 2 | Winters predictor.""" 3 | 4 | from pyevolve import GAllele 5 | import Oger 6 | 7 | import sg.utils 8 | import sg.utils.pyevolve_utils as pu 9 | from model import Model 10 | import arima 11 | import load_cleansing 12 | import load_prediction 13 | 14 | class DSHWModelCreator(load_prediction.ModelCreator): 15 | def _add_transform_genes(self): 16 | """Sets up for evolution of the DSHW model.""" 17 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # alpha 18 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # beta 19 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # gamma 20 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # omega 21 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # phi 22 | self._loci_list += ['alpha', 'beta', 'gamma', 'omega', 'phi'] 23 | 24 | def _get_transform(self): 25 | return arima.dshw 26 | 27 | 28 | class AutoDSHWModelCreator(load_prediction.ModelCreator): 29 | def _add_transform_genes(self): 30 | """Sets up for evolution of the DSHW model.""" 31 | pass 32 | 33 | def _get_transform(self): 34 | return arima.auto_dshw 35 | 36 | 37 | if __name__ == "__main__": 38 | load_prediction.run(DSHWModelCreator) 39 | #load_prediction.run(AutoDSHWModelCreator()) 40 | -------------------------------------------------------------------------------- /sg/models/load_prediction_esn.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and ESN predictor.""" 2 | 3 | import numpy as np 4 | import Oger 5 | 6 | 7 | import sg.utils 8 | import sg.utils.pyevolve_utils as pu 9 | from model import Model 10 | import esn 11 | import load_cleansing 12 | import load_prediction 13 | 14 | class ESNModelCreator(load_prediction.ModelCreator): 15 | def _add_transform_genes(self): 16 | """Sets up for evolution of the ESN model.""" 17 | self._alleles.add(pu.make_int_gene(1, 10, 500, 25), weight=1) # Network size 18 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.05), weight=1) # Leak rate 19 | self._alleles.add(pu.make_real_gene(1, 0.1, 0.75, 0.05), weight=1) # Input scaling 20 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.05), weight=1) # Bias scaling 21 | self._alleles.add(pu.make_real_gene(1, 0.5, 2, 0.05), weight=1) # Spectral radius 22 | # We don't want too many seeds per evolutions, but we don't want to 23 | # always evolve on the same 5 networks either: 24 | self._alleles.add(pu.make_choice_gene( 25 | 1, np.random.random_integers(0, 2**16, 5)), weight=1) # Seed 26 | # Grid optimization showed that for a training length of 336, with 27 | # other params set based on previous gridopts and operating on the 28 | # total dataset rather than single AMS'es, optimal ridge was ~5. Scaled 29 | # thus 5/336=0.015. 
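# [Editor's note -- hedged clarification, not part of the original file.] The
# intent of "scaling" appears to be that the ridge penalty handed to the
# reservoir readout should stay comparable across different hindsight
# (training-window) lengths, so the gene stores a per-hour value that is
# presumably multiplied back up by the actual window length inside the ESN
# transform:
#
#   effective_ridge ~= scaled_ridge_gene * hindsight_hours
#   e.g. 0.015 per hour * 336 hours ~= 5, the optimum found by the grid search.
#
# How the 0.0001 constant used in the gene below relates to the 5/336 figure
# quoted above is not stated in the source.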
30 |         self._alleles.add(pu.make_choice_gene(
31 |             1, [0.0001/self._max_hindsight_hours]), weight=1) # Scaled ridge
32 |         self._loci_list += ['size', 'leak', 'in_scale',
33 |                             'bias_scale', 'spectral', 'seed', 'ridge' ]
34 | 
35 |     def _get_transform(self):
36 |         return esn.feedback_with_external_input
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     load_prediction.run(ESNModelCreator)
41 | -------------------------------------------------------------------------------- /sg/models/load_prediction_esn24.py: --------------------------------------------------------------------------------
1 | """Evolve a load predictor with BSpline data cleansing and an hour-by-hour ESN predictor."""
2 | 
3 | import sg.utils.pyevolve_utils as pu
4 | import esn
5 | import load_prediction
6 | import load_prediction_esn
7 | 
8 | class ESNHourByHourModelCreator(load_prediction_esn.ESNModelCreator):
9 |     def _add_transform_genes(self):
10 |         """Sets up for evolution of the hour-by-hour ESN model."""
11 |         # The 24 hour lags.
12 |         gene = pu.make_choice_gene(1, [i for i in self._hindsight_days])
13 |         self._alleles.add(gene, weight=1)
14 |         self._loci_list += ['lags']
15 |         load_prediction_esn.ESNModelCreator._add_transform_genes(self)
16 | 
17 |     def _get_transform(self):
18 |         return esn.hourbyhour_esn_feedback_with_external_input_ga
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     load_prediction.run(ESNHourByHourModelCreator)
23 | -------------------------------------------------------------------------------- /sg/models/load_prediction_identity.py: --------------------------------------------------------------------------------
1 | """Evolve a load predictor with BSpline data cleansing and naive identity/null predictors."""
2 | 
3 | from pyevolve import GAllele
4 | import Oger
5 | import pandas as pd
6 | 
7 | import sg.utils
8 | import sg.utils.pyevolve_utils as pu
9 | from model import Model
10 | import load_cleansing
11 | import load_prediction
12 | 
13 | def identity_transformer(data, genome, loci, prediction_steps):
14 |     """This prediction model assumes tomorrow will be the same as today."""
15 |     return data["Load"][-prediction_steps*2:-prediction_steps].tshift(prediction_steps)
16 | 
17 | def null_transformer(data, genome, loci, prediction_steps):
18 |     """This prediction model assumes tomorrow will be entirely flat."""
19 |     return pd.TimeSeries(data=data["Load"][:-prediction_steps].mean(),
20 |                          index=data.index[-prediction_steps:])
21 | 
22 | class IdentityModelCreator(load_prediction.ModelCreator):
23 |     def _add_transform_genes(self):
24 |         """Sets up for evolution of a system without transformer."""
25 |         pass
26 | 
27 |     def _get_transform(self):
28 |         return identity_transformer
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     load_prediction.run(IdentityModelCreator)
33 | -------------------------------------------------------------------------------- /sg/models/load_prediction_regul_ar.py: --------------------------------------------------------------------------------
1 | '''Evolve a load predictor with regularized vector AR predictor.'''
2 | 
3 | import functools
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | 
8 | import sg.utils.pyevolve_utils as pu
9 | import load_prediction
10 | import regul_ar
11 | import arima
12 | 
13 | 
14 | class LinearRegularizedVectorARModelCreator(load_prediction.ModelCreator):
15 |     def _add_transform_genes(self):
16 |         '''Sets up for evolution of the regularized vector AR model.'''
17 |         self._alleles.add(pu.make_int_gene(1, 1, 8*24, 5))
18 |         self._alleles.add(pu.make_int_gene(1, 0, 8*24, 5))
19 |         self._add_lambda_gene()
20 |         self._loci_list += ['AR_order']
21 |         self._loci_list += 
['EXO_order'] 22 | self._loci_list += ['lambda_cont'] 23 | 24 | def _add_lambda_gene(self): 25 | self._alleles.add(pu.make_real_gene(1, 0, 9, 0.2)) 26 | 27 | def _lambda_mapper(self, lc_gene_val): 28 | return lc_gene_val 29 | 30 | def _transform(self, data, genome, loci, prediction_steps): 31 | lags_2d = arima.lags_from_order_ga(data, genome, loci) 32 | lambda_cont = self._lambda_mapper(genome[loci.lambda_cont]) 33 | x_start = max(-len(data), -genome[loci.hindsight] - prediction_steps) 34 | svp = regul_ar.SmoothVectorARPredictor( 35 | data[x_start:-prediction_steps].values, 36 | num_models=prediction_steps, 37 | lags_2d=lags_2d, 38 | relative_lags=True, 39 | add_bias=True, 40 | out_cols=[data.columns.tolist().index('Load')]) 41 | svp.estimate(lambda_cont=lambda_cont) 42 | prediction = svp.predict( 43 | exo_series=np.atleast_2d(data['Temperature'].ix[-prediction_steps:].values).T, 44 | prediction_steps=prediction_steps) 45 | return pd.TimeSeries(data=prediction[:,0], index=data[-prediction_steps:].index) 46 | 47 | def _get_transform(self): 48 | return functools.partial(type(self)._transform, self) 49 | 50 | 51 | class LogRegularizedVectorARModelCreator(LinearRegularizedVectorARModelCreator): 52 | def _add_lambda_gene(self): 53 | self._alleles.add(pu.make_int_gene(1, 0, 1e6, 100)) 54 | 55 | def _lambda_mapper(self, lc_gene_val): 56 | return (np.power(10, lc_gene_val) - 1) / 1e3 57 | 58 | 59 | class RegularizedVanillaModelCreator(load_prediction.ModelCreator): 60 | def __init__(self, *args, **kwargs): 61 | load_prediction.ModelCreator.__init__(self, *args, **kwargs) 62 | self._warning_printed = False 63 | 64 | def _add_transform_genes(self): 65 | '''Sets up for evolution of the regularized vanilla benchmark model.''' 66 | self._alleles.add(pu.make_int_gene(1, 0, 1e6, 100)) 67 | self._loci_list += ['lambda_cont'] 68 | 69 | def _transform(self, data, genome, loci, prediction_steps): 70 | if not self._warning_printed: 71 | print 'Hindsight genome ignored, using all available data in Vanilla model.' 
72 | self._warning_printed = True 73 | svp = regul_ar.VanillaVectorPredictor(data[:-prediction_steps]) 74 | svp.estimate(lambda_cont=genome[loci.lambda_cont]) 75 | return svp.predict(data[-prediction_steps:]) 76 | 77 | def _get_transform(self): 78 | return functools.partial(type(self)._transform, self) 79 | 80 | 81 | if __name__ == '__main__': 82 | load_prediction.run(LogRegularizedVectorARModelCreator) 83 | -------------------------------------------------------------------------------- /sg/models/load_prediction_taohong.py: -------------------------------------------------------------------------------- 1 | import load_prediction 2 | import taohong 3 | 4 | class VanillaModelCreator(load_prediction.ModelCreator): 5 | def _add_transform_genes(self): 6 | pass 7 | 8 | def _get_transform(self): 9 | return taohong.vanilla 10 | 11 | 12 | if __name__ == '__main__': 13 | load_prediction.run(VanillaModelCreator) 14 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletModelCreator(load_prediction.ModelCreator): 16 | def _add_transform_genes(self): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Scale 21 | self._alleles.add(pu.make_choice_gene(1, [2])) # Aj, in the paper 2 gives best results. 22 | self._loci_list += ['scale', 'Aj'] 23 | 24 | def _get_transform(self): 25 | #return wavelet.linear_prediction 26 | #return wavelet.linear_vector 27 | #return wavelet.vector_multiscale_prediction 28 | #return wavelet.iterative_multiscale_prediction 29 | return wavelet.multiscale_prediction 30 | 31 | 32 | if __name__ == "__main__": 33 | load_prediction.run(WaveletModelCreator) 34 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet.py.orig: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.genemapper as gm 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletModelCreator(load_prediction.ModelCreator): 16 | def get_model(self, options): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | 21 | alleles = GAllele.GAlleles() 22 | alleles.add(gm.MappedAlleleList(range(1,11))) # Scale 23 | alleles.add(gm.MappedAlleleList([2])) # Aj, in the paper 2 gives best results. 24 | alleles.add(gm.MappedAlleleList([ 2**i for i in range(4,12)])) # Train length. 25 | 26 | # For ESN training of predictor. 
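# [Editor's note -- illustration only; see sg/utils/genemapper.py later in this
# listing for the actual implementation.] The gm.MappedAllele* classes take a
# real-valued gene, normalize it to [0, 1] over the genome's global range, and
# map it onto the allele's own range or option list:
#
#   mapped = begin + gene_norm * (end - begin)                    # 'linear' scaling
#   mapped = begin + exp(gene_norm * log(1 + end - begin)) - 1    # 'log' scaling
#
# List alleles pick options[int(gene_norm * len(options))], and integer range
# alleles round the mapped value to the nearest int.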
27 | alleles.add(gm.MappedAlleleRange(10, 500)) # Network size, 1 28 | alleles.add(gm.MappedAlleleRange(0, 2, real=True)) # Leak rate, 2 29 | alleles.add(gm.MappedAlleleRange(0.1, 0.75, real=True)) # Input scaling, 3 30 | alleles.add(gm.MappedAlleleRange(0, 1, real=True)) # Bias scaling, 4 31 | alleles.add(gm.MappedAlleleRange(0.5, 2, real=True)) # Spectral radius, 5 32 | bucket_seed = random.randrange(1, 2**16) 33 | alleles.add(gm.MappedAlleleRange(bucket_seed, bucket_seed + 5)) # Seed, 6 34 | alleles.add(gm.MappedAlleleList([0.0001/336])) # Scaled ridge, 7 35 | 36 | if not options.no_cleaning: 37 | alleles.add(gm.MappedAlleleRange(0.001, 800, real=True, scaling='log')) 38 | alleles.add(gm.MappedAlleleRange(0.001, 800, real=True, scaling='log')) 39 | alleles.add(gm.MappedAlleleRange(0.1, 3, real=True)) 40 | alleles.add(gm.MappedAlleleRange(0.1, 3, real=True)) 41 | 42 | loci = sg.utils.Enum('scale', 'Aj', 'hindsight', 43 | 'size', 'leak', 'in_scale', 44 | 'bias_scale', 'spectral', 'seed', 'ridge', 45 | 't_smooth', 'l_smooth', 't_zscore', 'l_zscore') 46 | 47 | return Model(genes=alleles, error_func=Oger.utils.nrmse, 48 | train_and_predict_func=wavelet.multiscale_prediction, 49 | clean_func=load_cleansing.bspline_clean_dataset, loci=loci) 50 | 51 | if __name__ == "__main__": 52 | load_prediction.run(WaveletModelCreator()) 53 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet24.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletHourByHourModelCreator(load_prediction.ModelCreator): 16 | def _add_transform_genes(self): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | 21 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Scale 22 | self._alleles.add(pu.make_choice_gene(1, [2])) # Aj, in the paper 2 gives best results. 
23 | gene = pu.make_choice_gene(1, [i for i in self._hindsight_days]) 24 | self._alleles.add(gene, weight=1) 25 | 26 | if options.no_cleaning: 27 | loci = sg.utils.Enum('scale', 'Aj') 28 | else: 29 | loci = sg.utils.Enum('scale', 'Aj', 't_smooth', 30 | 'l_smooth', 't_zscore', 'l_zscore') 31 | 32 | 33 | def _get_transform(self): 34 | return wavelet.hourbyhour_multiscale_prediction_ga 35 | 36 | 37 | if __name__ == "__main__": 38 | load_prediction.run(WaveletHourByHourModelCreator) 39 | -------------------------------------------------------------------------------- /sg/models/model.py: -------------------------------------------------------------------------------- 1 | class Model(object): 2 | """A class that holds all the properties necessary for a model to 3 | be employed in the GA search for optimal parameters.""" 4 | 5 | def __init__(self, name, genes, error_func, transformer, loci): 6 | self._name = name 7 | self._genes = genes 8 | self._error_func = error_func 9 | self._transformer = transformer 10 | self._loci = loci 11 | self._dataset = None 12 | self._day = None 13 | self._preprocessors = None 14 | self._postprocessors = None 15 | 16 | @property 17 | def name(self): 18 | return self._name 19 | 20 | def get_day(self): 21 | return self._day 22 | def set_day(self, day): 23 | self._day = day 24 | day = property(get_day, set_day) 25 | 26 | def get_loci(self): 27 | return self._loci 28 | def set_loci(self, loci): 29 | self._loci = loci 30 | loci = property(get_loci, set_loci) 31 | 32 | def get_genes(self): 33 | return self._genes 34 | def set_genes(self, genes): 35 | self._genes = genes 36 | genes = property(get_genes, set_genes) 37 | 38 | def get_error_func(self): 39 | return self._error_func 40 | def set_error_func(self, error_func): 41 | self._error_func = error_func 42 | error_func = property(get_error_func, set_error_func) 43 | 44 | def get_preprocessors(self): 45 | return self._preprocessors 46 | def set_preprocessors(self, preprocessors): 47 | self._preprocessors = preprocessors 48 | preprocessors = property(get_preprocessors, set_preprocessors) 49 | 50 | def get_transformer(self): 51 | return self._transformer 52 | def set_transformer(self, transformer): 53 | self._transformer = transformer 54 | transformer = property(get_transformer, set_transformer) 55 | 56 | def get_postprocessors(self): 57 | return self._postprocessors 58 | def set_postprocessors(self, postprocessors): 59 | self._postprocessors = postprocessors 60 | postprocessors = property(get_postprocessors, set_postprocessors) 61 | 62 | def get_genome(self): 63 | return self._genome 64 | def set_genome(self, genome): 65 | self._genome = genome 66 | genome = property(get_genome, set_genome) 67 | 68 | def get_dataset(self): 69 | return self._dataset 70 | def set_dataset(self, dataset): 71 | self._dataset = dataset 72 | dataset = property(get_dataset, set_dataset) 73 | -------------------------------------------------------------------------------- /sg/models/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/model.pyc -------------------------------------------------------------------------------- /sg/models/onemax_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | 5 | import numpy as np 6 | from mpi4py import MPI 7 | 8 | pop_size = 512*10 9 | genome_length = 100 10 | generations = 
250 11 | mutation_rate = 0.1 12 | 13 | comm = MPI.COMM_WORLD 14 | nhosts = comm.Get_size() 15 | rank = comm.Get_rank() 16 | 17 | 18 | def evolve(): 19 | population = np.random.randint(2, size=(pop_size, genome_length)).astype('float') 20 | rest_time = None 21 | for gen in range(generations): 22 | eval_start = time.time() 23 | fitnesses = evaluate(population) 24 | eval_time = time.time() - eval_start 25 | rest_start = time.time() 26 | print_stats(gen, population, fitnesses, eval_time, rest_time) 27 | reproduce(population, fitnesses) 28 | rest_time = time.time() - rest_start 29 | 30 | def eval_loop(): 31 | for gen in range(generations): 32 | evaluate(None) 33 | 34 | def eval_local(population): 35 | fitnesses = np.empty(len(population)) 36 | target = np.arange(1, genome_length+1) 37 | for idx in range(population.shape[0]): 38 | fitnesses[idx] = -np.abs((population[idx,:] - target)).sum() 39 | return fitnesses 40 | 41 | def evaluate_ndarray(population=None): 42 | indices = np.linspace(0, pop_size, nhosts+1).astype('int') 43 | displs = indices[:-1] 44 | sendcounts = indices[1:] - displs 45 | if rank == 0: 46 | sendbuf = (population, np.array(sendcounts) * genome_length, 47 | np.array(displs) * genome_length, MPI.DOUBLE) 48 | recvbuf = (np.empty(pop_size), sendcounts, displs, MPI.DOUBLE) 49 | else: 50 | sendbuf = None 51 | recvbuf = None 52 | 53 | indivs = np.empty(sendcounts[rank] * genome_length) 54 | fitnesses = np.empty(sendcounts[rank]) 55 | 56 | comm.Scatterv(sendbuf,indivs) 57 | indivs.shape = (len(indivs)/genome_length, genome_length) 58 | 59 | fitnesses = eval_local(indivs) 60 | 61 | comm.Gatherv(fitnesses, recvbuf) 62 | if rank == 0: 63 | return recvbuf[0] 64 | 65 | def evaluate_pickle(population=None): 66 | if rank == 0: 67 | indices = np.linspace(0, pop_size, nhosts+1).astype('int') 68 | starts = indices[:-1] 69 | ends = indices[1:] 70 | scattered = [population[s:e,:] for s,e in zip(starts, ends)] 71 | else: 72 | scattered = None 73 | indivs = comm.scatter(scattered) 74 | fitnesses = eval_local(indivs) 75 | all_fitnesses = comm.gather(fitnesses) 76 | if rank==0: 77 | return np.concatenate(all_fitnesses) 78 | 79 | evaluate = evaluate_ndarray 80 | #evaluate = evaluate_pickle 81 | 82 | def print_stats(gen, population, fitnesses, eval_time, rest_time): 83 | if rest_time is None: 84 | timetxt = "%.4f" % eval_time 85 | else: 86 | timetxt = "%.4s/%.4s" % (eval_time, rest_time) 87 | print "Generation %d in %s: Fitnesses %.2f/%.2f/%.2f. 
Best indiv:" \ 88 | % (gen, timetxt, fitnesses.min(), fitnesses.mean(), fitnesses.max()) 89 | print population[fitnesses.argmax(),:] 90 | 91 | def mutate(indiv): 92 | for i in range(len(indiv)): 93 | if random.random() < mutation_rate: 94 | indiv[i] = indiv[i] + 1 if random.random() < 0.5 else indiv[i] - 1 95 | 96 | def reproduce(population, fitnesses): 97 | best = population[fitnesses.argmax(),:] 98 | for idx in range(pop_size): 99 | population[idx] = best 100 | mutations = np.where(np.random.random((pop_size, genome_length)) < mutation_rate) 101 | mutvals = np.random.randint(low=-1, high=2, size=len(mutations[0])) 102 | population[mutations] += mutvals 103 | #mutate(population[idx]) 104 | 105 | if __name__ == "__main__": 106 | if rank == 0: 107 | evolve() 108 | else: 109 | eval_loop() 110 | 111 | -------------------------------------------------------------------------------- /sg/models/roughness.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/roughness.tex -------------------------------------------------------------------------------- /sg/models/run_experiments.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import os 3 | import socket 4 | 5 | from sg.data.sintef import tempfeeder_exp 6 | from sg.utils.timer import SimpleTimer 7 | 8 | import run_experiments_params as params 9 | 10 | def run_one_wrapper(arg): 11 | reload(params) 12 | params.run_one(arg) 13 | 14 | def make_runs(user_ids, num_runs): 15 | """Create a list of (user_id, run_number) pairs that can be sent via 16 | pool.map to the run_one function.""" 17 | return [(user, run) for user in user_ids for run in range(num_runs)] 18 | 19 | def run_simulations(runs): 20 | """Run all the simulations provided in runs by sending them on to the 21 | run_one function.""" 22 | num_parallel_processes = 12 23 | pool = Pool(processes=num_parallel_processes) 24 | pool.map(run_one_wrapper, runs, chunksize=1) 25 | 26 | if __name__ == "__main__": 27 | # if socket.gethostname() == "tanzenmusik.idi.ntnu.no": 28 | # user_ids = tempfeeder_exp().user_ids[25:50] 29 | # else: 30 | # user_ids = tempfeeder_exp().user_ids[0:25] 31 | 32 | user_ids = [tempfeeder_exp().user_ids[0]] 33 | num_runs = 12 34 | 35 | print "Master pid is %d " % os.getpid() 36 | timer = SimpleTimer(output_stream=None) 37 | tempfeeder_exp().close() 38 | runs = make_runs(user_ids, num_runs) 39 | run_simulations(runs) 40 | print "All simulations complete. %s" % timer.end() 41 | tempfeeder_exp().close() 42 | -------------------------------------------------------------------------------- /sg/models/run_experiments_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | from sg.globals import SG_MODELS_PATH 6 | from sg.utils.timer import SimpleTimer 7 | from sg.globals import SG_SIM_PATH 8 | 9 | # This function is defined in a separate file, so the main runner can reload it 10 | # before each launch. This allows us to adjust parameters "on the fly". 11 | def run_one(arg): 12 | """Run one evolution. Arg is a tuple containing user ID and run number.""" 13 | user_id, run_number = arg 14 | # Note that the PID printed below is the PID in which this function is 15 | # running, which is different from the PID of the evolution. 
16 | print "Launching evolution for user %d run %d (pid %d) at %s..." % \ 17 | (user_id, run_number, os.getpid(), time.asctime()) 18 | sys.stdout.flush() 19 | 20 | timer = SimpleTimer(output_stream=None) 21 | out_dir = os.path.join(SG_SIM_PATH, "id_%d" % user_id) 22 | model = os.path.join(SG_MODELS_PATH, "load_prediction.py") 23 | postfix = "run_%d" % run_number 24 | generations = 50 25 | pop_size = 400 26 | mutation = 0.05 27 | crossover = 0.5 28 | # NB Total-load sims: 29 | total = " --total-load" 30 | data_seed = 12 31 | 32 | stdout_path = os.path.join(out_dir, 33 | "output_run_%d.txt" % run_number) 34 | os.system("test -d %s || mkdir -p %s" % (out_dir, out_dir)) 35 | os.system("python %s " % model + \ 36 | " --userid=%d" % user_id + \ 37 | " --out-dir=%s --out-postfix=%s " % (out_dir, postfix) + \ 38 | " --generations=%d --pop-size=%d " % (generations, pop_size) + \ 39 | " --mutation=%f --crossover=%f " % (mutation, crossover) + \ 40 | " --no-show-plot --save-plot " + \ 41 | total + \ 42 | " --data-seed=%d " % data_seed + \ 43 | " >%s" % stdout_path) 44 | 45 | print "Evolution completed for user %d run %d. %s" \ 46 | % (user_id, run_number, timer.end()) 47 | sys.stdout.flush() 48 | -------------------------------------------------------------------------------- /sg/models/spclean.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/spclean.pyc -------------------------------------------------------------------------------- /sg/models/spclean_wrapper.py: -------------------------------------------------------------------------------- 1 | """Demonstrate the cleansing algorithm on datasets of varying length.""" 2 | 3 | import sys 4 | import time 5 | from datetime import timedelta as dt 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import sg.data.sintef.userloads as ul 12 | import spclean as cln 13 | from sg.utils.timer import SimpleTimer 14 | import splines as sp 15 | 16 | import array 17 | import ctypes 18 | 19 | from ctypes import cdll 20 | from ctypes import c_double 21 | 22 | # Load a dataset containing power load history. This set is divided into 23 | # training and test data, we only keep the traning part for now. 24 | 25 | def _get_smoother(): 26 | # Set slow_smoother to True in order to see the actual time consumed by the 27 | # B-spline smoothing operation. If set to False, will use the default 28 | # smoother where the roughness matrices are cached. 29 | slow_smoother = True 30 | if slow_smoother: 31 | #print "Using slow, analytic, non-caching smoother." 32 | return cln.BSplineAnalyticSmoother 33 | else: 34 | #print "Using not quite so slow, caching smoother." 
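# [Editor's note -- hedged sketch. spclean.py itself is not part of this
# listing, so this is an assumption about what "caching the roughness
# matrices" means, not the project's actual implementation.] The spline
# roughness (penalty) matrix depends only on the knot vector, i.e. effectively
# on the series length, so a caching smoother can memoize it per length and
# reuse it for every equally long window it cleans, roughly:
#
#   _roughness_cache = {}
#   def roughness_matrix(n_knots):                 # hypothetical helper
#       if n_knots not in _roughness_cache:
#           _roughness_cache[n_knots] = build_roughness_matrix(n_knots)
#       return _roughness_cache[n_knots]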
35 | return cln.BSplineSmoother 36 | 37 | ds_array = 0 38 | kn_array = 0 39 | 40 | class BsplineFastSmoother(object): 41 | def __init__(self, data, smoothness, zscore): 42 | #create knot vector 43 | knots = sp.get_uniform_knots_from_points(data, degree, knotrange=(0, len(data) - 1)) 44 | 45 | #determine datasize 46 | n_data = len(data) 47 | n_knot = len(knots) 48 | 49 | #create a pointer to the dataset 50 | ds = np.array(data) 51 | ds_type = c_double*n_data 52 | ds_array = ds_type(*ds) 53 | 54 | #create a pointer to the knots 55 | kn = np.array(knots) 56 | kn_type = c_double*n_knot 57 | kn_array = kn_type(*kn) 58 | 59 | #number of threads 60 | 61 | self._lib = cdll.LoadLibrary('lib_mkl/libspclean.so') 62 | self.obj = self._lib.Smoother_new(ds_array, n_data, kn_array, n_knot, degree, c_double(smoothness), c_double(zscore)) 63 | 64 | 65 | def __del__(self): 66 | self._lib.Smoother_delete(self.obj) 67 | 68 | def bsm_cleanData(self): 69 | return self._lib.bsm_cleanData(self.obj) 70 | 71 | def bsm_smoothedData(self): 72 | return self._lib.bsm_smoothedData(self.obj) 73 | 74 | 75 | # load data 76 | dataset, _ = ul.total_experiment_load() 77 | 78 | # Set parameters for the B-spline smoother/cleanser 79 | degree = 3 80 | smoothness = 100.0 81 | zscore = 1.0 82 | 83 | # Try smoothing/cleansing different time series lengths 84 | for hindsight_days in [1]: 85 | # Select data 86 | num_hours = 24 * hindsight_days 87 | data = dataset["Load"][-num_hours:].copy() 88 | 89 | #determine datasize 90 | n_data = len(data) 91 | 92 | # Some output and rough timing 93 | #print "Cleansing %d hours of data with smoothness %.2f, z-score %.2f..." % \ 94 | # (num_hours, smoothness, zscore) 95 | #sys.stdout.flush() 96 | start_time = time.time() 97 | 98 | # This is the part that takes time 99 | #smoother = _get_smoother()(data, smoothness) 100 | #cleaner = cln.RegressionCleaner(smoother, zscore) 101 | #cleaned, _ = cleaner.get_cleaned_data(method=cln.RegressionCleaner.replace_with_bound) 102 | 103 | #call cpp smpline object and get the result 104 | sm = BsplineFastSmoother(data, smoothness, zscore) 105 | res = sm.bsm_cleanData() 106 | 107 | # Wrap up and plot the result 108 | end_time = time.time() 109 | 110 | #convert the pointer to nparray 111 | # ArrayType = ctypes.c_double*n_data 112 | # array_pointer = ctypes.cast(res, ctypes.POINTER(ArrayType)) 113 | # cleaned_data = np.frombuffer(array_pointer.contents, dtype=np.double) 114 | 115 | # print "Done in %s." % SimpleTimer.period_to_string(start_time, end_time) 116 | # sys.stdout.flush() 117 | 118 | # res = sm.bsm_smoothedData() 119 | 120 | #convert the pointer to nparray 121 | # ArrayType = ctypes.c_double*n_data 122 | # array_pointer = ctypes.cast(res, ctypes.POINTER(ArrayType)) 123 | # print "Getting smoothed data..." 124 | # sys.stdout.flush() 125 | # smoothed_data = np.frombuffer(array_pointer.contents, dtype=np.double) 126 | print "Got smoothed data..." 127 | # sys.stdout.flush() 128 | 129 | # print data 130 | # print cleaned_data 131 | # print smoothed_data 132 | 133 | # plt.figure() 134 | # data.plot(style='b', label="Raw data") 135 | # print "Creating time series from smoothed data..." 136 | # sys.stdout.flush() 137 | # smoothed_series = pd.TimeSeries(data=smoothed_data, index=data.index) 138 | # print "Plotting smoothed series..." 139 | # sys.stdout.flush() 140 | # smoothed_series.plot(style='r', label="Smoothed data") 141 | # print "Done plotting smoothed series." 
142 | # sys.stdout.flush() 143 | # plt.legend() 144 | # plt.show() 145 | # data.plot(style='r', label='Raw load') 146 | # cleaned_data.plot(style='b', label='Cleaned load') 147 | # spline = pd.TimeSeries(data=smoother.splev(range(len(cleaned))), index=cleaned.index) 148 | # spline.plot(style='g', label='Smoothing spline') 149 | # plt.legend(loc=3) 150 | 151 | #plt.savefig('cfig.pdf') 152 | -------------------------------------------------------------------------------- /sg/models/splines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/splines.pyc -------------------------------------------------------------------------------- /sg/models/static.py: -------------------------------------------------------------------------------- 1 | import Oger, mdp, pdb 2 | import numpy as NP 3 | 4 | class StaticNode(Oger.nodes.ReservoirNode): 5 | """ Extends the Reservoir node for static classification by letting the inner dynamics of the reservoir settle before the final state of that timestep is 6 | stored. 7 | 8 | Note: in the original paper, the transfer function is not used. It is not clear why one shouldn't use the tanh function, 9 | however, this is not hardcoded. Use the identity function as an input parameter if this behaviour is desired. It seems to me that 10 | the network performs better when the tanh transfer function is used. 11 | 12 | Author: Axel Tidemann 13 | """ 14 | def _execute(self, x): 15 | """ Executes simulation with input vector x. 16 | """ 17 | steps = x.shape[0] 18 | 19 | # Pre-allocate the state vector, adding the initial state. All zeros. 20 | states = mdp.numx.zeros((steps, self.output_dim)) 21 | 22 | # A vector to store how many steps were needed to stabilize the reservoir. 23 | stabilize = mdp.numx.zeros(steps) 24 | 25 | # Loop over the input data and compute the reservoir states. 26 | for n in range(steps): 27 | # Let the reservoir stabilize before collection. 28 | previous_state = states[n,:] 29 | current_state = self.nonlin_func(mdp.numx.dot(self.w, states[n, :]) + mdp.numx.dot(self.w_in, x[n, :]) + self.w_bias) 30 | 31 | i = 0 32 | # We continue until a 0.1% change. Formula taken from Wikipedia for % difference (not percent error). 33 | while abs(NP.sum(previous_state - current_state))/max(abs(NP.sum(previous_state)), abs(NP.sum(current_state))) > 0.001: 34 | previous_state = current_state 35 | # Added flattening of previous_state in the following line, 2012-07-18. Somehow the transposing did not happen before, or 36 | # maybe MDP was more tolerant. 37 | current_state = self.nonlin_func(mdp.numx.dot(self.w, NP.ndarray.flatten(previous_state)) + mdp.numx.dot(self.w_in, x[n, :]) + self.w_bias) 38 | i += 1 39 | 40 | stabilize[n] = i 41 | states[n, :] = current_state 42 | self._post_update_hook(states, x, n) 43 | 44 | print 'StaticNode: Steps to stabilize the reservoir (avg std min max)', NP.average(stabilize), NP.std(stabilize), min(stabilize), max(stabilize) 45 | 46 | #print NP.max(states), NP.min(states), NP.average(states) 47 | 48 | return states 49 | 50 | 51 | ##### Testing ##### 52 | if __name__ == "__main__": 53 | 54 | #Generate random vectors 55 | NP.random.seed() 56 | x = NP.random.randn(100,20) # (number of cases, number of features) 57 | #Generate target vector - one vector for each case. 
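# [Editor's note -- descriptive comment.] NP.eye(100) below builds a 100x100
# identity matrix, i.e. one one-hot target row per random input case, so the
# reservoir + ridge readout is effectively trained as a 100-way classifier and
# the check at the bottom counts how often the most active output unit matches
# the correct case.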
58 | y = NP.eye(100) 59 | 60 | #Create ESN 61 | reservoir = StaticNode(input_dim = x.shape[1], output_dim = 20, spectral_radius = 0.55) #Too large reservoir -> trouble. 62 | readout = Oger.nodes.RidgeRegressionNode() 63 | 64 | flow = mdp.hinet.FlowNode(reservoir + readout) 65 | flow.train(x, y) 66 | flow.stop_training() 67 | 68 | ytest = flow(x) 69 | 70 | # See how well the classification works, e.g. if the highest activated output node is the correct one. 71 | c = 0 72 | for i in range(y.shape[0]): 73 | if NP.argmax(ytest[i,:]) == NP.argmax(y[i,:]): 74 | c += 1 75 | 76 | print 'Absolute error:', NP.mean(ytest - y), 'Classfication rate:', 100*c/y.shape[0], '%' 77 | 78 | -------------------------------------------------------------------------------- /sg/models/static.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/static.pyc -------------------------------------------------------------------------------- /sg/models/subset_runs/make-runfiles.sh: -------------------------------------------------------------------------------- 1 | bindir="$HOME/SmartGrid/src/sg/models" 2 | rundir="$bindir/subset_runs" 3 | outdir="$rundir/evo_output_files" 4 | 5 | generations=10 6 | popsize=100 7 | 8 | evocmd="python $bindir/load_prediction_arima.py --out-dir=$outdir --out-postfix=|postfix| --generations=$generations --pop-size=$popsize --mutation=0.2 --crossover=0.5 --mutation-sigma=10 --no-plot --elite=0 --num-trials=7 --env-replace=3 --data-seed=|dataseed| --no-cleaning --parallel --user-subset=|numberofusers|" 9 | 10 | max_subset_size=150 11 | runs_per_subset=10 12 | 13 | runfile_path_base="$rundir/run-subset-" 14 | rm ${runfile_path_base}* 15 | 16 | num_subset_files_created=0 17 | for (( subset=1; $subset<${max_subset_size}; subset=$subset+1 )); do 18 | runfile=${runfile_path_base}$subset.sh 19 | let num_subset_files_created=${num_subset_files_created}+1 20 | cat >$runfile <\$outputfile 58 | for (( run=0; \$run<${runs_per_subset}; run=\$run+1 )); do 59 | cmd="\`echo \$evocmd | sed -e\"s/|postfix|/subset_${subset}_/; s/|dataseed|/\$run/\; s/|numberofusers|/${subset}/"\`" 60 | echo "Launching \$cmd..." 61 | rmse=\`\$cmd 2>/dev/null | tail -n 3 | head -n 1 |awk '{print \$NF}'\` 62 | echo "\$cmd; $subset; \$run; \$rmse" >>\$outputfile 63 | echo "Done with run \$run for subset size $subset." 64 | done 65 | EOF 66 | chmod u+x $runfile 67 | done 68 | 69 | echo "Made ${num_subset_files_created} from 1 to ${max_subset_size} files with ${runs_per_subset} runs per subset, evolving a population of $popsize individuals over $generations generations." -------------------------------------------------------------------------------- /sg/models/taohong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def vanilla(data, genome, loci, prediction_steps, spinup=0): 5 | """ Tao Hong's Vanilla Benchmark method, as described in 6 | "A Naive Multipe Linear Regression Benchmark for Short Term Load 7 | Forecasting" (Hong, 2011) 8 | 9 | Note: this model is specifically built for hourly based predictions, 10 | and will not work properly otherwise.""" 11 | 12 | temps = data.Temperature 13 | 14 | num_params = 2 + 7*24 + 4*12 + 3*24 15 | print 'Created model with', num_params, 'parameters.' 
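# [Editor's note -- worked arithmetic for num_params above.] The count breaks
# down as 2 (intercept + linear trend) + 7*24 = 168 (day-of-week x hour-of-day
# dummies) + 4*12 = 48 (month dummies plus month-wise T, T^2, T^3 terms)
# + 3*24 = 72 (hour-wise T, T^2, T^3 terms), i.e. 2 + 168 + 48 + 72 = 290
# coefficients, which matches the design matrix `a` filled in below and solved
# with np.linalg.lstsq.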
16 | a = np.zeros((len(data), num_params)) 17 | 18 | for i in range(a.shape[0]): 19 | day, hour, month = temps.index[i].dayofweek, temps.index[i].hour, temps.index[i].month 20 | month -= 1 21 | tmp = temps[i] 22 | trend = (temps.index[i].value - temps.index[0].value)/(3600*10**9) + 1 23 | a[i, 0:2] = [ 1, trend] 24 | offset = 2 25 | a[i, offset + day*hour] = 1 26 | offset += 7*24 27 | a[i, offset + month] = 1 28 | offset += 12 29 | a[i, offset + month] = tmp 30 | offset += 12 31 | a[i, offset + month] = tmp**2 32 | offset += 12 33 | a[i, offset + month] = tmp**3 34 | offset += 12 35 | a[i, offset + hour] = tmp 36 | offset += 24 37 | a[i, offset + hour] = tmp**2 38 | offset += 24 39 | a[i, offset + hour] = tmp**3 40 | assert(offset + 24 == num_params) 41 | x,_,_,_ = np.linalg.lstsq(a[:-prediction_steps], data.Load[:-prediction_steps]) 42 | 43 | return pd.Series(data=np.dot(a[-prediction_steps:],x), 44 | index=data.index[-prediction_steps:]) 45 | 46 | -------------------------------------------------------------------------------- /sg/models/test_esn.py: -------------------------------------------------------------------------------- 1 | """Early attempt. Be patient.""" 2 | 3 | from datetime import timedelta as dt 4 | import math 5 | import random 6 | 7 | import numpy as np 8 | import Oger, mdp 9 | import matplotlib.pyplot as plt 10 | import scikits.timeseries as ts 11 | 12 | import esn 13 | import sg.utils 14 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 15 | import sg.data.sintef.userloads as ul 16 | import load_prediction 17 | 18 | user_id = 55864860 19 | 20 | (dataset, test) = load_prediction.prepare_datasets(user_id, True) 21 | 22 | day = 24 23 | today = random.randint(1000, dataset.shape[0]-day*2) 24 | today = 4600 25 | 26 | 27 | # [len_data, res_size, leak, input, bias, spectral, 28 | # seed, ridge, tmp_sm, load_sm] 29 | genome = [336, 500, 0.1, 0.5, 0.5, 0.9, 1000, 0.0001, 10, 10] 30 | genome = [168, 360, 0.1370736370770198, 1.322886484520891, 0.3211445098985698, 31 | 0.9492725784817237, 42979, 0.043436305850920925, 93, 52, 32 | 1.3053755202564812, 0.5905128791783507] 33 | 34 | alleles.loci = sg.utils.enum('hindsight', 'size', 'leak', 'in_scale', 35 | 'bias_scale', 'spectral', 'seed', 'ridge', 36 | 't_smooth', 'l_smooth', 't_zscore', 'l_zscore') 37 | 38 | test = sg.utils.Normalizer(dataset[today-genome[0]:today+day,:], axis=0) 39 | 40 | ytest = esn.feedback_with_external_input(test.normalized, genome, day) 41 | 42 | print Oger.utils.nrmse(ytest[-day:], test.normalized[-day:,1]) 43 | 44 | plt.figure() 45 | plt.plot(test.normalized[:,1], label="Input loads") 46 | offset = len(test.raw) - genome[0] 47 | plt.plot(range(offset, offset + len(ytest)), ytest, label="Prediction") 48 | plt.show() 49 | 50 | # ytest.shape = (len(ytest), 1) 51 | # ytest = test.expand(np.concatenate((ytest, ytest), axis=1))[:,1] 52 | 53 | # print sg.utils.mape(ytest[-day:], test.raw[-day:,1]) 54 | 55 | # out_series = ts.time_series(data=ytest, dates=loads[524:1000].dates) 56 | # sg.utils.plot_time_series([loads[524:1000], out_series], 57 | # ["r-", "g-"], ["Loads", "Prediction"]) 58 | 59 | -------------------------------------------------------------------------------- /sg/models/test_sequence_scan.py: -------------------------------------------------------------------------------- 1 | # The dumbest form of similar sequence retrieval: sequential scan. To see if there actually 2 | # are any similar sequences. 
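# [Editor's note -- a hedged, self-contained sketch of the same sequential-scan
# idea as a reusable helper; it is not part of the original script. _zscore and
# _nrmse are illustrative stand-ins for sg.utils.Normalizer and
# Oger.utils.nrmse used further down, not the project's implementations.]
import numpy as np

def _zscore(x):
    # Normalize a window to zero mean and unit variance so only shape matters.
    x = np.asarray(x, dtype=float)
    return (x - x.mean()) / x.std()

def _nrmse(pred, target):
    # Root-mean-square error scaled by the target's standard deviation.
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)
    return np.sqrt(np.mean((pred - target) ** 2)) / target.std()

def most_similar_window(series, candidate, window):
    # Scan every length-`window` slice of `series` and return the start index
    # of the slice whose normalized shape is closest to the normalized candidate.
    errors = [_nrmse(_zscore(series[i:i + window]), _zscore(candidate[:window]))
              for i in range(len(series) - window + 1)]
    return int(np.argmin(errors))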
3 | 4 | 5 | from datetime import timedelta as dt 6 | import math 7 | import random 8 | 9 | import numpy as np 10 | import Oger, mdp 11 | import matplotlib.pyplot as plt 12 | import scikits.timeseries as ts 13 | from rtree import index 14 | 15 | import pywt 16 | import sg.utils 17 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 18 | import sg.data.sintef.userloads as ul 19 | import load_prediction 20 | 21 | user_id = 55864860 22 | 23 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 24 | 25 | window = 256 26 | 27 | candidate = sg.utils.Normalizer(dataset[:window+24,1]).normalized 28 | 29 | sim = np.argmin([ Oger.utils.nrmse(sg.utils.Normalizer(test[i:i+window,1]).normalized, candidate[:-24]) for i in range(len(test)) if len(test)-i >= window ]) 30 | print 'Done.' 31 | plt.plot(candidate, label='target') 32 | most_similar = sg.utils.Normalizer(test[sim:sim+window+24,1]).normalized 33 | plt.plot(most_similar, label='most similar, NRMSE %f' % Oger.utils.nrmse(most_similar[-24:], candidate[-24:])) 34 | plt.legend() 35 | plt.show() 36 | 37 | -------------------------------------------------------------------------------- /sg/models/test_wavelet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing. 3 | Author: Axel Tidemann 4 | """ 5 | 6 | from datetime import timedelta as dt 7 | import math 8 | import random 9 | 10 | import numpy as np 11 | import Oger, mdp 12 | import matplotlib.pyplot as plt 13 | import scikits.timeseries as ts 14 | 15 | import pywt 16 | import sg.utils 17 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 18 | import sg.data.sintef.userloads as ul 19 | import load_prediction 20 | 21 | from static import StaticNode 22 | 23 | user_id = 55864860 24 | 25 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 26 | 27 | #day = 24 28 | #today = random.randint(1000, dataset.shape[0]-day*2) 29 | #today = 4600 30 | 31 | #See if we can predict 24 times based on instances, learned from the training set. 32 | 33 | data_raw = sg.utils.Normalizer(dataset, axis=0) 34 | 35 | data = data_raw.normalized[:2**14,1] 36 | 37 | # One year is 365*24 = 8760 datapoints. If we round down to 8192, we will get 38 | # the maximum amount of scales for the decomposition (13), i.e. math.pow(2,13) 39 | # The number of levels/scales determine how far we look back. 40 | level = 4 41 | 42 | coeffs = pywt.swt(data, 'haar', level=level) 43 | 44 | # Collect coeffecients for training. Aj = 2 is taken from the paper. 45 | 46 | Aj = 2 47 | 48 | # The first 2^level datapoints cannot be used to predict because of lack of history. 49 | # level+1 because of the smooth array. 50 | x = np.zeros((len(data) - 2**level, (level+1)*Aj)) 51 | 52 | for i in range(len(x)): 53 | row = [] 54 | # Collect coefficients for each level. cAn, i.e. the smoothest array. 55 | for k in range(1, Aj+1): 56 | row.append(coeffs[-1][0][2**level + i - 2**level*(k-1)]) 57 | # cD, the details. 58 | for j in range(1, level+1): 59 | for k in range(1, Aj+1): 60 | row.append(coeffs[j-1][1][2**level + i - 2**j*(k-1)]) 61 | 62 | x[i] = np.array(row) 63 | 64 | # Target 65 | y = data_raw.normalized[2**level:,1] 66 | y.shape = (len(y), 1) 67 | 68 | # Split into train/test sets 69 | x_train = x[:356*24] 70 | y_train = y[:356*24] 71 | 72 | print 'Start ESN training...' 
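# [Editor's note -- clarification of the feature rows built in the loop above,
# following the code's own indexing of `coeffs`.] With level = 4 and Aj = 2,
# each row x[i] holds (level+1)*Aj = 10 wavelet coefficients taken around the
# "current" position t = 2**level + i:
#   - two samples of the smooth array coeffs[-1][0], at t and t - 2**level,
#   - for each detail array coeffs[j-1][1] (j = 1..4), samples at t and t - 2**j.
# The target y[i] is the normalized load at the same index t, so the readout
# learns to map these multi-scale coefficients to the corresponding load value.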
73 | 74 | # Do 24hr predictions based on single day instances 75 | x_24 = x[::24] 76 | y_24 = np.zeros((len(y)/24,24)) 77 | for i in range(len(y_24)): 78 | y_24[i] = np.transpose(y[i*24:i*24+24]) 79 | x_24_train = x_24[:365] 80 | y_24_train = y_24[:365] 81 | 82 | flow_24 = mdp.hinet.FlowNode(Oger.nodes.LeakyReservoirNode(input_dim = x_24.shape[1], output_dim = 100, spectral_radius = 0.9) + Oger.nodes.RidgeRegressionNode()) 83 | flow_24.train(x_24_train, y_24_train) 84 | flow_24.stop_training() 85 | 86 | x_24_test = x_24[365:-1] # There is one more element than the y target due to rounding. 87 | y_24_target = y_24[365:] 88 | 89 | y_24_test = flow_24(x_24_test) 90 | print 'NRMSE 24hr:', Oger.utils.nrmse(np.ndarray.flatten(y_24_test), np.ndarray.flatten(y_24_target)) 91 | 92 | plt.figure() 93 | plt.plot(np.ndarray.flatten(y_24_target), label='24 hr target') 94 | plt.plot(np.ndarray.flatten(y_24_test), label='24 hr test') 95 | plt.legend() 96 | 97 | # Test with a classifier ESN 98 | #reservoir = StaticNode(input_dim = x.shape[1], output_dim = 2000, spectral_radius = 0.9) 99 | reservoir = Oger.nodes.LeakyReservoirNode(input_dim = x.shape[1], output_dim = 2000, spectral_radius = 0.9) 100 | readout = Oger.nodes.RidgeRegressionNode() 101 | 102 | flow = mdp.hinet.FlowNode(reservoir + readout) 103 | flow.train(x_train, y_train) 104 | flow.stop_training() 105 | 106 | x_test = x[356*24:] 107 | y_target = y[356*24:] 108 | 109 | y_test = flow(x_test) 110 | print 'NRMSE:', Oger.utils.nrmse(y_test, y_target) 111 | 112 | plt.figure() 113 | plt.plot(data, label="Input loads") 114 | plt.plot(coeffs[-1][0], label='Smooth array') 115 | i = 1 116 | for _,cD in coeffs: 117 | plt.plot(cD, label='cD%i'%i) 118 | i += 1 119 | plt.legend() 120 | 121 | plt.figure() 122 | plt.plot(y_target, label='Target') 123 | plt.plot(y_test, label='Prediction') 124 | plt.legend() 125 | plt.show() 126 | 127 | -------------------------------------------------------------------------------- /sg/models/test_wavelet_retrieve.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing. 3 | Author: Axel Tidemann 4 | """ 5 | 6 | from datetime import timedelta as dt 7 | import math 8 | import random 9 | 10 | import numpy as np 11 | import Oger, mdp 12 | import matplotlib.pyplot as plt 13 | import scikits.timeseries as ts 14 | from rtree import index 15 | 16 | import pywt 17 | import sg.utils 18 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 19 | import sg.data.sintef.userloads as ul 20 | import load_prediction 21 | 22 | user_id = 55864860 23 | 24 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 25 | 26 | #See if we can predict 24 times based on instances, learned from the training set. 
27 | window = 512 28 | 29 | hours = [ sg.utils.Normalizer(dataset[i:i+window,1]).normalized for i in range(len(dataset)) if len(dataset)-i >= window+24 ] 30 | 31 | coeffs = [ pywt.wavedec(segment,'haar') for segment in hours ] 32 | 33 | # Grow tree 34 | p = index.Property() 35 | p.dimension = 20 36 | idx = index.Index(properties=p) 37 | 38 | i = 0 39 | for c in coeffs: 40 | key = [item for sublist in c for item in sublist ][:p.dimension] 41 | idx.insert(i, tuple(key)) 42 | i+=1 43 | 44 | def retrieve(query): 45 | query_key = [item for sublist in pywt.wavedec(query[:-24],'haar') for item in sublist ][:p.dimension] 46 | results = list(idx.nearest(tuple(query_key), 3)) 47 | print results 48 | plt.plot(query, label='query') 49 | for i in range(len(results)): 50 | candidate = sg.utils.Normalizer(dataset[results[i]:results[i]+window+24,1]).normalized 51 | plt.plot(candidate, label='candidate %i, NRMSE %f'%(i,Oger.utils.nrmse(candidate[-24:], query[-24:]))) 52 | plt.axvline(x=window, color='r', linewidth=1) 53 | plt.legend() 54 | plt.show() 55 | 56 | 57 | # Try to find 5 random points in the training dataset 58 | for test_point in np.random.permutation(range(len(dataset) - window - 24))[:5]: 59 | retrieve(sg.utils.Normalizer(dataset[test_point:test_point+window+24,1]).normalized) 60 | 61 | # Try 5 different random points in the test dataset 62 | for test_point in np.random.permutation(range(len(test) - window - 24))[:5]: 63 | retrieve(sg.utils.Normalizer(test[test_point:test_point+window+24,1]).normalized) 64 | -------------------------------------------------------------------------------- /sg/models/wavelet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/wavelet.pyc -------------------------------------------------------------------------------- /sg/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | Required packages to get the code running: 3 | * python 2.7 (or possibly another 2.x, see note for scikits below) 4 | * numpy, scipy and matplotlib 5 | * distribute (successor of setuptools) (not for macports?): http://pypi.python.org/pypi/distribute#installation-instructions 6 | * Pip (not macports?) http://www.pip-installer.org/en/latest/installing.html 7 | * scikits.timeseries from macports, pip or http://sourceforge.net/projects/pytseries/files/scikits.timeseries. Only for Python 2.6 in macports at the time of writing. 
8 | -------------------------------------------------------------------------------- /sg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | -------------------------------------------------------------------------------- /sg/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/__init__.pyc -------------------------------------------------------------------------------- /sg/utils/_test_template.py: -------------------------------------------------------------------------------- 1 | """This is a unit test skeleton, meant to be used as a template for the 2 | boilerplate code when creating a new unit test file.""" 3 | 4 | import os 5 | import sys 6 | import unittest 7 | 8 | import numpy as np 9 | 10 | import sg.utils.testutils as testutils 11 | 12 | from xxx import * 13 | 14 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | class Test(testutils.ArrayTestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def tearDown(self): 21 | pass 22 | 23 | def test_(self): 24 | """.""" 25 | pass 26 | 27 | class Test(unittest.TestCase): 28 | def setUp(self): 29 | pass 30 | 31 | def tearDown(self): 32 | pass 33 | 34 | def test_(self): 35 | """.""" 36 | pass 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | 42 | # if __name__ == "__main__": 43 | # from unittest import main 44 | # main(module="test_" + __file__[:-3]) 45 | 46 | -------------------------------------------------------------------------------- /sg/utils/analyze_gefcom_temp_genes.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous routines to import/extract/plot the evolution of the 2 | temperature genes in evolved forecasters for the GEFCom 2012 dataset.""" 3 | 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | 7 | def _plot_on_axis(means, station, ax): 8 | m = means['temp_{}'.format(station)] 9 | for i in range(30): 10 | try: 11 | m.ix[i].plot(ax=ax, color='r', legend=False) 12 | except: 13 | print "Trouble plotting run {}, station {}. Missing data?".format(i, station) 14 | plt.title('Temp station {}'.format(station)) 15 | 16 | def multi(means): 17 | """Create one plot for each temperature station in 'means'. Draw the 18 | evolution in each run as a separate line.""" 19 | for s in range(11): 20 | _plot_on_axis(means, s, plt.figure().gca()) 21 | 22 | def multi_sub(means, title=None): 23 | """Create one subplot for each temperature station in 'means'. 
Draw the 24 | evolution in each run as a separate line.""" 25 | fig = plt.figure() 26 | if title is not None: 27 | plt.suptitle(title) 28 | for s in range(11): 29 | _plot_on_axis(means, s, fig.add_subplot(3, 4, s+1)) 30 | 31 | # for s in range(11): 32 | # ax = fig.add_subplot(3, 4, s) 33 | # m = means['temp_{}'.format(s)] 34 | # for i in range(30): 35 | # m.ix[i].plot(ax=ax, color='b', alpha=0.3, legend=False) 36 | # plt.title('Temp station {}'.format(s)) 37 | 38 | def multi_2(means, stations=range(11), fig=None): 39 | """All runs and (the given) stations on the same plot""" 40 | if fig is None: 41 | fig = plt.figure() 42 | ax = fig.gca() 43 | columns = ['temp_{}'.format(s) for s in stations] 44 | lbls = ['Temperature Station {}'.format(s+1) for s in stations] 45 | for i in range(30): 46 | # means[columns].ix[i].plot(ax=ax, colormap='jet', alpha=1, legend=False) 47 | means[columns].ix[i].plot(ax=ax, color=['b', 'c', 'm', 'g', 'y', 'r'], alpha=1, legend=False) 48 | plt.legend(lbls, loc='right') 49 | # m.ix[0].plot(ax=ax, colormap='jet', alpha=0.05, legend=False) 50 | # for i in range(1,30): 51 | # m.ix[i].plot(ax=ax, colormap='jet', alpha=0.05, legend=False) 52 | # plt.title('Temp station {}'.format(t)) 53 | 54 | def import_from_csv(path): 55 | """Read the CSV file in 'path', output a pandas Dataframe with 11 56 | columns, one for each temperature gene, and 100 rows, one for each 57 | generation in each run. Each value is averaged across all 58 | individuals in all runs found in the CSV file. The CSV was typically 59 | made with a command similar to [...]/scripts/parse-logs-into-csv.sh output_*.txt.""" 60 | all = pd.read_csv(open(path, 'r')) 61 | cols = ['file', 'gen', 'fitn1', 'fitn2', 'hindsight', 'AR_order'] 62 | cols += [ 'temp_{}'.format(i) for i in range(11)] 63 | all.columns = cols 64 | grouped = all.groupby(['file', 'gen'], as_index='False') 65 | means = grouped.mean() 66 | stds = grouped.std() 67 | means = means.drop(['fitn1', 'fitn2', 'hindsight', 'AR_order'], axis=1) 68 | stds = stds.drop(['fitn1', 'fitn2', 'hindsight', 'AR_order'], axis=1) 69 | return means 70 | -------------------------------------------------------------------------------- /sg/utils/cache.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | class ATimeCache(object): 4 | """Cache class (dictionary) with a limited size, where only the 5 | 'max_entries' most recently added or accessed entries are stored.""" 6 | 7 | def __init__(self, max_entries): 8 | self._cache = OrderedDict() 9 | self._max_entries = max_entries 10 | 11 | def _shrink(self): 12 | while len(self._cache) > self._max_entries: 13 | self._cache.popitem(last=False) 14 | 15 | def get_max_entries(self): 16 | return self._max_entries 17 | 18 | def set_max_entries(self, value): 19 | self._max_entries = value 20 | self._shrink() 21 | 22 | max_entries = property( 23 | get_max_entries, set_max_entries, None, "Set or get the cache size") 24 | 25 | def has_key(self, key): 26 | return self._cache.has_key(key) 27 | 28 | def __eq__(self, other): 29 | try: 30 | return self._cache.__eq__(other._cache) 31 | except: 32 | return False 33 | 34 | def __len__(self): 35 | return self._cache.__len__() 36 | 37 | def __getitem__(self, key): 38 | value = self._cache.pop(key) 39 | self._cache[key] = value 40 | return value 41 | 42 | def __setitem__(self, key, value): 43 | if self._cache.has_key(key): 44 | self._cache.pop(key) 45 | self._cache.__setitem__(key, value) 46 | self._shrink() 47 | 48 | 
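# [Editor's note -- usage illustration; the import path is assumed to be
# sg.utils.cache.] Only the `max_entries` most recently added *or accessed*
# keys survive, and __getitem__ re-inserts the key it returns, so a read
# protects an entry from eviction:
#
#   cache = ATimeCache(max_entries=2)
#   cache['a'] = 1
#   cache['b'] = 2
#   _ = cache['a']      # 'a' becomes the most recently used entry
#   cache['c'] = 3      # evicts 'b', now the least recently used key
#   assert 'a' in cache and 'c' in cache and 'b' not in cache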
def __contains__(self, key): 49 | return self.has_key(key) 50 | 51 | def __str__(self): 52 | return self.cache.__str__() 53 | 54 | def __iter__(self): 55 | # Iterate directly on the underlying dict, rather than on this 56 | # class, in order to change the order of cached items (as 57 | # opposed to []/__getitem__, which will reinsert an item on top 58 | # of the stack whenever it is looked up. 59 | return iter(self._cache) 60 | 61 | if __name__ == "__main__": 62 | from unittest import main 63 | main(module="test_" + __file__[:-3]) 64 | 65 | -------------------------------------------------------------------------------- /sg/utils/genemapper.py: -------------------------------------------------------------------------------- 1 | """The genemappers map a real-valued gene to an allele.""" 2 | 3 | import math 4 | 5 | from pyevolve import GAllele, G1DList, Consts 6 | 7 | class _AlleleMapper(): 8 | def _get_normalized_gene(self, gene_val, gene_range): 9 | gene_norm = float(gene_val - gene_range[0]) / \ 10 | (gene_range[1] - gene_range[0]) 11 | if gene_norm < 0 or gene_norm > 1: 12 | raise ValueError("Gene value (%f) outside allowed range (%f - %f)." \ 13 | % (gene_val, gene_range[0], gene_range[1])) 14 | return gene_norm 15 | 16 | class MappedAlleleRange(GAllele.GAlleleRange, _AlleleMapper): 17 | """Subclass of GAllele.GAlleleRange that provides a way of mapping from a 18 | real-valued gene to a range allele gene.""" 19 | 20 | def __init__(self, begin=Consts.CDefRangeMin, 21 | end=Consts.CDefRangeMax, real=False, scaling='linear'): 22 | """See superclass for begin, end and real args. 'scaling' scales the 23 | mapping, and can be linear or log. If scaling is log, then begin < end 24 | must hold.""" 25 | GAllele.GAlleleRange.__init__(self, begin, end, real) 26 | self._scaling = scaling 27 | 28 | def map_to_allele(self, gene_val, gene_range): 29 | """Map a gene value in gene_range to the corresponding allele value.""" 30 | if len(self.beginEnd) != 1: 31 | raise NotImplementedError("The mapper can currently only handle " \ 32 | "alleles with a single range.") 33 | gene_norm = self._get_normalized_gene(gene_val, gene_range) 34 | beginEnd = self.beginEnd[0] 35 | to_range = (beginEnd[1] - beginEnd[0]) 36 | if self._scaling == 'log': 37 | to_range = math.log(1 + to_range) 38 | mapped_val = beginEnd[0] + math.exp(gene_norm * to_range) - 1 39 | elif self._scaling == 'linear': 40 | mapped_val = beginEnd[0] + gene_norm * to_range 41 | else: 42 | raise ValueError("Unknown scaling method: %s" % self._scaling) 43 | if not self.real: 44 | return int(round(mapped_val)) 45 | return max(beginEnd[0], min(beginEnd[1], mapped_val)) 46 | 47 | class MappedAlleleList(GAllele.GAlleleList, _AlleleMapper): 48 | """Subclass of GAllele.GAlleleList that provides a way of mapping from a 49 | real-valued gene to a list allele gene.""" 50 | 51 | def map_to_allele(self, gene_val, gene_range): 52 | gene_norm = self._get_normalized_gene(gene_val, gene_range) 53 | to_idx = int(gene_norm * len(self.options)) 54 | # In case gene_norm == 1: 55 | if to_idx == len(self.options): 56 | to_idx -= 1 57 | return self.options[to_idx] 58 | 59 | 60 | def map_to_alleles(genome): 61 | """Maps from a (real-valued G1DList) genome to a list of allele genes.""" 62 | alleles = genome.getParam("allele") 63 | genes = genome[:] 64 | gene_range = (genome.getParam("rangemin"), genome.getParam("rangemax")) 65 | return [alleles[i].map_to_allele(genes[i], gene_range) 66 | for i in range(len(genes))] 67 | 68 | 69 | 70 | if __name__ == "__main__": 71 | from 
unittest import main 72 | main(module="test_" + __file__[:-3]) 73 | 74 | -------------------------------------------------------------------------------- /sg/utils/genemapper.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/genemapper.pyc -------------------------------------------------------------------------------- /sg/utils/output.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/output.pyc -------------------------------------------------------------------------------- /sg/utils/plot_fitnesses.py: -------------------------------------------------------------------------------- 1 | import sqlite3 as sql 2 | import glob 3 | import numpy as np 4 | import sys 5 | import optparse 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | import sg.utils 10 | 11 | def fetch_one(db_path, exp_id="ex1"): 12 | """Fetch one evolution, identified by exp_id, from the database given in 13 | db_path. Returns an array holding: generation, min/avg/max fitness, and 14 | fitness std dev.""" 15 | with sql.connect(db_path) as conn: 16 | crs = conn.execute("select generation, rawMin, rawAve, rawMax, " 17 | "rawDev from statistics") 18 | return np.array(zip(*crs.fetchall())) 19 | 20 | def fetch_match(pattern, exp_id="ex1"): 21 | """Fetch all files matching pattern.""" 22 | # While tempting to test for isinstance(pattern, collections.iterable) in 23 | # order to support multiple patterns (e.g. "fetch_match(sys.argv[1:])"), a 24 | # string (such as the pattern) will also pass this test. 25 | return [fetch_one(path, exp_id) for path in glob.glob(pattern)] 26 | 27 | def _common_generations(evolutions): 28 | """Return the list of evolutions shortened to only common generations.""" 29 | stats_lengths = [evo.shape[1] for evo in evolutions] 30 | last_common_idx = min(stats_lengths) 31 | first_evo = evolutions[0] 32 | last_common_gen = first_evo[0, last_common_idx - 1] 33 | for evo in evolutions[1:]: 34 | for gen in range(last_common_idx): 35 | if evo[0,gen] != first_evo[0,gen]: 36 | last_common_gen = min(last_common_gen, gen) 37 | last_common_idx = gen 38 | break 39 | 40 | max_gen = max([evo[0, -1] for evo in evolutions]) 41 | print "max gen is", max_gen 42 | print "last_common_gen is", last_common_gen 43 | if last_common_gen < max_gen: 44 | print >>sys.stderr, "Some generations missing in at least one " \ 45 | "simulation. Plotting only the first generations 0-%d." \ 46 | % last_common_gen 47 | print >>sys.stderr, "Lengths of statistics for each evolution:" 48 | print >>sys.stderr, "\t", stats_lengths 49 | print >>sys.stderr, "Generation at last common index for each evolution:" 50 | print >>sys.stderr, "\t", [evo[0, last_common_idx-1] 51 | for evo in evolutions] 52 | if last_common_idx <= 0: 53 | raise ValueError("No overlapping generations (one empty evolution?).") 54 | return [evo[:,:last_common_idx] for evo in evolutions] 55 | 56 | def join(evolutions): 57 | """Join the output from several evolutions. Evolutions is a list where each 58 | evolution element is an array as returned from fetch_one. 59 | 60 | Return generation and averages. 
61 | """ 62 | evolutions = _common_generations(evolutions) 63 | return np.average(np.array(evolutions), axis=0) 64 | 65 | def plot_evols(evolutions, generations=None, axes=None, **plt_kwargs): 66 | if axes is None: 67 | axes = plt.axes() 68 | col = sg.utils.Enum("gen", "min", "avg", "max", "dev") 69 | mg = -1 if generations is None else generations + 1 70 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.min,0:mg], 71 | label="Minimum", **plt_kwargs) 72 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.avg,0:mg], 73 | label="Average", **plt_kwargs) 74 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.max,0:mg], 75 | label="Maximum", **plt_kwargs) 76 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.dev,0:mg], 77 | label="Deviation", **plt_kwargs) 78 | return axes 79 | 80 | def _get_options(): 81 | parser = optparse.OptionParser() 82 | parser.usage = "[options] path_to_pyevolve.db [more_pyevolve.dbs]" 83 | parser.description = "Plot fitness averaged over evolutions from multiple Pyevolve sqlite3 databases" 84 | parser.add_option("--title", dest="title", default=None, help="Title for the plot") 85 | parser.add_option("--exp", dest="exp_id", default="ex1", help="Name identifying experiment in database") 86 | parser.add_option("--generations", dest="generations", type="int", default=None, help="Max number of generations to plot") 87 | parser.add_option("--ymin", dest="ymin", type="float", default=None, help="Fix Y axis to given min value") 88 | parser.add_option("--ymax", dest="ymax", type="float", default=None, help="Fix Y axis to given max value") 89 | return parser.parse_args() 90 | 91 | if __name__ == "__main__": 92 | options, args = _get_options() 93 | evolutions = [fetch_one(path, options.exp_id) for path in args] 94 | print "Plotting the average of %d evolutions."
% len(evolutions) 95 | average = join(evolutions) 96 | plot_evols(average, generations=options.generations) 97 | plt.legend(loc=(0.2, 0.2)) 98 | if options.ymin is not None: 99 | plt.ylim(ymin=options.ymin) 100 | if options.ymax is not None: 101 | plt.ylim(ymax=options.ymax) 102 | if options.title is not None: 103 | plt.title(options.title) 104 | plt.show() 105 | -------------------------------------------------------------------------------- /sg/utils/pyevolve_mpi.py: -------------------------------------------------------------------------------- 1 | """MPI variant of Pyevolve.""" 2 | 3 | import numpy as np 4 | import random 5 | import sys 6 | import collections 7 | 8 | from mpi4py import MPI 9 | import pyevolve 10 | from pyevolve.GPopulation import GPopulation, multiprocessing_eval 11 | from pyevolve.GSimpleGA import GSimpleGA 12 | 13 | import sg.utils 14 | import pyevolve_utils as pu 15 | from sg.utils.cache import ATimeCache 16 | 17 | class MPIPopulation(pu.SpecifiedPopulation): 18 | def __init__(self, ga, genome): 19 | self._ga = ga 20 | if isinstance(genome, pu.SpecifiedPopulation): 21 | if not isinstance(genome, MPIPopulation): 22 | raise RuntimeError("A non-MPI population has crept into the system!") 23 | pu.SpecifiedPopulation.__init__(self, genome) 24 | 25 | def _make_data_cache_key(self): 26 | key = [] 27 | train_iter = self._ga.model.dataset.train_data_iterator() 28 | for (data_in, data_out) in train_iter(): 29 | key += [data_in.index[0].value, data_in.index[-1].value, 30 | data_out.index[0].value, data_out.index[-1].value] 31 | return tuple(key) 32 | 33 | def _make_cache_key(self, indiv): 34 | genome = pu.raw_genes(indiv, True) 35 | return tuple(sg.utils.safe_deep_flatten(genome)) 36 | 37 | def evaluate(self, **args): 38 | if self._ga.rank == 0: 39 | cache = self._ga.caches[self._make_data_cache_key()] 40 | keys = [self._make_cache_key(indiv) for indiv in self.internalPop] 41 | uncached = [key not in cache for key in keys] 42 | uncached_indices = np.where(uncached)[0] 43 | cached_indices = np.where(np.logical_not(uncached))[0] 44 | unevaled_pop = [self.internalPop[index] for index in uncached_indices] 45 | pop_size = len(unevaled_pop) 46 | print "Cache size is {}, unevaluated population size is {}. Now scattering"\ 47 | .format(len(cache), pop_size) 48 | sys.stdout.flush() 49 | indices = np.linspace(0, pop_size, self._ga.nhosts+1).astype('int') 50 | scattered = [unevaled_pop[start:end] for (start, end) in \ 51 | zip(indices[:-1], indices[1:])] 52 | else: 53 | scattered = None 54 | indivs = self._ga.comm.scatter(scattered) 55 | fitnesses = np.array([multiprocessing_eval(indiv) for indiv in indivs]) 56 | # print "Evaluation of {} indivs complete on host {}".format(len(indivs), self._ga.rank) 57 | sys.stdout.flush() 58 | all_fitnesses = self._ga.comm.gather(fitnesses) 59 | if self._ga.rank == 0: 60 | # Fetch from cache before adding newly evaluated genomes, as 61 | # these may otherwise delete old cached entries before their 62 | # values are retrieved. 
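# (Concretely: each per-dataset cache is an ATimeCache with a fixed capacity,
# and it evicts the least recently touched entry on insertion, so writing the
# new scores first could push out the very entries read back just below.)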
63 | for index in cached_indices: 64 | self.internalPop[index].score = cache[keys[index]] 65 | for index, score in zip(uncached_indices, np.concatenate(all_fitnesses)): 66 | self.internalPop[index].score = score 67 | cache[keys[index]] = score 68 | self.clearFlags() 69 | 70 | 71 | class SimpleMPIGA(pu.SimpleGAWithFixedElitism): 72 | def __init__(self, model, genome, seed=None, interactiveMode=True): 73 | self._init_MPI() 74 | self._model = model 75 | self._caches = collections.defaultdict(lambda: ATimeCache(1000)) 76 | pu.SimpleGAWithFixedElitism.__init__(self, genome, seed, interactiveMode) 77 | 78 | def _init_MPI(self): 79 | self._comm = MPI.COMM_WORLD 80 | self._nhosts = self._comm.Get_size() 81 | self._rank = self._comm.Get_rank() 82 | 83 | def make_population(self, genome): 84 | return MPIPopulation(self, genome) 85 | 86 | def evolve(self, freq_stats=0): 87 | if not self.terminationCriteria.isEmpty(): 88 | raise RuntimeError("Termination criteria other than number of generations unsupported under MPI.") 89 | if self._rank != 0: 90 | raise RuntimeError("Evolve should only be called on rank 0 process.") 91 | return pu.SimpleGAWithFixedElitism.evolve(self, freq_stats) 92 | 93 | def eval_loop(self): 94 | stopFlagCallback = False 95 | for gen in range(self.nGenerations + 1): 96 | self.internalPop.evaluate() 97 | if not self.stepCallback.isEmpty(): 98 | for it in self.stepCallback.applyFunctions(self): 99 | stopFlagCallback = it 100 | if stopFlagCallback: 101 | break 102 | 103 | @property 104 | def model(self): 105 | return self._model 106 | 107 | @property 108 | def caches(self): 109 | return self._caches 110 | 111 | @property 112 | def comm(self): 113 | return self._comm 114 | 115 | @property 116 | def rank(self): 117 | return self._rank 118 | 119 | @property 120 | def nhosts(self): 121 | return self._nhosts 122 | -------------------------------------------------------------------------------- /sg/utils/queue_jobs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import sys 4 | import os 5 | 6 | class JobSubmitter(object): 7 | def __init__(self, stream): 8 | self.batch_size = 1 9 | self.max_in_queue = 10 10 | self.wait_time_secs = 5 11 | self._jobs = [job[:-1] for job in stream] 12 | self._user = os.environ['USER'] 13 | self._queue_status = "" 14 | self._slots_available = 0 15 | 16 | def _sys_cmd(self, cmd): 17 | #print "_sys_cmd with command:", cmd 18 | return subprocess.check_output(cmd, shell=True) 19 | 20 | def _log(self, *args): 21 | sys.stdout.write(*args) 22 | sys.stdout.write("\n") 23 | sys.stdout.flush() 24 | 25 | def _jobs_remaining(self): 26 | return len(self._jobs) > 0 27 | 28 | def _is_job_running(self, job): 29 | return self._queue_status.find(job) >= 0 30 | 31 | def _update_queue_status(self): 32 | self._queue_status = self._sys_cmd("qstat -f") 33 | 34 | def _resources_available(self): 35 | cmd = "qstat |grep %s |wc |awk '{print $1}'" % self._user 36 | in_queue = int(self._sys_cmd(cmd)[:-1]) 37 | self._slots_available = self.max_in_queue - in_queue 38 | return self._slots_available > 0 39 | 40 | def _submit_more_jobs(self): 41 | self._update_queue_status() 42 | submitted = 0 43 | max_submissions = min(self._slots_available, self.batch_size) 44 | while self._jobs and submitted < max_submissions: 45 | job = self._jobs.pop(0) 46 | if self._is_job_running(job): 47 | self._log("Skipping job, already running: " + job) 48 | else: 49 | self._sys_cmd("qsub " + job) 50 | self._log("Submitted job at " + 
time.strftime("%b. %d, %X: ") \ 51 | + job) 52 | submitted += 1 53 | return submitted 54 | 55 | def _wait(self, brief=False): 56 | if brief: 57 | time.sleep(15) 58 | else: 59 | time.sleep(self.wait_time_secs) 60 | 61 | def submit_jobs(self): 62 | self._log("Queueing " + str(len(self._jobs)) + " jobs for submission in " + \ 63 | "batches of " + str(self.batch_size) + ". Polling the queue " + \ 64 | "for free space every " + str(self.wait_time_secs/60.0) + " minutes.") 65 | while self._jobs_remaining(): 66 | brief_wait = False 67 | if self._resources_available(): 68 | num_submitted = self._submit_more_jobs() 69 | brief_wait = num_submitted < self._slots_available 70 | self._wait(brief=brief_wait) 71 | self._log("All jobs submitted. Bye for now.") 72 | 73 | def get_options(): 74 | """Add prediction-related options to the parser. If no parser is provided, one 75 | will be created.""" 76 | import optparse 77 | parser = optparse.OptionParser() 78 | parser.usage = "[options] [jobfile]" 79 | parser.description = "Send jobs in batches to the queueing system. The list of jobs can be sent to stdin or be stored in jobfile." 80 | parser.add_option("--wait", dest="wait", type="float", help="Wait time in minutes between each check of the queue", default=10) 81 | parser.add_option("--queued", dest="queued", type="long", help="Max number of jobs in the queue at once", default=10) 82 | parser.add_option("--batch", dest="batch", type="long", help="Number of jobs to submit each time", default=1) 83 | (options, args) = parser.parse_args() 84 | return options, args 85 | 86 | def make_submitter(path=None): 87 | if path is None: 88 | if sys.stdin.isatty(): 89 | sys.stderr.write("You must submit a path or cat jobs to stdin.") 90 | exit(1) 91 | print "Reading jobs from standard input." 92 | return JobSubmitter(sys.stdin) 93 | else: 94 | print "Reading jobs from " + path + "." 95 | with open(path, "r") as f: 96 | return JobSubmitter(f) 97 | 98 | if __name__ == "__main__": 99 | options, args = get_options() 100 | submitter = make_submitter(args[0] if args else None) 101 | submitter.wait_time_secs = options.wait * 60 102 | submitter.batch_size = options.batch 103 | submitter.max_in_queue = options.queued 104 | submitter.submit_jobs() 105 | -------------------------------------------------------------------------------- /sg/utils/scripts/best-genomes-found.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$0 4 | if test -n "`type rev 2>/dev/null`"; then 5 | NAME="`echo $0 | rev | cut -d '/' -f 1 | rev`"; 6 | fi 7 | 8 | NAME=$0 9 | if test -n "`type basename 2>/dev/null`"; then 10 | NAME="`basename $0`"; 11 | fi 12 | 13 | USAGE="Usage: 14 | $NAME [options] [ file [ file2 ...] ] 15 | 16 | Print the best genomes found (as alleles) by evolution for each of the files 17 | given on the command line, or from standard input if no files are given. The 18 | script works by finding the last line containing the word 'alleles', and then 19 | printing everything inside brackets on that line. 20 | 21 | Options: 22 | -a 23 | Print all generations, not just the last (or first). 24 | -r 25 | Print raw genes (instead of genes mapped to allele ranges) from old log files. 26 | -m 27 | Print mapped genes from old log files. 28 | -h 29 | Prints this help. 30 | -f 31 | Print the first generation rather than the last 32 | " 33 | 34 | KEYWORD="Best genome at generation [0-9]\+: \[" 35 | TAIL="tail -n 1" 36 | while getopts afmrh'?' 
opt 37 | do 38 | case $opt in 39 | a) 40 | TAIL="cat";; 41 | f) 42 | TAIL="head -n 1";; 43 | r) 44 | KEYWORD="raw genes";; 45 | m) 46 | KEYWORD="alleles";; 47 | h|'?'|?|*) 48 | echo "$USAGE" 49 | exit 2;; 50 | esac 51 | done 52 | shift `expr $OPTIND - 1` 53 | 54 | if [ $# -eq 0 ]; then 55 | INPUT='-' 56 | else 57 | INPUT="$@" 58 | fi 59 | 60 | for f in $INPUT; do 61 | extension="${f##*.}" 62 | # Likewise, filename="${f%.*}" 63 | if [ "$extension" == "bz2" ]; then 64 | GREP=bzgrep 65 | else 66 | GREP=grep 67 | fi 68 | $GREP "$KEYWORD" "$f" |$TAIL; 69 | done |sed -e's/.*\[//; s/\]$//' 70 | -------------------------------------------------------------------------------- /sg/utils/scripts/list-finished-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=arima 4 | datasets="total-load bc-data" 5 | match="Error on test phase for best genome found" 6 | for model in $models; do 7 | for data in $datasets; do 8 | for (( r=0; $r<30; r++ )); do 9 | base=output_${model}_run_${r}_${data}-noholidays_; 10 | for prep in "" subtract-daily-pattern_ subtract-weekly-pattern_; do 11 | f=${base}${prep}100_0.txt; 12 | have_txt=false 13 | fin_txt=false 14 | have_bz2=false 15 | fin_bz2=false 16 | if [[ -e "$f" ]]; then 17 | have_txt=true 18 | if [[ -n "`grep \"$match\" $f`" ]]; then 19 | fin_txt=true 20 | fi 21 | fi 22 | if [[ -e "$f.bz2" ]]; then 23 | have_bz2=true 24 | if [[ -n "`bzgrep \"$match\" $f.bz2`" ]]; then 25 | fin_bz2=true 26 | fi 27 | fi 28 | if [[ ${have_txt} == true && ${have_bz2} == true ]]; then 29 | if [[ ${fin_txt} == true && ${fin_bz2} == true ]]; then 30 | echo "Duplicate, both finished: $f/.bz2" 31 | elif [[ ${fin_txt} == true ]]; then 32 | echo "Duplicate, only .txt finished: $f/.bz2" 33 | elif [[ ${fin_bz2} == true ]]; then 34 | echo "Duplicate, only .bz2 finished: $f/.bz2" 35 | else 36 | echo "Duplicate, both incomplete: $f/.bz2" 37 | fi 38 | elif [[ ${have_txt} == true ]]; then 39 | if [[ ${fin_txt} == true ]]; then 40 | echo "Finished: $f" 41 | else 42 | echo "Incomplete: $f (no .bz2)" 43 | fi 44 | elif [[ ${have_bz2} == true ]]; then 45 | if [[ ${fin_bz2} == true ]]; then 46 | echo "Finished: $f.bz2" 47 | else 48 | echo "Incomplete: $f.bz2 (no .txt)" 49 | fi 50 | else 51 | echo "Both missing: $f/.bz2" 52 | fi 53 | done; 54 | done 55 | done 56 | done 57 | -------------------------------------------------------------------------------- /sg/utils/scripts/parse-logs-into-csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script takes a list of output_....txt evolution log files as 4 | # input, and parses them into one big csv file where the fitnesses 5 | # (scaled and raw) and genes of each individual in each generation is 6 | # stored. Each line is prepended with the run number and the generation 7 | # number. 8 | # 9 | # Input: List of files to parse, already bunzipped 10 | # Output: Huge CSV file 11 | 12 | inputs=$@ 13 | output=all.csv 14 | 15 | dirs="generated filed genes stripped" 16 | for d in $dirs; do 17 | if test -e $d; then 18 | echo "Temporary directory '$d' already exists. Exiting." 19 | exit 20 | fi 21 | done 22 | if test -e $output; then 23 | echo "Output file '$output' already exists. Exiting." 24 | exit 25 | fi 26 | 27 | mkdir -v $dirs 28 | 29 | echo "Appending generation number to each line of logs..." 
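# Each log line gets the current generation counter prepended, comma-separated,
# e.g. a population-dump line from generation 3 becomes "3 , [ ... ]".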
30 | for f in $inputs; do 31 | awk 'BEGIN{gen=0} /Best genome at generation [0-9]* had/{gen++} {print gen, ",", $0}' $f >generated/$f; 32 | done 33 | 34 | echo "Appending run number to each line in each file..." 35 | pushd generated/ 36 | for f in $inputs; do 37 | run=`echo $f | awk -F_ '{print $4}'`; 38 | awk "{print $run, \",\", \$0}" $f >../filed/$f; 39 | done 40 | popd 41 | 42 | echo "Extracting lines with genes and fitnesses (population dump) from logs" 43 | pushd filed/ 44 | for f in $inputs; do 45 | gsed -n -e '/^[][0-9,[:space:]e.+-]\+$/p' $f |gsed -n -e'/[][]\+/p' >../genes/$f; 46 | done 47 | popd 48 | echo "Removing braces from genes, comma-separating fields." 49 | pushd genes/ 50 | for f in $inputs; do 51 | sed -e's/[][]//g' $f |sed -e's/\([0-9]\) \([0-9]\)/\1, \2/g'; 52 | done >../$output 53 | popd 54 | 55 | echo "CSV saved to $output." 56 | 57 | echo "Removing temporary directories." 58 | rm -rf $dirs 59 | -------------------------------------------------------------------------------- /sg/utils/scripts/resubmit-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$0 4 | if test -n "`type rev 2>/dev/null`"; then 5 | NAME="`echo $0 | rev | cut -d '/' -f 1 | rev`"; 6 | fi 7 | 8 | NAME=$0 9 | if test -n "`type basename 2>/dev/null`"; then 10 | NAME="`basename $0`"; 11 | fi 12 | 13 | submitter=$USER 14 | 15 | USAGE="Usage: 16 | $NAME jobscript [ more jobscripts...] 17 | 18 | Submit the job(s) given on the command line, as long as it/they aren't already 19 | present in the job queue. 20 | 21 | Options: 22 | -h 23 | Prints this help. 24 | -u user 25 | Check jobs submitted by user rather than jobs submitted by '$submitter'. 26 | " 27 | 28 | while getopts u:h'?' opt 29 | do 30 | case $opt in 31 | u) 32 | submitter=$OPTARG;; 33 | h|'?'|?|*) 34 | echo "$USAGE" 35 | exit 2;; 36 | esac 37 | done 38 | shift `expr $OPTIND - 1` 39 | 40 | if [ $# -eq 0 ]; then 41 | echo "$USAGE" 42 | exit 2 43 | fi 44 | 45 | for job in $@; do 46 | if [ -z "`qstat -f -u $submitter | grep "$job"`" ]; then 47 | echo "(Re)submitting job $job:" 48 | qsub "$job" 49 | else 50 | echo "Job '$job' already in queue, skipping." 51 | fi 52 | done 53 | -------------------------------------------------------------------------------- /sg/utils/scripts/split-test-validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Split the output from running a prediction model on the test set (i.e. the 4 | # output file from running a GA) into two halves: validation and test. 5 | # 6 | # In other words: filter the input, keeping only lines containing 7 | # $filter_pattern. Split the resulting set of lines in the middle, and average 8 | # the values found in the last field on each line in each half. 
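#
# Illustrative invocation (the file name below is just an example):
#   ./split-test-validate.sh output_esn_run_0_bc-data_100_0.txt.bz2
# prints the RMSE over the validation half and the test half of the filtered lines.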
9 | 10 | filter_pattern="^Error for test at" 11 | filter () { 12 | extension="${1##*.}" 13 | if [ "$extension" == "bz2" ]; then 14 | CAT=bzcat 15 | else 16 | CAT=cat 17 | fi 18 | $CAT $1 | sed -n -e"/$filter_pattern/p" 19 | } 20 | 21 | split_and_calc() { 22 | flines=`filter $1 | wc | awk '{print $1}'` 23 | filter $1 \ 24 | | awk "{ 25 | if (NR <= $flines/2) { 26 | valid += (\$NF)^2; 27 | vlines++; 28 | } else { 29 | test += (\$NF)^2; 30 | tlines++; 31 | } 32 | } 33 | END { 34 | print \"This script assumes RMSE for each day is based on the same number of predictions (24).\"; 35 | print \"RMSE on validation phase (\", vlines, \" lines): \", sqrt(valid/vlines); 36 | print \"RMSE on test phase (\", tlines, \" lines): \", sqrt(test/tlines); 37 | }" 38 | echo "" 39 | } 40 | 41 | split_and_calc $1 42 | -------------------------------------------------------------------------------- /sg/utils/scripts/summarize-simulation-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . $HOME/local/bin/query.sh 4 | 5 | NAME=$0 6 | if test -n "`type basename 2>/dev/null`"; then 7 | NAME="`basename $0`"; 8 | fi 9 | 10 | USAGE="Usage: 11 | $NAME [options] 12 | 13 | Summarize simulation results, with or without plotting fitness graphs 14 | 15 | Options: 16 | -g 17 | Show graphs 18 | " 19 | 20 | show_graphs="FALSE" 21 | 22 | while getopts 'gh?' opt 23 | do 24 | case $opt in 25 | g) 26 | show_graphs="TRUE";; 27 | h|'?'|?|*) 28 | echo "$USAGE" 29 | exit 2;; 30 | esac 31 | done 32 | shift `expr $OPTIND - 1` 33 | 34 | 35 | children() { 36 | ps -o pid,ppid,command | grep "[0-9][0-9]*[[:space:]]\+$$" | awk '{print $1}' 37 | } 38 | 39 | for dataset in _gef-data _gef-temp-data; do 40 | if [ -z "$dataset" ]; then 41 | ymax=0.7 42 | elif [ "$dataset" == "_total-load" ]; then 43 | ymax=0.07 44 | else 45 | ymax=0.009 46 | fi 47 | for model in ar; do 48 | clean="" #_no-cleaning # Axl: Everything is without cleaning. To avoid errors in the following loop (cleaning 49 | # and subtract can not be in the same parameter, since they both exist in the filename), 50 | # this is where the clean parameter loop should be. 51 | for subtract in _subtract-weekly-pattern ; do 52 | # Find the relevant log files. Using 'find' to ensure regexp search 53 | # rather than full wildcard matching on the '*' in the filename. 54 | GREP=bzgrep 55 | ext=txt.bz2 56 | pattern="./output_${model}_run_[0-9]*${dataset}${subtract}${clean}_100_0_zone_1" # Axl note: _100_0 was added. 57 | logs=`find . -regex "$pattern.${ext}"` 58 | if [ -z "$logs" ]; then 59 | echo -e "\nNo matches for $pattern.${ext}, trying non-compressed files." 60 | ext=txt 61 | logs=`find . -regex "$pattern.${ext}"` 62 | if [ -z "$logs" ]; then 63 | echo "No matches for $pattern.${ext} either." 
64 | continue 65 | fi 66 | GREP=grep 67 | fi 68 | #echo $logs 69 | # Use log file paths to find database paths 70 | dbs=`echo $logs | sed -e"s/output_/pyevolve_/g; s/\.${ext}/.db/g"` 71 | 72 | # Make a "nice" title 73 | if [ -z "$dataset" ]; then 74 | datasettxt="single-user" 75 | else 76 | datasettxt=$dataset 77 | fi 78 | if [ -z "$clean" ]; then 79 | cleantxt="with cleaning" 80 | else 81 | cleantxt=$clean 82 | fi 83 | nlogs=`ls -1 $logs 2>/dev/null | wc | awk '{print $1}'` 84 | ndbs=`ls -1 $dbs 2>/dev/null | wc | awk '{print $1}'` 85 | title="$model ${datasettxt} ${subtract} ${cleantxt} ($nlogs output logs, $ndbs databases)" 86 | 87 | # Calculate average prediction errors 88 | mm="python $HOME/local/bin/minmax.py" 89 | echo -e "\n $title" 90 | echo "Test set prediction error as stored in file (old and new runs use different measures):" 91 | $GREP "Error on test phase" $logs | awk '{print $12}' | $mm --header 92 | echo "Test set prediction error as mean of daily errors:" 93 | # for log in $logs; do 94 | # echo "$GREP 'Error for test at' $log" 95 | # $GREP 'Error for test at' $log | awk '{rows++; total += $NF}END{print total/rows}' 96 | # done 97 | for log in $logs; do 98 | $GREP 'Error for test at' $log | awk '{rows++; total += $NF}END{print total/rows}' 99 | done | $mm --header 100 | #echo "Fitnesses (not error) of last generation (rows=Min,ave,max,dev):" 101 | echo -n "Average maximum fitness (not error) of last generation: " 102 | # The sed selects the max fitness row, the awk selects the average column. 103 | for db in $dbs; do 104 | cat </dev/null & 118 | fi 119 | 120 | done 121 | echo "" 122 | done 123 | done 124 | 125 | if [ "${show_graphs}" == "TRUE" ]; then 126 | if query "Close plot windows?" "y"; then 127 | # Children will include the ps and grep processes, so redirect stderr. 
128 | kill $(children) 2>/dev/null 129 | exit 130 | fi 131 | fi 132 | 133 | -------------------------------------------------------------------------------- /sg/utils/test_cache.py: -------------------------------------------------------------------------------- 1 | """Test cache class(es).""" 2 | 3 | import os 4 | import unittest 5 | 6 | import sg.utils.testutils as testutils 7 | 8 | from cache import * 9 | 10 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | class TestCache(testutils.ArrayTestCase): 13 | def setUp(self): 14 | self.size = 10 15 | self.cache = ATimeCache(max_entries=self.size) 16 | 17 | def _overfill_cache(self): 18 | for i in range(self.size*2): 19 | self.cache[i] = i 20 | 21 | def test_max_size(self): 22 | """Check that the cache size doesn't exceed the given size.""" 23 | self._overfill_cache() 24 | self.assertEqual(self.size, len(self.cache)) 25 | 26 | def test_resize(self): 27 | self._overfill_cache() 28 | new_size = self.size - 2 29 | self.cache.max_entries = new_size 30 | self.assertEqual(new_size, len(self.cache)) 31 | self._overfill_cache() 32 | self.assertEqual(new_size, len(self.cache)) 33 | 34 | def test_store_retrieve(self): 35 | """Check that storage and retrieval works both on empty and full 36 | caches.""" 37 | self.cache[12] = 12 38 | self.assertEqual(self.cache[12], 12) 39 | self._overfill_cache() 40 | for i in range(self.size): 41 | self.cache[-i] = i*12 42 | for i in range(self.size): 43 | self.assertEqual(self.cache[-i], i*12) 44 | 45 | def test_retrieve_nonexisting(self): 46 | """Check that retrieval of a non-existing key fails.""" 47 | with self.assertRaises(KeyError): 48 | x = self.cache[0] 49 | 50 | def test_read_refreshes(self): 51 | """Check that a read refreshes the cache status.""" 52 | for i in range(100): 53 | self.cache[-12] = 1 54 | self.cache[i] = i 55 | with self.assertRaises(KeyError): 56 | x = self.cache[0] 57 | self.assertEqual(self.cache[-12], 1) 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | 62 | -------------------------------------------------------------------------------- /sg/utils/test_genemapper.py: -------------------------------------------------------------------------------- 1 | """Testing the gene mapper classes.""" 2 | 3 | import os 4 | import unittest 5 | 6 | from pyevolve import GAllele, G1DList 7 | 8 | from genemapper import * 9 | 10 | class TestGeneMapper(unittest.TestCase): 11 | def test_range_map(self): 12 | allele = MappedAlleleRange(10, 100) 13 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 10) 14 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 33) 15 | self.assertEqual(allele.map_to_allele(0, (-1, 1)), 55) 16 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 100) 17 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 18 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 19 | 20 | def test_float_range(self): 21 | allele = MappedAlleleRange(2, 4, real=True) 22 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 23 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2.5) 24 | self.assertEqual(allele.map_to_allele(0.4, (-1, 1)), 3.4) 25 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 4) 26 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 27 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 28 | 29 | def test_float_range_log(self): 30 | allele = MappedAlleleRange(2, 4, real=True, scaling='log') 31 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 32 | 
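# With scaling='log' the mapper computes
#   begin + exp(gene_norm * log(1 + (end - begin))) - 1,
# which for begin=2, end=4 is 1 + 3**gene_norm. A gene of -0.5 normalizes to
# 0.25 and maps to roughly 2.316 (not 2.5), and 0.4 maps to roughly 3.158
# (not 3.4), which is presumably why the linear expectations below are
# commented out.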
#self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2.5) 33 | #self.assertEqual(allele.map_to_allele(0.4, (-1, 1)), 3.4) 34 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 4) 35 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 36 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 37 | 38 | def test_long_list(self): 39 | allele = MappedAlleleList([2, 4, 12]) 40 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 41 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2) 42 | self.assertEqual(allele.map_to_allele(0.3, (-1, 1)), 4) 43 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 12) 44 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 45 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 46 | 47 | def test_short_list(self): 48 | allele = MappedAlleleList([-12]) 49 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), -12) 50 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), -12) 51 | self.assertEqual(allele.map_to_allele(0.3, (-1, 1)), -12) 52 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), -12) 53 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 54 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 55 | 56 | class TestGenomeMapper(unittest.TestCase): 57 | def setUp(self): 58 | self.alleles = GAllele.GAlleles() 59 | self.alleles.add(MappedAlleleRange(10, 100)) 60 | self.alleles.add(MappedAlleleRange(0, 2, real=True)) 61 | self.alleles.add(MappedAlleleList([2, 4, 12])) 62 | self.alleles.add(MappedAlleleList([-1])) 63 | self.genome = G1DList.G1DList(len(self.alleles)) 64 | self.genome.setParams(allele=self.alleles, rangemin=-1, rangemax=1) 65 | 66 | def test_map_genome(self): 67 | self.genome[:] = [-1, -0.5, 0.4, 1] 68 | mapped_genome = map_to_alleles(self.genome) 69 | self.assertEqual(mapped_genome, [10, 0.5, 12, -1]) 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | 74 | -------------------------------------------------------------------------------- /sg/utils/test_output.py: -------------------------------------------------------------------------------- 1 | """Test the output utilities.""" 2 | 3 | import os 4 | import unittest 5 | import copy 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import sg.utils.testutils as testutils 11 | from sg.globals import SG_DATA_PATH 12 | 13 | from output import * 14 | 15 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | class TestOutputMisc(testutils.ArrayTestCase): 18 | def setUp(self): 19 | test_file = "_test_series_bc_hydro_no_temperatures_esn_run_8_bc-data.pickle" 20 | test_path = os.path.join(_PATH_TO_HERE, test_file) 21 | self.dataset = load_pickled_prediction(test_path) 22 | 23 | def tearDown(self): 24 | pass 25 | 26 | def test_split_dataset(self): 27 | """Test splitting of the test dataset, which is known to have 262 28 | days.""" 29 | left_lengths = [26, 52, 78, 104, 131, 157, 183, 209, 235] 30 | for splits in zip(np.arange(0.1, 1, 0.1), left_lengths): 31 | (left, right) = split_dataset(self.dataset, splits[0]) 32 | self.assertEqual(len(left[1]), splits[1]) 33 | self.assertEqual(len(left[0]), splits[1] * len(self.dataset[1][0])) 34 | self.assertEqual(len(right[1]), 262 - splits[1]) 35 | self.assertArraysEqual(self.dataset[0], left[0].append(right[0])) 36 | for (whole_days, split_days) in zip(self.dataset[1], left[1] + right[1]): 37 | self.assertArraysEqual(whole_days, split_days) 38 | 39 | def test_sort_by_validation_error(self): 40 | """Test sorting by validation 
error by faking a number of datasets.""" 41 | datasets = [self.dataset] 42 | # Incrementally append copies with modified target signal 43 | for i in range(10): 44 | next_set = copy.deepcopy(datasets[i]) 45 | indices = np.random.random_integers(len(next_set), size=i+1) 46 | next_set[0][indices] *= 1.2 47 | datasets.append(next_set) 48 | # Permute to make sure they are not ordered on entry 49 | shuffled = [datasets[i] for i in np.random.permutation(len(datasets))] 50 | val_sorted = sort_data_by_validation_error(shuffled) 51 | def index_of(left, right): 52 | for i in range(len(datasets)): 53 | if np.all(datasets[i][0] == (left[0].append(right[0]))): 54 | return i 55 | for i, (error, (left, right)) in zip(range(len(val_sorted)), val_sorted): 56 | self.assertEqual(i, index_of(left, right)) 57 | 58 | def test_matching_paths(self): 59 | """Since the output of the matching_paths function depends on the 60 | contents of the working directory, the tests here may have to be 61 | updated when files are added to or removed from the relevant 62 | directory.""" 63 | # Use full path to ensure that it works also when running unit tests 64 | # from another directory. 65 | here_wc = os.path.join(_PATH_TO_HERE, "*") 66 | wildcards = [here_wc, "test", "py$", "output"] 67 | self.assertEqual(matching_paths(wildcards), 68 | [os.path.join(_PATH_TO_HERE, "test_output.py")]) 69 | wildcards = [here_wc, "__+", ".py$"] 70 | self.assertEqual(matching_paths(wildcards), 71 | [os.path.join(_PATH_TO_HERE, "__init__.py")]) 72 | there_wc = os.path.join(SG_DATA_PATH, "bchydro", "*") 73 | wildcards = [there_wc, "area", "[89]"] 74 | targets = [os.path.join(SG_DATA_PATH, "bchydro", fname) for fname in \ 75 | ("2008controlareaload.csv", "jandec2009controlareaload.csv")] 76 | self.assertEqual(matching_paths(wildcards), targets) 77 | 78 | 79 | 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | 84 | -------------------------------------------------------------------------------- /sg/utils/test_timer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import StringIO 3 | import time 4 | 5 | from timer import * 6 | 7 | class TimerTester(unittest.TestCase): 8 | def _wrapped_timing(self, stream): 9 | t = SimpleTimer(stream) 10 | time.sleep(0.1) 11 | 12 | def test_report_when_out_of_scope(self): 13 | stream = StringIO.StringIO() 14 | self._wrapped_timing(stream) 15 | output = stream.getvalue() 16 | self.assertIn("Started at", output) 17 | self.assertIn("Ended at", output) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /sg/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import cPickle as pickle 3 | import os 4 | import tempfile 5 | 6 | import numpy as np 7 | 8 | import testutils as testutils 9 | from utils import * 10 | 11 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | class NormalizerTester(testutils.ArrayTestCase): 14 | def setUp(self): 15 | self._data = np.array([0., 1., 2., 3., 4., 5.]) 16 | self._norm = np.array([0., 1./5, 2./5, 3./5, 4./5, 1.]) 17 | self._shifted = np.array([-3., -2., -1., 0., 1., 2.]) 18 | self._2d_data = np.array([[0., 1., 2.], [3., 4., 5.]]) 19 | self._2d_norm = np.array([[0., 1./5, 2./5], [3./5, 4./5, 1.]]) 20 | 21 | def test_normalize(self): 22 | normalizer = Normalizer(self._data) 23 | self.assertArraysEqual(normalizer.normalized, self._norm) 
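# Judging from the fixtures above, Normalizer does min-max scaling against the
# data passed to its constructor: the minimum here is 0 and the range is 5, so
# e.g. 3. maps to 3/5 = 0.6 (see self._norm).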
24 | self.assertArraysEqual(normalizer.normalize(self._data), self._norm) 25 | 26 | def test_normalize_other_data(self): 27 | normalizer = Normalizer(self._data) 28 | self.assertArraysAlmostEqual(normalizer.normalize(self._shifted), 29 | self._norm - 3./5) 30 | 31 | def test_expand(self): 32 | normalizer = Normalizer(self._data) 33 | self.assertArraysEqual(normalizer.expand(self._norm), self._data) 34 | 35 | def test_expand_other_data(self): 36 | normalizer = Normalizer(self._data) 37 | shifted_norm = np.array([-0.6, -0.4, -0.2, 0., 0.2, 0.4]) 38 | expanded = normalizer.expand(shifted_norm) 39 | self.assertArraysEqual(expanded, self._shifted) 40 | 41 | def test_twodim_flatten(self): 42 | normalizer = Normalizer(self._2d_data) 43 | self.assertArraysEqual(normalizer.normalized, self._2d_norm) 44 | self.assertArraysEqual(normalizer.normalize(self._data), self._norm) 45 | 46 | def test_twodim_axis_0(self): 47 | normalizer = Normalizer(self._2d_data, axis=0) 48 | self.assertArraysEqual(normalizer.normalized, 49 | np.array([[0, 0, 0], [1, 1, 1]])) 50 | self.assertArraysEqual(normalizer.expand([[-1, 0, 2], [1, 0.5, 2]]), 51 | np.array([[-3, 1, 8], [3, 2.5, 8]])) 52 | 53 | def test_twodim_axis_1(self): 54 | normalizer = Normalizer(self._2d_data, axis=1) 55 | self.assertArraysEqual(normalizer.normalized, 56 | np.array([[0, 0.5, 1], [0, 0.5, 1]])) 57 | 58 | 59 | class MiscTester(testutils.ArrayTestCase): 60 | def _test_enum_values(self, enum): 61 | self.assertEqual(enum.ZERO, 0) 62 | self.assertEqual(enum.ONE, 1) 63 | self.assertEqual(enum.TWO, 2) 64 | self.assertEqual(enum.NOT_THREE, 4) 65 | with self.assertRaises(AttributeError) as cm: 66 | x = enum.NONEXISTING 67 | 68 | def _make_enum(self): 69 | return Enum('ZERO', 'ONE', 'TWO', NOT_THREE=4) 70 | 71 | def test_enum_create(self): 72 | numbers = self._make_enum() 73 | self._test_enum_values(numbers) 74 | 75 | def test_pickle_enum(self): 76 | numbers = self._make_enum() 77 | storage = tempfile.NamedTemporaryFile(prefix='_test_utils_deleteme_', 78 | dir=_PATH_TO_HERE) 79 | pickle.dump(numbers, storage) 80 | storage.flush() 81 | storage.seek(0) 82 | numbers2 = pickle.load(storage) 83 | self._test_enum_values(numbers2) 84 | 85 | def test_indicer_values(self): 86 | manual = dict((('one', 0), 87 | ('two', 1), 88 | ('three', 2))) 89 | indices = indicer('one', 'two', 'three') 90 | self.assertEqual(indices, manual) 91 | 92 | def test_bound(self): 93 | self.assertEqual(bound(0, 1, -1), 0) 94 | self.assertEqual(bound(0, 1, 2), 1) 95 | self.assertEqual(bound(0, 1, 0.5), 0.5) 96 | 97 | def test_flatten(self): 98 | lists = ((1, 2), (3, 4), (5, 6)) 99 | flats = [1, 2, 3, 4, 5, 6] 100 | sublists = (((1, 2), (3, 4)), ((5, 6))) 101 | subflats = [(1, 2), (3, 4), 5, 6] 102 | self.assertEqual(flatten(*lists), flats) 103 | self.assertEqual(flatten(*sublists), subflats) 104 | 105 | def test_safe_flatten(self): 106 | l = [[1, 2, 3], 9, [[11, 12], [13, 14]], 22, 24] 107 | shallow = [1, 2, 3, 9, [11, 12], [13, 14], 22, 24] 108 | deep = [1, 2, 3, 9, 11, 12, 13, 14, 22, 24] 109 | self.assertEqual(list(safe_shallow_flatten(l)), shallow) 110 | self.assertEqual(list(safe_deep_flatten(l)), deep) 111 | 112 | def test_diffinv_determined(self): 113 | x = np.arange(10) 114 | diffed = np.diff(x) 115 | self.assertArraysEqual(diffed, np.ones(len(x) - 1)) 116 | self.assertArraysEqual(diffinv(diffed, xi=0), x) 117 | diffed = np.diff(x, n=2) 118 | self.assertArraysEqual(diffinv(diffed, n=2, xi=[0, 1]), x) 119 | # Difference increases by one each step 120 | x = np.array([1, 2, 4, 7, 11, 
16, 22, 29, 37, 46]) 121 | self.assertArraysEqual(np.diff(x), np.arange(1, len(x))) 122 | self.assertArraysEqual( 123 | diffinv(np.diff(x, n=2), n=2, xi=[1, 2]), x) 124 | 125 | def test_diffinv_roundtrip(self): 126 | diffed = np.arange(10) 127 | for diff_order in range(10): 128 | xi = np.arange(diff_order) 129 | x = diffinv(diffed, n=diff_order, xi=xi) 130 | re_diff = np.diff(x, n=diff_order) 131 | re_x = diffinv(re_diff, n=diff_order, xi=xi) 132 | self.assertArraysEqual(diffed, re_diff) 133 | self.assertArraysEqual(x, re_x) 134 | 135 | if __name__ == "__main__": 136 | unittest.main() 137 | -------------------------------------------------------------------------------- /sg/utils/testutils.py: -------------------------------------------------------------------------------- 1 | """Unit testing utilities.""" 2 | 3 | import os 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | class ArrayTestCase(unittest.TestCase): 9 | """This class adds some extra assertions in order to simplify working with 10 | numpy matrices. Uses oddCamelCase in public method names to be consistent 11 | with the asserts in unittest.TestCase.""" 12 | 13 | def assertArraysAlmostEqual(self, first, second, 14 | places=7, msg=None, delta=None): 15 | """Assert that two or more numpy arrays have the same shape and contain 16 | elements that are almost equal.""" 17 | self._generic_multi_array_assert(self.assertAlmostEqual, first, second, 18 | places=places, msg=msg, delta=delta) 19 | 20 | def assertArraysEqual(self, first, second, msg=None): 21 | """Assert that two or more numpy arrays have the same shape and contain 22 | elements that are equal.""" 23 | self._generic_multi_array_assert(self.assertEqual, first, second, 24 | msg=msg) 25 | 26 | def assertNaNArraysEqual(self, first, second, msg=None): 27 | """Assert that two or more numpy arrays have the same shape and contain 28 | elements that are equal. This is the same as assertArraysEqual, but 29 | with the addition of NaN == NaN.""" 30 | nans_first = np.isnan(first) 31 | nans_second = np.isnan(second) 32 | self.assertArraysEqual(nans_first, nans_second, msg=msg) 33 | self.assertArraysEqual(first[np.where(nans_first == False)[0]], 34 | second[np.where(nans_second == False)[0]], msg=msg) 35 | 36 | def _assert_are_arrays(self, *arrays): 37 | """Check that all the arrays passed in are actually numpy arrays.""" 38 | for array in arrays: 39 | self.assertIsInstance(array, np.ndarray) 40 | 41 | def _assert_same_shape_arrays(self, *arrays): 42 | """Check that all the arrays passed in have the same shape.""" 43 | self.assertGreater(len(arrays), 1) 44 | shape1 = arrays[0].shape 45 | for ar in arrays[1:]: 46 | self.assertEqual(shape1, ar.shape) 47 | 48 | def _assert_are_similar_arrays(self, *arrays): 49 | """Check that the arrays passed in are "similar": they are all numpy 50 | arrays with the same shape.""" 51 | self._assert_are_arrays(*arrays) 52 | self._assert_same_shape_arrays(*arrays) 53 | 54 | def _generic_multi_array_assert(self, assertion, first, second, **kwargs): 55 | """Generic array assert for several arrays. 
Checks that the arrays are 56 | similar, then performs the assertion element-by-element.""" 57 | self._assert_are_similar_arrays(first, second) 58 | flats = [array.flatten() for array in (first, second)] 59 | for elements in zip(*flats): 60 | assertion(*elements, **kwargs) 61 | 62 | -------------------------------------------------------------------------------- /sg/utils/testutils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/testutils.pyc -------------------------------------------------------------------------------- /sg/utils/timer.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import sys 4 | from datetime import timedelta as dt 5 | 6 | class SimpleTimer(): 7 | """Basic timer class, initial code by Lester. 8 | 9 | Basic usage: 10 | timer = SimpleTimer() 11 | ...lost of slow code here... 12 | # Optional, will be called by destructor unless called manually: 13 | report = timer.end() 14 | print report 15 | 16 | Use cProfile for more in-depth profiling.""" 17 | 18 | def __init__(self, output_stream=sys.stdout): 19 | """Start timing (may be restarted by explicit calls to start()). Output 20 | printed by start() and end() will be printed to output_stream unless 21 | this is Null.""" 22 | self.times = [] 23 | self.labels = [] 24 | self._has_ended = False 25 | self._stream = output_stream 26 | self.start() 27 | 28 | def __del__(self): 29 | if (not self._has_ended) and self._stream is not None: 30 | print >>self._stream, self.end() 31 | 32 | def start(self): 33 | self.times = [time.time()] 34 | self.labels = ["start"] 35 | if self._stream is not None: 36 | print >>self._stream, "Started at", time.asctime() 37 | 38 | def end(self): 39 | self._has_ended = True 40 | self.lap("end") 41 | if self._stream is not None: 42 | print >>self._stream, "Ended at", time.asctime() 43 | return self.report() 44 | 45 | def lap(self,label): 46 | self.times.append(time.time()) 47 | self.labels.append(label) 48 | 49 | @staticmethod 50 | def seconds_to_string(seconds): 51 | whole_secs = int(seconds) 52 | micros = int((seconds - whole_secs) * 1000000) 53 | delta = dt(seconds=whole_secs, microseconds=micros) 54 | if micros == 0: 55 | return str(delta) 56 | else: 57 | return str(delta)[:-4] 58 | 59 | @staticmethod 60 | def period_to_string(start_time, end_time): 61 | seconds = (end_time - start_time) 62 | return SimpleTimer.seconds_to_string(seconds) 63 | 64 | def report(self): 65 | s = "Finished in %s: " % self.period_to_string(self.times[0], 66 | self.times[-1]) 67 | for i in range(1,len(self.labels)-1): 68 | s += "%s %s, " % (self.labels[i], 69 | self.period_to_string(self.times[i-1], 70 | self.times[i])) 71 | return s[:-2] + "." 
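# A lap-timing sketch (illustrative; the function names are placeholders):
#
#   timer = SimpleTimer()
#   load_data()
#   timer.lap("load")
#   train_model()
#   timer.lap("train")
#   print timer.end()   # e.g. "Finished in 0:00:07: load 0:00:02, train 0:00:05."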
72 | 73 | 74 | if __name__ == "__main__": 75 | from unittest import main 76 | main(module="test_" + __file__[:-3]) 77 | 78 | -------------------------------------------------------------------------------- /sg/utils/timer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/timer.pyc -------------------------------------------------------------------------------- /sg/utils/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/utils.pyc --------------------------------------------------------------------------------