├── .gitignore ├── README.md ├── data.zip ├── getting_started.pdf └── sg ├── __init__.py ├── __init__.pyc ├── data ├── __init__.py ├── __init__.pyc ├── bchydro │ ├── __init__.py │ ├── __init__.pyc │ ├── bcholidays.txt │ ├── bchydro.py │ ├── bchydro.pyc │ ├── data_for_testing.csv │ ├── demo_bchydro.py │ ├── holiday_parser.py │ ├── readme.txt │ ├── test_bchydro.py │ └── test_holiday_parser.py ├── dataset.py ├── dataset.pyc ├── demo_dataset.py ├── eklima │ ├── __init__.py │ ├── __init__.pyc │ ├── parse_eklima_xml.py │ └── parse_eklima_xml.pyc ├── eunite │ ├── __init__.py │ ├── eunite.py │ ├── import_csv_to_sqlite.py │ ├── readme.txt │ └── test_eunite.py ├── sintef │ ├── Makefile │ ├── README.txt │ ├── __init__.py │ ├── __init__.pyc │ ├── anonymize_gs2.cpp │ ├── convert_scikits_to_pandas_hdf5.py │ ├── create_full_temp_data.py │ ├── create_full_temp_data.pyc │ ├── data_for_eirik.py │ ├── eb_userloads.py │ ├── find_binary.cpp │ ├── gs2-do.sh │ ├── gs2-grep.sh │ ├── gs2.txt │ ├── gs2_for_prediction.txt │ ├── gs2_short.txt │ ├── make-list-of-gs2-files.sh │ ├── map_EIA_to_anonymous.py │ ├── parse_gs2.py │ ├── parse_gs2.pyc │ ├── plot_temp.py │ ├── plot_temp_misc.py │ ├── preprocess_gs2.py │ ├── select_meters.py │ ├── test_parse_gs2.py │ ├── test_userloads.py │ ├── test_userloads.pyc │ ├── testfile.gs2 │ ├── testfile_short.gs2 │ ├── unique.py │ ├── userloads.py │ └── userloads.pyc ├── test_dataset.py └── yr.no │ ├── README.txt │ ├── crontab.txt │ └── get-forecasts.sh ├── globals.py ├── globals.pyc ├── models ├── __init__.py ├── __init__.pyc ├── arima.py ├── bfgs.py ├── demo_cleansing.py ├── error_functions.py ├── esn.py ├── esn.pyc ├── exp_cleaning.py ├── filter-R-messages.py ├── ga.py ├── ga.pyc ├── gaussian_process_intro.py ├── genome_evaluator.py ├── gridopt_load_prediction.py ├── gui.py ├── gui.pyc ├── lib_atlas │ ├── BsplineAnalyticSmoother.cpp │ ├── BsplineAnalyticSmoother.h │ └── Makefile ├── lib_mkl │ ├── BsplineAnalyticSmoother.cpp │ ├── BsplineAnalyticSmoother.h │ └── Makefile ├── linear.py ├── load_cleansing.py ├── load_cleansing.pyc ├── load_prediction.py ├── load_prediction.py.orig ├── load_prediction.pyc ├── load_prediction.py~ ├── load_prediction_CBR.py ├── load_prediction_ar.py ├── load_prediction_ar24.py ├── load_prediction_arima.py ├── load_prediction_averagedaily.py ├── load_prediction_averagehourly.py ├── load_prediction_dshw.py ├── load_prediction_esn.py ├── load_prediction_esn24.py ├── load_prediction_identity.py ├── load_prediction_regul_ar.py ├── load_prediction_taohong.py ├── load_prediction_wavelet.py ├── load_prediction_wavelet.py.orig ├── load_prediction_wavelet24.py ├── manual_load_prediction_gridsearch.py ├── mixture_of_experts.py ├── model.py ├── model.pyc ├── onemax_mpi.py ├── pattern_eliminators.py ├── regul_ar.py ├── regul_ar_grid_search.py ├── roughness.tex ├── run_experiments.py ├── run_experiments_params.py ├── spclean.py ├── spclean.pyc ├── spclean_wrapper.py ├── splines.py ├── splines.pyc ├── static.py ├── static.pyc ├── subset_runs │ └── make-runfiles.sh ├── taohong.py ├── test_arima.py ├── test_esn.py ├── test_sequence_scan.py ├── test_spclean.py ├── test_splines.py ├── test_wavelet.py ├── test_wavelet_retrieve.py ├── wavelet.py └── wavelet.pyc ├── requirements.txt └── utils ├── __init__.py ├── __init__.pyc ├── _test_template.py ├── analyze_gefcom_temp_genes.py ├── cache.py ├── genemapper.py ├── genemapper.pyc ├── output.py ├── output.pyc ├── plot_fitnesses.py ├── pyevolve_mpi.py ├── pyevolve_utils.py ├── pyevolve_utils.py~ ├── queue_jobs.py ├── scripts ├── 
best-genomes-found.sh ├── list-finished-jobs.sh ├── parse-logs-into-csv.sh ├── resubmit-jobs.sh ├── split-test-validate.sh └── summarize-simulation-results.sh ├── test_cache.py ├── test_genemapper.py ├── test_output.py ├── test_pyevolve_utils.py ├── test_timer.py ├── test_utils.py ├── testutils.py ├── testutils.pyc ├── timer.py ├── timer.pyc ├── utils.py ├── utils.py.orig ├── utils.pyc └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pickle 2 | *.db 3 | .hg* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | load_forecasting 2 | ================ 3 | 4 | A framework for short-term load forecasting in which seasonal cycles are removed from the input signal. A genetic algorithm is used to tune the parameters of the various forecasting models. 5 | 6 | If you use this software, please cite: 7 | 8 | ``` 9 | @Article{hoverstadr:_three_stage_approac_load_forec, 10 | author = {Boye Annfelt Høverstad and Axel Tidemann and Helge Langseth and Pinar {\"O}zt{\"u}rk}, 11 | title = {Short term load forecasting with seasonal decomposition using evolution for parameter tuning}, 12 | journal = {IEEE Transactions on Smart Grid}, 13 | year = 2015} 14 | ``` 15 | 16 | Because of data ownership restrictions, we can only publish two datasets: GEFCOM 2012 and BCHydro. Both are provided in data.zip. You will have to adjust the paths to these data files in the source code accordingly. For installation instructions, see getting_started.pdf. 17 | 18 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/data.zip -------------------------------------------------------------------------------- /getting_started.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/getting_started.pdf -------------------------------------------------------------------------------- /sg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/__init__.py -------------------------------------------------------------------------------- /sg/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/__init__.pyc -------------------------------------------------------------------------------- /sg/data/__init__.py: -------------------------------------------------------------------------------- 1 | from dataset import * 2 | -------------------------------------------------------------------------------- /sg/data/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/__init__.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/__init__.py: -------------------------------------------------------------------------------- 1 | from bchydro import * 2 | 
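# A minimal usage sketch (see demo_bchydro.py below for the full demo). The
# star import above re-exports load() and Dataset from bchydro.py, so the
# hourly BC Hydro load series can be read as, e.g.:
#
#   from datetime import timedelta as dt
#   import sg.data.bchydro as bc
#   timeseries = bc.load()  # the entire load series
#   dataset = bc.Dataset(period=dt(days=30), step_length=dt(days=7))
#   one_month = dataset.get_random_period()  # a random 30-day slice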
-------------------------------------------------------------------------------- /sg/data/bchydro/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/bchydro/__init__.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/bcholidays.txt: -------------------------------------------------------------------------------- 1 | Holiday Day Observed Notes 2 | New Year's Day January 1 3 | Good Friday Friday before Easter Sunday 4 | Easter Monday Monday after Easter Sunday Not a statutory holiday, but bank holiday and federally regulated employers give the day off. 5 | Victoria Day Monday before May 25 6 | Canada Day July 1 or July 2 if July 1 is a Sunday 7 | Civic Holiday First Monday of August A.k.a. British Columbia Day in BC 8 | Labour Day First Monday of September 9 | Thanksgiving Second Monday of October 10 | Remembrance Day November 11 11 | Christmas Day December 25 12 | Family Day Second Monday of February Introduced in 2013 13 | -------------------------------------------------------------------------------- /sg/data/bchydro/bchydro.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/bchydro/bchydro.pyc -------------------------------------------------------------------------------- /sg/data/bchydro/data_for_testing.csv: -------------------------------------------------------------------------------- 1 | January 1, 2004; 1; 7160 2 | January 1, 2004; 2; 6853 3 | January 1, 2004; 3; 6634 4 | January 1, 2004; 4; 6468 5 | January 1, 2004; 5; 6430 6 | January 1, 2004; 6; 6503 7 | January 1, 2004; 7; 6608 8 | January 1, 2004; 8; 6812 9 | January 1, 2004; 9; 6976 10 | January 1, 2004; 10; 7225 11 | January 1, 2004; 11; 7461 12 | January 1, 2004; 12; 7585 13 | January 1, 2004; 13; 7634 14 | January 1, 2004; 14; 7564 15 | January 1, 2004; 15; 7504 16 | January 1, 2004; 16; 7644 17 | January 1, 2004; 17; 8238 18 | January 1, 2004; 18; 8734 19 | January 1, 2004; 19; 8647 20 | January 1, 2004; 20; 8485 21 | January 1, 2004; 21; 8333 22 | January 1, 2004; 22; 8116 23 | January 1, 2004; 23; 7674 24 | January 1, 2004; 24; 7173 25 | January 2, 2004; 1; 6797 26 | January 2, 2004; 2; 6604 27 | January 2, 2004; 3; 6535 28 | January 2, 2004; 4; 6537 29 | January 2, 2004; 5; 6538 30 | January 2, 2004; 6; 6785 31 | January 2, 2004; 7; 7166 32 | January 2, 2004; 8; 7749 33 | January 2, 2004; 9; 8160 34 | January 2, 2004; 10; 8433 35 | January 2, 2004; 11; 8514 36 | January 2, 2004; 12; 8445 37 | January 2, 2004; 13; 8406 38 | January 2, 2004; 14; 8256 39 | January 2, 2004; 15; 8125 40 | January 2, 2004; 16; 8281 41 | January 2, 2004; 17; 8789 42 | January 2, 2004; 18; 9238 43 | January 2, 2004; 19; 9176 44 | January 2, 2004; 20; 8999 45 | January 2, 2004; 21; 8845 46 | January 2, 2004; 22; 8641 47 | January 2, 2004; 23; 8203 48 | January 2, 2004; 24; 7729 49 | January 3, 2004; 1; 7293 50 | January 3, 2004; 2; 7045 51 | January 3, 2004; 3; 6957 52 | January 3, 2004; 4; 6958 53 | January 3, 2004; 5; 6985 54 | January 3, 2004; 6; 7105 55 | January 3, 2004; 7; 7382 56 | January 3, 2004; 8; 7809 57 | January 3, 2004; 9; 8238 58 | January 3, 2004; 10; 8639 59 | January 3, 2004; 11; 8873 60 | January 3, 2004; 12; 8862 61 | January 3, 2004; 13; 8768 62 | January 3, 2004; 14; 
8617 63 | January 3, 2004; 15; 8550 64 | January 3, 2004; 16; 8625 65 | January 3, 2004; 17; 9178 66 | January 3, 2004; 18; 9758 67 | January 3, 2004; 19; 9677 68 | January 3, 2004; 20; 9437 69 | January 3, 2004; 21; 9241 70 | January 3, 2004; 22; 9041 71 | January 3, 2004; 23; 8700 72 | January 3, 2004; 24; 8207 73 | January 4, 2004; 1; 7814 74 | January 4, 2004; 2; 7597 75 | January 4, 2004; 3; 7482 76 | January 4, 2004; 4; 7474 77 | January 4, 2004; 5; 7559 78 | January 4, 2004; 6; 7682 79 | January 4, 2004; 7; 7872 80 | January 4, 2004; 8; 8245 81 | January 4, 2004; 9; 8608 82 | January 4, 2004; 10; 8823 83 | January 4, 2004; 11; 8905 84 | January 4, 2004; 12; 8865 85 | January 4, 2004; 13; 8762 86 | January 4, 2004; 14; 8657 87 | January 4, 2004; 15; 8622 88 | January 4, 2004; 16; 8762 89 | January 4, 2004; 17; 9428 90 | January 4, 2004; 18; 10030 91 | January 4, 2004; 19; 10024 92 | January 4, 2004; 20; 9841 93 | January 4, 2004; 21; 9627 94 | January 4, 2004; 22; 9367 95 | January 4, 2004; 23; 8777 96 | January 4, 2004; 24; 8297 97 | January 5, 2004; 1; 7938 98 | January 5, 2004; 2; 7798 99 | January 5, 2004; 3; 7786 100 | January 5, 2004; 4; 7816 101 | -------------------------------------------------------------------------------- /sg/data/bchydro/demo_bchydro.py: -------------------------------------------------------------------------------- 1 | # Short demonstration of the utilities to load BCHydro data 2 | import sys 3 | import os 4 | from datetime import timedelta as dt 5 | 6 | import matplotlib.pyplot as plt 7 | 8 | import sg.data.bchydro as bc 9 | 10 | if __name__ == "__main__": 11 | # Option 1: load the entire dataset as a timeseries 12 | timeseries = bc.load() 13 | filtered = [x if x > 10 else 4000 for x in timeseries] 14 | plt.plot(filtered, '-') 15 | plt.title("The entire BC Hydro dataset") 16 | # Option 2: load the using the Dataset class 17 | dataset = bc.Dataset(period=dt(days=30), step_length=dt(days=7)) 18 | plt.figure() 19 | plt.plot(dataset.get_random_period(), '-') 20 | plt.title("A randomly selected 30-day period from the BC Hydro dataset") 21 | plt.show() 22 | -------------------------------------------------------------------------------- /sg/data/bchydro/readme.txt: -------------------------------------------------------------------------------- 1 | Transmission load data downloaded from 2 | 3 | http://transmission.bchydro.com/transmission_system/balancing_authority_load_data/historical_transmission_data.htm 4 | 5 | Original Excel spreadsheets stored in directory 'originals'. 6 | 7 | First sheet of each file exported to csv. 8 | 9 | -------------------------------------------------------------------------------- /sg/data/dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from datetime import timedelta as dt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import copy 7 | 8 | class Dataset(object): 9 | def __init__(self, series, period_length, step_length=None): 10 | """Initialize the dataset with the entire timeseries, and a 11 | datetime.timedelta indicating the length of each period to be 12 | extracted. 13 | 14 | If step_length is provided, this should be a datetime.timedelta that 15 | indicates the step length between each period that may be selected. 
For 16 | instance, a step_length of 1 day indicates that all the selected 17 | periods will start at the same hour of day, even if the dataset has 18 | higher frequency.""" 19 | self._series = series 20 | self._period_length = \ 21 | self._convert_timedelta_to_timeseries(period_length) 22 | if step_length is None: 23 | self._step_length = 1 24 | else: 25 | self._step_length = \ 26 | self._convert_timedelta_to_timeseries(step_length) 27 | self._num_periods = (len(series) - self._period_length + 1) / \ 28 | self._step_length 29 | 30 | def _get_start_and_end_times(self): 31 | """Return the start and end times of the time series. End time is the 32 | start time of the last entry in the series, not the end time, i.e. the 33 | duration of the last timestep is not included.""" 34 | start_time = self._series.first_valid_index() 35 | end_time = self._series.last_valid_index() 36 | if isinstance(start_time, pd.Period): 37 | # start_time and end_time for a Period seem to be equivalent 38 | start_time = start_time.start_time 39 | end_time = end_time.start_time 40 | return (start_time, end_time) 41 | 42 | def _convert_timedelta_to_timeseries(self, period_length): 43 | """Return the length of the period (a timedelta) represented as an 44 | integer, based on the frequency of the dataset.""" 45 | # Calculating this cannot be done using the timeseries frequency, as 46 | # that falls apart when the frequency is undefined. This method should 47 | # work for all frequencies, as long as the time step is constant 48 | # between data points. 49 | start_time, end_time = self._get_start_and_end_times() 50 | dt_series = (end_time - start_time) / (len(self._series) - 1) 51 | if dt_series >= period_length: 52 | return 1 53 | else: 54 | # datetime.timedelta doesn't support division, so count the steps 55 | # incrementally. 56 | dt_acc = dt_series 57 | steps = 1 58 | while dt_acc < period_length: 59 | steps += 1 60 | dt_acc += dt_series 61 | if dt_acc > period_length: 62 | msg = "Could not create dataset, failed to convert time " \ 63 | "period length to a number steps in the time series array. " \ 64 | "The selected period length (%s) is not a multiple of " \ 65 | "the time step of the original data set (%s)." % \ 66 | (period_length, dt_series) 67 | raise RuntimeError(msg) 68 | return steps 69 | 70 | @property 71 | def num_periods(self): 72 | """The number of selectable periods. This is a read-only property.""" 73 | return self._num_periods 74 | 75 | @property 76 | def series(self): 77 | """The entire time series from which dataset periods are selected. This 78 | is a read-only property.""" 79 | return self._series 80 | 81 | def index_of(self, period_number): 82 | """Return index in entire time series of period number 83 | period_number.""" 84 | return period_number * self._step_length 85 | 86 | def get_period(self, period_number): 87 | """Return period number period_number.""" 88 | first = self.index_of(period_number) 89 | last = first + self._period_length 90 | return self._series[first:last] 91 | 92 | def get_random_period(self, return_period_number=False): 93 | """Select a random period of the predefined length. If 94 | return_period_number, return a tuple consisting of a random period and 95 | the period number the selected period. 
Otherwise return only the data.""" 96 | number = np.random.randint(0, self.num_periods) 97 | data = self.get_period(number) 98 | if return_period_number: 99 | return (data, number) 100 | else: 101 | return data 102 | 103 | def split(self, ratio=0.5): 104 | """Splits the current dataset into two datasets defined by the ratio.""" 105 | first = copy.copy(self) 106 | first._series = first._series[:int(len(first._series)*ratio)] 107 | last = copy.copy(self) 108 | last._series = last._series[int(len(last._series)*ratio):] 109 | return first, last 110 | 111 | def remove_outlier_set_previous(dataset, outlier_val=0): 112 | """Set all 'outlier'-valued elements in the dataset to be the value at the 113 | position before. This routine does not copy the dataset before cleaning. 114 | 115 | If there are several consecutive outliers, they will all be set to the 116 | preceding non-outlier value.""" 117 | outliers = np.where(dataset[1:] == outlier_val) 118 | for outlier in outliers[0]: 119 | dataset[outlier + 1] = dataset[outlier] 120 | return dataset 121 | 122 | if __name__ == "__main__": 123 | from unittest import main 124 | main(module="test_" + __file__[:-3]) 125 | -------------------------------------------------------------------------------- /sg/data/dataset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/dataset.pyc -------------------------------------------------------------------------------- /sg/data/demo_dataset.py: -------------------------------------------------------------------------------- 1 | # Code demonstrating the use of Dataset (actually data.bchydro.Dataset). 2 | 3 | from datetime import timedelta as dt 4 | 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | import sg.data.bchydro as bc 9 | import sg.src.spclean as cln 10 | 11 | # 7-day periods, selected with one day overlap (step length 6 days) 12 | duration = 7 13 | step = 6 14 | 15 | # Create the dataset, specifying period and step length as datetime.timedelta 16 | dataset = bc.Dataset(period=dt(days=duration), step_length=dt(days=step)) 17 | 18 | # Plot the first 5 periods sequentially with overlap 19 | for period in (0, 1, 2, 3, 4): 20 | period_start_hrs = period * step * 24 21 | period_end_hrs = period_start_hrs + duration * 24 22 | x = np.arange(period_start_hrs, period_end_hrs) 23 | y = dataset.get_period(period) 24 | plt.plot(x, y) 25 | plt.title("A sequence of 7-day periods selected with 1 day overlap.") 26 | 27 | # Plot the same sequence using the original time series directly 28 | plt.figure() 29 | plt.plot(dataset.series[0:4*step*24+duration*24]) 30 | plt.title("Same data plotted by manually selecting a slice from the time series") 31 | 32 | # Plot a random sequence 33 | plt.figure() 34 | (data, period_number) = dataset.get_random_period(True) 35 | plt.plot(data) 36 | plt.title("Period number %d (randomly selected).\n\nThis period starts at " \ 37 | "index %d in the original time series." 
% \ 38 | (period_number, dataset.index_of(period_number))) 39 | 40 | # Show all figures 41 | plt.show() 42 | -------------------------------------------------------------------------------- /sg/data/eklima/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/__init__.py -------------------------------------------------------------------------------- /sg/data/eklima/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/__init__.pyc -------------------------------------------------------------------------------- /sg/data/eklima/parse_eklima_xml.py: -------------------------------------------------------------------------------- 1 | import xml.etree.cElementTree as et 2 | import sys, pdb 3 | import re 4 | import pandas as pd 5 | from datetime import datetime 6 | import calendar 7 | import sg.utils 8 | 9 | def parse(file): 10 | cal = dict((v,k) for k,v in enumerate(calendar.month_name)) 11 | xml = et.parse(file) 12 | root = xml.getroot() 13 | station_name = root.findall('table/Stnr/Name')[0].text 14 | TS = [] 15 | 16 | for table in root.findall('table'): 17 | if station_name in table.attrib['name']: 18 | month, year = table.attrib['name'].split(station_name)[-1].split() 19 | for date in table.findall('Date'): 20 | try: 21 | day = int(date.attrib['id']) 22 | data = [ float(ele.text) for ele in date.getchildren() if re.search('TA_\d*', ele.tag) ] 23 | hours = [ int(ele.tag.split('_')[-1]) for ele in date.getchildren() if re.search('TA_\d*', ele.tag) ] 24 | dates = [ datetime(year=int(year), month=cal[month], day=day, hour=hour) for hour in hours ] 25 | TS.append(pd.Series(data=data, index=dates)) 26 | except ValueError: 27 | pass 28 | 29 | return pd.concat((TS)) 30 | 31 | if __name__ == "__main__": 32 | sg.utils.plot_time_series([parse(sys.argv[1])], ['-'], ['Dummy station']) 33 | -------------------------------------------------------------------------------- /sg/data/eklima/parse_eklima_xml.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/eklima/parse_eklima_xml.pyc -------------------------------------------------------------------------------- /sg/data/eunite/__init__.py: -------------------------------------------------------------------------------- 1 | from eunite import * 2 | -------------------------------------------------------------------------------- /sg/data/eunite/eunite.py: -------------------------------------------------------------------------------- 1 | """Import EUNITE dataset. Concatenates 1997, 1998 and January 1999 data. The 2 | competition used Jan 1999 as test set.""" 3 | 4 | import os 5 | import sys 6 | import sqlite3 7 | import datetime 8 | import numpy as np 9 | 10 | import pandas as pd 11 | 12 | from sg.globals import SG_DATA_PATH 13 | import sg.data 14 | 15 | PATH_TO_EUNITE_DB = os.path.join(SG_DATA_PATH, "eunite", "eunite.db") 16 | 17 | class Dataset(sg.data.Dataset): 18 | def __init__(self, period, step_length=None): 19 | """Loads the EUNITE time series and sets up for extraction of random 20 | slices of length 'period', 'step_length' apart. 
See class Dataset for 21 | more info.""" 22 | sg.data.Dataset.__init__(self, load(), period, step_length) 23 | 24 | def load(dbpath=PATH_TO_EUNITE_DB): 25 | """Read the load data from the given database. Return a pandas.DataFrame 26 | containing the data.""" 27 | with sqlite3.connect(dbpath, detect_types=sqlite3.PARSE_DECLTYPES| 28 | sqlite3.PARSE_COLNAMES) as conn: 29 | crs = conn.cursor() 30 | sel_stmt = "SELECT Timestamp as 'stamp [timestamp]', "\ 31 | "Deg_C as 'temp [float]', "\ 32 | "MWh as 'load [float]' "\ 33 | "FROM " 34 | crs.execute(sel_stmt + "training" + \ 35 | " UNION " + \ 36 | sel_stmt + "testing" + \ 37 | " ORDER BY Timestamp ASC") 38 | stamps, temps, loads = zip(*crs.fetchall()) 39 | return pd.DataFrame({'Temperature' : np.array(temps, dtype=float), 40 | 'Load' : np.array(loads, dtype=float)}, 41 | index=stamps) 42 | 43 | if __name__ == '__main__': 44 | from unittest import main 45 | main(module='test_'+__file__[:-3]) 46 | -------------------------------------------------------------------------------- /sg/data/eunite/import_csv_to_sqlite.py: -------------------------------------------------------------------------------- 1 | """Import load, temperature and holiday data from csv files into sqlite. 2 | 3 | May not work without modification, after separating data from code. The code in 4 | this file assumes the csv files are in the working directory of the 5 | interpreter. 6 | 7 | """ 8 | 9 | import csv 10 | import datetime 11 | import os 12 | import sqlite3 13 | 14 | from sg.data.eunite import PATH_TO_EUNITE_DB 15 | 16 | def import_data(load_path, temp_path, cursor, table_name): 17 | load_reader = csv.reader(open(load_path), delimiter=';') 18 | temp_reader = csv.reader(open(temp_path), delimiter=';') 19 | loads = [l for l in load_reader] 20 | temperatures = [t for t in temp_reader] 21 | assert(len(temperatures) == len(loads)) 22 | for (temp, load) in zip(temperatures, loads): 23 | load = [int(l) for l in load] 24 | ldate = datetime.datetime(year=load[0], month=load[1], day=load[2]) 25 | tdate = datetime.datetime.strptime(temp[0], "%Y-%m-%d") 26 | assert(ldate == tdate) 27 | deg_c = float(temp[1]) 28 | for half_hour in range(len(load)-3): 29 | stamp = ldate + datetime.timedelta(hours=float(half_hour) / 2) 30 | cursor.execute("INSERT INTO %s VALUES (?, ?, ?)" % table_name, 31 | (stamp, deg_c, load[half_hour + 3])) 32 | 33 | def import_holidays(cursor): 34 | with open("holidays.csv") as f: 35 | for l in f: 36 | date = datetime.datetime.strptime(l[:-1], "%Y-%m-%d") 37 | cursor.execute('INSERT INTO holidays VALUES (?)', (date,)) 38 | 39 | def _reformat_date_jan_1999(): 40 | """Run this function only once, to transform the date format of 41 | temperature_1999.csv into ISO.""" 42 | reader = csv.reader(open("temperatures_1999.csv"), delimiter=";") 43 | for (day, month, temp) in reader: 44 | date = datetime.datetime.strptime("-".join(["1999", month, day]), 45 | "%Y-%m-%d") 46 | print "%s; %s" % (date.strftime("%Y-%m-%d"), temp) 47 | 48 | def clear_db(cursor): 49 | try: 50 | cursor.execute("DROP TABLE training") 51 | except: 52 | pass 53 | try: 54 | cursor.execute("DROP TABLE testing") 55 | except: 56 | pass 57 | try: 58 | cursor.execute("DROP TABLE holidays") 59 | except: 60 | pass 61 | 62 | def setup_db(cursor): 63 | cursor.execute('CREATE TABLE holidays ' \ 64 | '("Timestamp" datetime unique not null primary key)') 65 | for table in ("training", "testing"): 66 | cursor.execute('CREATE TABLE %s ' \ 67 | '("Timestamp" datetime unique not null primary key, ' \ 68 | '"Deg_C" float, 
"MWh" float)' % table) 69 | 70 | if __name__ == "__main__": 71 | with sqlite3.connect(PATH_TO_EUNITE_DB, 72 | detect_types=sqlite3.PARSE_DECLTYPES) as conn: 73 | cursor = conn.cursor() 74 | clear_db(cursor) 75 | setup_db(cursor) 76 | import_holidays(cursor) 77 | import_data("loads.csv", "temperatures.csv", cursor, "training") 78 | import_data("loads_1999.csv", "temperatures_1999.csv", cursor, 79 | "testing") 80 | -------------------------------------------------------------------------------- /sg/data/eunite/readme.txt: -------------------------------------------------------------------------------- 1 | The data in this folder are those used in the EUNITE 2001 load forecasting competition. These have subsequently also been applied by other forecasting studies (e.g. T. Rashid and T. Kechadi, A Practical Approach for Electricity Load Forecasting, World Academy of Science, Engineering and Technology 5 2005). In the competition, the 1997 and 1998 data were used as training sets, while the competition used data from January 1999. 2 | 3 | See web page for more info: 4 | http://neuron.tuke.sk/competition/index.php 5 | 6 | * Web page of competition winners: 7 | http://www.csie.ntu.edu.tw/~cjlin/papers.html 8 | * Chang, Chen & Lin. EUNITE Network Competition: Electricity Load Forecasting: 9 | http://www.csie.ntu.edu.tw/~cjlin/papers/euniteelf.ps.gz 10 | Also saved in this directory as winner_model_article.pdf. 11 | 12 | The data were preprocessed as follows: 13 | 14 | * In Excel, all dates were formatted as ISO (YYYY-MM-DD). 15 | 16 | * All line endings were changed using mac2unix. 17 | 18 | * The data in Holidays.xls was manually transformed so all dates were in a row. 19 | 20 | * Temperature data for 1997 and 1998 (from "competition" folder) were manually opened in Excel, the two years were concatenated, the date format was set to ISO YYYY-MM-DD, and the file was saved as temperatures.csv. 21 | 22 | * temperature.csv and temperature_1999.csv were modified replacing ',' with '.' as decimal separator. 23 | 24 | * Load data for 1997 and 1998 (from "competition" folder) were manually concatenated and saves as loads.csv. 25 | 26 | * Dates for temperature 1999 were reformatted using the function _reformat_date_jan_1999 in import_csv_to_sqlite.py. 
27 | 28 | -------------------------------------------------------------------------------- /sg/data/eunite/test_eunite.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import timedelta as dt 3 | 4 | import sg.utils.testutils as testutils 5 | from sg.data.eunite.eunite import * 6 | 7 | class TestEuniteDataset(testutils.ArrayTestCase): 8 | def setUp(self): 9 | self.data = Dataset(period=dt(days=2), step_length=dt(days=1)) 10 | 11 | def _test_two_days_correct(self, period, temps, loads): 12 | self.assertEqual(len(period), 2 * 48) 13 | temps = [temps[0] for i in range(48)] + [temps[1] for i in range(48)] 14 | fasit = np.array([[t, l] for (t, l) in zip(temps, loads)]) 15 | self.assertArraysEqual(period.data, fasit) 16 | 17 | def test_first_correct(self): 18 | day_1_to_2 = self.data.get_period(0) 19 | temps = [-7.6, -6.3] 20 | loads = [797, 794, 784, 787, 763, 749, 745, 730, 707, 706, 720, 657, 21 | 633, 595, 560, 540, 519, 601, 631, 621, 640, 643, 654, 653, 22 | 688, 688, 690, 690, 684, 679, 674, 677, 644, 660, 654, 683, 23 | 688, 698, 719, 733, 700, 671, 692, 685, 717, 694, 692, 686, 24 | 704, 697, 704, 676, 664, 668, 668, 662, 665, 666, 703, 677, 25 | 669, 660, 650, 672, 648, 682, 692, 724, 727, 739, 739, 733, 26 | 741, 754, 767, 768, 738, 734, 747, 733, 751, 746, 737, 750, 27 | 759, 776, 777, 777, 746, 724, 697, 708, 745, 705, 702, 722] 28 | self._test_two_days_correct(day_1_to_2, temps, loads) 29 | 30 | def test_feb17_18_1998_correct(self): 31 | days = self.data.get_period(365 + 31 + 16) 32 | temps = [4.1, 1.8] 33 | loads = [655, 621, 612, 611, 602, 621, 598, 608, 601, 595, 602, 632, 34 | 662, 699, 715, 671, 685, 723, 745, 711, 725, 734, 690, 708, 35 | 721, 729, 726, 695, 717, 725, 697, 681, 710, 678, 746, 744, 36 | 749, 770, 761, 759, 734, 715, 675, 658, 647, 686, 656, 671, 37 | 702, 698, 672, 659, 665, 655, 630, 637, 633, 672, 674, 715, 38 | 708, 747, 709, 711, 725, 719, 738, 742, 725, 729, 707, 715, 39 | 738, 746, 750, 712, 728, 709, 709, 698, 711, 720, 734, 751, 40 | 759, 782, 760, 773, 729, 707, 647, 660, 659, 643, 648, 658] 41 | self._test_two_days_correct(days, temps, loads) 42 | 43 | def test_last_correct(self): 44 | last_2_days = self.data.get_period(2 * 365 + 31 - 2) 45 | temps = [-7.8, -6.0] 46 | loads = [716, 714, 697, 686, 680, 686, 641, 658, 658, 645, 673, 640, 47 | 630, 604, 615, 628, 634, 660, 699, 696, 702, 732, 726, 717, 48 | 740, 753, 749, 734, 743, 718, 705, 708, 711, 727, 736, 747, 49 | 744, 740, 751, 763, 741, 714, 698, 701, 710, 697, 687, 703, 50 | 712, 720, 694, 698, 679, 648, 665, 656, 677, 651, 623, 604, 51 | 595, 578, 576, 598, 620, 644, 691, 666, 691, 700, 717, 700, 52 | 694, 714, 724, 702, 696, 691, 682, 677, 677, 688, 687, 713, 53 | 708, 735, 734, 743, 711, 717, 702, 698, 694, 691, 691, 704] 54 | self._test_two_days_correct(last_2_days, temps, loads) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /sg/data/sintef/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CXX = g++ 3 | 4 | OPTS = -O3 5 | 6 | .SUFFIXES: 7 | 8 | clean: 9 | rm -f *~ *.o a.out 10 | 11 | %.o: %.cpp force 12 | $(CXX) $(OPTS) $(INCLUDE) -c $< 13 | 14 | %: %.cpp force 15 | $(CXX) $(OPTS) -o $(subst _,-,$@) $< 16 | force: ; 17 | -------------------------------------------------------------------------------- /sg/data/sintef/__init__.py: 
-------------------------------------------------------------------------------- 1 | from userloads import * 2 | -------------------------------------------------------------------------------- /sg/data/sintef/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/__init__.pyc -------------------------------------------------------------------------------- /sg/data/sintef/convert_scikits_to_pandas_hdf5.py: -------------------------------------------------------------------------------- 1 | """Convert SINTEF load data in HDF5 files from scikits.timeseries to 2 | pandas.DataFrames.""" 3 | 4 | import datetime 5 | 6 | import tables as h5 7 | import pandas as pd 8 | import scikits.timeseries.lib.tstables 9 | import scikits.timeseries as ts 10 | 11 | from preprocess_gs2 import PandasH5Storer 12 | 13 | class Converter(PandasH5Storer): 14 | def __init__(self, path_ts_in, path_pd_out): 15 | PandasH5Storer.__init__(self, path_pd_out) 16 | self._h5file_ts = h5.openFile(path_ts_in, "r") 17 | 18 | def __del__(self): 19 | PandasH5Storer.__del__(self) 20 | self._h5file_ts.close() 21 | 22 | def _load_ts_user(self, user_id): 23 | return self._h5file_ts.getNode("/loads/id_" + str(user_id)).read() 24 | 25 | def _convert_dates(self, series_ts): 26 | return [date.datetime for date in series_ts.dates] 27 | 28 | def _make_pd_series_from_scikits_series(self, series_ts): 29 | data = {'Load' : series_ts[:,0], 30 | 'Status Code' : series_ts[:,1]} 31 | dates = self._convert_dates(series_ts) 32 | return pd.DataFrame(data, index=dates) 33 | 34 | def _convert_user_id_lists(self): 35 | """The list of experiment users was stored in the original file. This 36 | must be carried over as a Series in the Pandas file.""" 37 | user_ids = self._h5file_ts.root.loads.cln_pred_exp_ids.read() 38 | self.store_list('user_ids_cln_pred_exp', user_ids) 39 | user_ids = self._h5file_ts.root.loads.user_ids.read() 40 | self.store_list('user_ids', user_ids) 41 | 42 | def _convert_users(self): 43 | user_ids = self._h5file_ts.root.loads.user_ids.read() 44 | for user_id in user_ids: 45 | series_ts = self._load_ts_user(user_id) 46 | series_pd = self._make_pd_series_from_scikits_series(series_ts) 47 | self.store_pd_user(user_id, series_pd) 48 | 49 | def convert(self): 50 | self._convert_user_id_lists() 51 | self._convert_users() 52 | 53 | def _get_targets_from_base_paths(paths): 54 | from os.path import split, join 55 | targets = [] 56 | for path in paths: 57 | dir, base = split(path) 58 | targets.append(join(dir, "pandas_" + base)) 59 | return targets 60 | 61 | def _get_sintef_paths(): 62 | import userloads as ul 63 | bases = (ul.DATA_WITH_DUPES_PATH, ul.DATA_WITHOUT_DUPES_PATH) 64 | targets = _get_targets_from_base_paths(bases) 65 | return zip(bases, targets) 66 | 67 | def convert_sintef_files(interactive=False): 68 | paths = _get_sintef_paths() 69 | print "This script will convert scikits.timeseries to pandas in the " \ 70 | "following files:" 71 | for (path_ts, path_pd) in paths: 72 | print "\n\t%s\nto\n\t%s" % (path_ts, path_pd) 73 | while True: 74 | response = raw_input("\nContinue (y/n)? ") 75 | if response == 'y': 76 | break 77 | elif response == 'n': 78 | return 79 | for (path_ts, path_pd) in paths: 80 | print "Converting %s to %s." % (path_ts, path_pd) 81 | Converter(path_ts, path_pd).convert() 82 | print "Done." 
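# A stand-alone illustration of the per-series conversion performed by
# Converter above: given one two-column scikits.timeseries series (load in
# column 0, status code in column 1), build the equivalent pandas DataFrame.
# The function is illustrative only and is not called anywhere; it relies on
# the pandas import at the top of this module.
def convert_one_series_example(series_ts):
    """Return a DataFrame with 'Load' and 'Status Code' columns."""
    dates = [date.datetime for date in series_ts.dates]
    return pd.DataFrame({'Load': series_ts[:, 0],
                         'Status Code': series_ts[:, 1]},
                        index=dates)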
83 | 84 | if __name__ == "__main__": 85 | convert_sintef_files(interactive=True) 86 | 87 | 88 | -------------------------------------------------------------------------------- /sg/data/sintef/create_full_temp_data.py: -------------------------------------------------------------------------------- 1 | """6.5% of the Porsgrunn temperature readings from the SINTEF files 2 | are missing. They are concatenated with eklima.met.no data from 3 | Gvarv-Nes, and interpolated. Furthermore, two periods have obvious 4 | erroneous data readings, look up the periods 2004-11-11 14:00 -> 5 | 2004-11-22 23:00 and 2005-02-08 08:00 -> 2005-02-27 23:00. These two 6 | periods are replaced with data from eklima. The final stage is 7 | interpolation, so the dataset has hourly readings (eklima only reads 8 | data 4 times a day). Note: the following command must be issued 9 | beforehand, since it stores all the timeseries in a file that is loaded. 10 | 11 | ./gs2-grep.sh -l Grader | python plot_temp.py 12 | """ 13 | 14 | import os 15 | 16 | import numpy.ma as ma 17 | import numpy as np 18 | import pandas as pd 19 | 20 | import sg.data.eklima.parse_eklima_xml as xml 21 | import sg.utils 22 | from sg.globals import SG_DATA_PATH 23 | 24 | _TEMP_DATA = os.path.join(SG_DATA_PATH, "eklima", "Telemark", 25 | "Gvarv-Nes2004-2006.xml") 26 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | def data(): 29 | temp = pd.read_pickle(os.path.join(_PATH_TO_HERE, 'temp_data.pickle')) 30 | temp = temp.sort_index().asfreq("H") 31 | # Extended periods with failed readings, replace with Gvarv 32 | temp['2004-11-11 14:00':'2004-11-21 23:00'] = np.nan 33 | temp['2005-02-08 08:00':'2005-02-27 23:00'] = np.nan 34 | # Shorter periods with failed readings, that we may leave to the cleansing 35 | # to take care of? 36 | # temp['2005-09-07 08:00':'2005-09-08 04:00'] = np.nan 37 | # temp['2006-02-28 05:00':'2006-02-28 04:00'] = np.nan 38 | # temp['2006-06-17 11:00':'2006-06-18 08:00'] = np.nan 39 | # temp['2006-12-19 06:00':'2006-12-21 03:00'] = np.nan 40 | gvarv = xml.parse(_TEMP_DATA)[temp.index[0]:].asfreq("H") 41 | gvarv_aligned = temp.align(gvarv, join="left")[1] 42 | # np.where returned a Pandas Timeseries with old Numpy, but now 43 | # returns an ndarray. Therefore we need to reassign to temp. 44 | temp[:] = np.where(np.isnan(temp), gvarv_aligned, temp) 45 | temp = temp.interpolate() 46 | temp.name = "Temperature" 47 | # Interpolate away a couple of outliers and zero-recordings, or leave to 48 | # cleansing? 
49 | # temp['2004-11-29 08:00'] = np.nan 50 | # temp['2005-11-30 00:00':'2005-11-30 02:00'] = np.nan 51 | # temp['2006-10-27 09:00'] = np.nan 52 | # temp = temp.interpolate() 53 | return temp 54 | 55 | if __name__ == "__main__": 56 | data = data() 57 | sg.utils.plot_time_series([data], ['b.'], ['Porsgrunn + Gvarv temperature']) 58 | -------------------------------------------------------------------------------- /sg/data/sintef/create_full_temp_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/create_full_temp_data.pyc -------------------------------------------------------------------------------- /sg/data/sintef/data_for_eirik.py: -------------------------------------------------------------------------------- 1 | import sg.data.sintef.userloads as ul 2 | import sys 3 | 4 | tf = ul.tempfeeder_nodup() 5 | user_ids = tf.user_ids 6 | for user in user_ids: 7 | loads = tf[user][:,0] 8 | idx = 0 9 | while loads.dates[idx].hour != 0: 10 | idx += 1 11 | while idx < len(loads) - 48: 12 | sys.stdout.write("%d %s False " % (user, loads.dates[idx].strftime("%Y-%M-%d"))) 13 | for i in range(24): 14 | sys.stdout.write("%f " % loads[idx]) 15 | idx += 1 16 | print "" 17 | -------------------------------------------------------------------------------- /sg/data/sintef/find_binary.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************** 2 | * find_binary.cpp 3 | * Created on Tue Feb 07 2012 by Boye A. Hoeverstad. 4 | * 5 | * Given a list of files, as arguments on the command line and/or as input to 6 | * stdin, classify the files as binary or text. 7 | *******************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | 19 | string program_name; 20 | 21 | bool verbose = false; 22 | vector files; 23 | const int chunk_size = 1024*1024*500; 24 | vector buffer(chunk_size); 25 | set characters; 26 | 27 | void 28 | setup_text_character_set() 29 | { 30 | characters.insert(0x0a); // LF 31 | characters.insert(0x0d); // CR 32 | characters.insert(0xe6); // ae 33 | characters.insert(0xf8); // oe 34 | characters.insert(0xe5); // aa 35 | characters.insert(0xc6); // AE 36 | characters.insert(0xd8); // OE 37 | characters.insert(0xc5); // AA 38 | } 39 | 40 | bool 41 | is_binary(istream &stream, string path) 42 | { 43 | stream.read(&buffer[0], chunk_size); 44 | int num_read = stream.gcount(); 45 | for (int n = 0; n < num_read; n++) 46 | { 47 | unsigned char c = static_cast(buffer[n]); 48 | if ((c < 32 || c > 127) && characters.find(c) == characters.end()) 49 | { 50 | if (verbose) 51 | { 52 | cout << "Binary character: " << ios::hex << static_cast(c) 53 | << " at position " << n << " (probably) of file " 54 | << path << ". Context:\n"; 55 | copy(&buffer[max(n-10, 0)], &buffer[min(n+10, num_read)], ostream_iterator(cout, "")); 56 | cout << "\n" << flush; 57 | } 58 | return true; 59 | } 60 | } 61 | return false; 62 | } 63 | 64 | void 65 | exit_with_usage() 66 | { 67 | cerr << "Usage: " << program_name << " inputfile [more inputfiles]\n" 68 | << "Get input files from command line and/or standard input. 
" 69 | << "Output an indication of which files are binary and which are text.\n"; 70 | exit(1); 71 | } 72 | 73 | void 74 | parse_cmdline_arguments(int argc, char *argv[]) 75 | { 76 | if (argc == 2 && (!strcmp("-?", argv[1]) || !strcmp("--help", argv[1]))) 77 | exit_with_usage(); 78 | int next_arg = 1; 79 | if (argc >= 2 && (!strcmp("-v", argv[1]))) 80 | { 81 | next_arg++; 82 | verbose = true; 83 | } 84 | for (; next_arg < argc; next_arg++) 85 | files.push_back(argv[next_arg]); 86 | } 87 | 88 | void 89 | get_stdin_arguments() 90 | { 91 | if (isatty(fileno(stdin))) 92 | return; 93 | string path; 94 | while (getline(cin, path)) 95 | files.push_back(path); 96 | } 97 | 98 | void 99 | get_input_files(int argc, char *argv[]) 100 | { 101 | parse_cmdline_arguments(argc, argv); 102 | get_stdin_arguments(); 103 | if (files.size() == 0) 104 | exit_with_usage(); 105 | } 106 | 107 | int 108 | main(int argc, char *argv[]) 109 | { 110 | program_name = argv[0]; 111 | setup_text_character_set(); 112 | get_input_files(argc, argv); 113 | 114 | set binary_files, text_files; 115 | 116 | for (vector::const_iterator fit = files.begin(); fit != files.end(); fit++) 117 | { 118 | ifstream file(fit->c_str(), ios::binary); 119 | if (!file) 120 | { 121 | cerr << "Failed to open " << *fit << "!\n"; 122 | return 1; 123 | } 124 | cout << "Checking file " << *fit << "...\n" << flush; 125 | bool binary = false; 126 | while (!file.eof()) 127 | if (is_binary(file, *fit)) 128 | binary = true; 129 | if (binary) 130 | binary_files.insert(*fit); 131 | else 132 | text_files.insert(*fit); 133 | } 134 | 135 | cout << "Done.\n\nText files:\n"; 136 | copy(text_files.begin(), text_files.end(), ostream_iterator(cout, "\n")); 137 | cout << "\nBinary files:\n"; 138 | copy(binary_files.begin(), binary_files.end(), ostream_iterator(cout, "\n")); 139 | return 0; 140 | } 141 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2-do.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform a command on each file in gs2.txt 4 | # Example usage: 5 | # ./gs2-do.sh sed -n -e'/Istad Nett/p' 6 | cat gs2.txt | while read path; do "$@" "$path"; done 7 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2-grep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform a grep on each file in gs2.txt 4 | 5 | cat gs2.txt | while read path; do grep $@ "$path"; done 6 | -------------------------------------------------------------------------------- /sg/data/sintef/gs2_short.txt: -------------------------------------------------------------------------------- 1 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/133_4_2_2006010200_2006100123_20061129202638000_154753.exp 2 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/134_4_2_2006010200_2006100123_20061129132145000_154749.exp 3 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006010200_2006010900_20060110203859000_144086.exp 4 | /Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006010900_2006011600_20060117153217000_144312.exp 5 | 
/Users/tidemann/Documents/NTNU/devel/src/sg/data/sintef/../../../../data/sintef/raw//Buskerud/Innlest/133_4_2_2006011600_2006012300_20060124154601000_144524.exp 6 | -------------------------------------------------------------------------------- /sg/data/sintef/make-list-of-gs2-files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -n "Creating a list of all the GS2 files and storing it in ./gs2.txt.. " 4 | find "`pwd`/../../../../data/sintef/raw" -iname '*.exp' -or -iname '*.gs2' >gs2.txt 5 | echo "Done." 6 | 7 | echo -n "Creating a list of a few small GS2 files and storing it in ./gs2_short.txt.. " 8 | find "`pwd`/../../../../data/sintef/raw" -iname '*.exp' -or -iname '*.gs2' -size -3MB |head -n 5 >gs2_short.txt 9 | echo "Done." 10 | -------------------------------------------------------------------------------- /sg/data/sintef/map_EIA_to_anonymous.py: -------------------------------------------------------------------------------- 1 | """This script was written to create a mapping from actual installation IDs to 2 | the anonymized ones, after the anonymization process has been performed. The 3 | motivation for this is that the anonymous IDs have already been used 4 | extensively, but for Buskerud we need to select readings based on actual IDs, 5 | in order to build a load profile from all meters below a certain substation or 6 | other grid connection point. 7 | 8 | The script expects the output of a 'diff' between original and anonymized files 9 | (in that order!) as input. If the files differ in any other way than in IDs, 10 | the script will fail by design.""" 11 | 12 | # A diff output in "normal format" consists of hunks of differences. Each hunk 13 | # has four parts: the change command, the text from the left input file, a 14 | # separator, and the text from the right input file (ref 15 | # http://www.chemie.fu-berlin.de/chemnet/use/info/diff/diff_3.html). 16 | 17 | # Usage: 18 | # cat gs2_buskerud.txt| while read line; do diff "$line" "`echo $line | sed -e's*/unanom**'`"; done |python map_EIA_to_anonymous.py 19 | # 20 | # Note that "for line in $(c or 40 | c. Returns the number of lines changed, or 0 41 | on end of file.""" 42 | line = self._stream.readline() 43 | if len(line) == 0: 44 | return 0 45 | self._lineno += 1 46 | left, right = line[:-1].split('c') 47 | if left != right: 48 | raise RuntimeError("Error in change command, line mismatch between files: %d vs %d." % (left, right)) 49 | try: 50 | (d1_1, d1_2) = [int(d) for d in left.split(',')] 51 | (d2_1, d2_2) = [int(d) for d in right.split(',')] 52 | return d1_2 - d1_1 + 1 53 | except: 54 | pass 55 | try: 56 | d1, d2 = (int(left), int(right)) 57 | return 1 58 | except: 59 | raise RuntimeError("Failed to parse change command.") 60 | 61 | def _next_line(self, desc): 62 | line = self._stream.readline() 63 | if len(line) == 0: 64 | raise RuntimeError("Error while parsing %s: Unexpected end of file." % desc) 65 | self._lineno += 1 66 | return line[:-1] 67 | 68 | def _parse_contents(self, prefix): 69 | line = self._next_line("diff contents") 70 | if len(prefix) > len(line) or line[:len(prefix)] != prefix: 71 | raise RuntimeError("Error while parsing diff contents: Expected '%s', got '%s'." \ 72 | % (prefix, line[:len(prefix)])) 73 | (tag, value) = line[len(prefix):].split(self._id_sep) 74 | if not tag in self._ids: 75 | raise RuntimeError("Error while parsing diff contents: Not a recognized identifier: '%s'." 
% tag) 76 | return value 77 | 78 | def _parse_separator(self): 79 | line = self._next_line("separator") 80 | sep = "---" 81 | if sep != line: 82 | raise RuntimeError("Error while parsing separator: Expected '%s', got '%s'." % (sep, line)) 83 | 84 | def _parse_hunk(self): 85 | num_changes = self._parse_change_command() 86 | if num_changes == 0: 87 | return False 88 | keys, values = [], [] 89 | for _ in range(num_changes): 90 | keys.append(self._parse_contents("< ")) 91 | self._parse_separator() 92 | for key in keys: 93 | self._map[key] = self._parse_contents("> ") 94 | return True 95 | 96 | def parse(self, input_stream): 97 | self._reset(input_stream) 98 | try: 99 | while self._parse_hunk(): 100 | pass 101 | except Exception as e: 102 | print >>sys.stderr, "Error while parsing, probably on line %d." % self._lineno 103 | print >>sys.stderr, "Exception message: " 104 | print >>sys.stderr, e 105 | return 106 | return self._map 107 | 108 | def _interactive(pickle_path): 109 | if ask_user("Input parsed. Save map to %s" % pickle_path, None): 110 | import cPickle as pickle 111 | with open(pickle_path, "wb") as f: 112 | pickle.save(f, id_map) 113 | print "Done." 114 | else: 115 | for (key, value) in id_map.iteritems(): 116 | print key, ":", value 117 | 118 | def _main(): 119 | id_map = DiffParser().parse(sys.stdin) 120 | if id_map is None: 121 | print >>sys.stderr, "Parsing failed." 122 | else: 123 | pickle_path = "id_map.pickle" 124 | import cPickle as pickle 125 | with open(pickle_path, "wb") as f: 126 | pickle.dump(id_map, f) 127 | print "Done, mapping saved to", pickle_path 128 | 129 | if __name__ == "__main__": 130 | _main() 131 | -------------------------------------------------------------------------------- /sg/data/sintef/parse_gs2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | 5 | from sg.utils.timer import SimpleTimer 6 | 7 | _KEYVAL_SEPARATOR = "=" 8 | _VALUE_OPEN = "<" 9 | _VALUE_CLOSE = ">" 10 | _ENTRY_INDICATOR = "#" 11 | _HEADING_INDICATOR = "##" 12 | 13 | def _is_element_type(line, element_indicator): 14 | return len(line) >= len(element_indicator) and \ 15 | line[:len(element_indicator)] == element_indicator 16 | 17 | def _is_entry(line): 18 | return _is_element_type(line, _ENTRY_INDICATOR) 19 | 20 | def _is_heading(line): 21 | return _is_element_type(line, _HEADING_INDICATOR) 22 | 23 | def _make_section(heading_line): 24 | return (heading_line[2:-1], dict()) 25 | 26 | def _close_value(stream): 27 | line = "" 28 | for next_line in stream: 29 | line += next_line 30 | if _VALUE_CLOSE in next_line: 31 | return line 32 | 33 | def _find_value(line, stream): 34 | if _VALUE_OPEN in line: 35 | if not _VALUE_CLOSE in line: 36 | line += _close_value(stream) 37 | contents = line[line.index(_VALUE_OPEN) + 1:line.index(_VALUE_CLOSE)] 38 | return contents.split() 39 | return [line[:-1]] 40 | 41 | def _split_line(line, stream): 42 | try: 43 | separator_idx = line.index(_KEYVAL_SEPARATOR) 44 | key = line[1:separator_idx] 45 | value = _find_value(line[separator_idx + 1:], stream) 46 | return (key, value) 47 | except ValueError: 48 | raise ValueError("Error while splitting a line into key and value " 49 | "constituents! 
Key/value separator sign (" + 50 | _KEYVAL_SEPARATOR + ") not found in input line:\n\t" + 51 | line) 52 | 53 | def _add_key_value(section, line, stream): 54 | key, value = _split_line(line, stream) 55 | contents = section[1] 56 | if key in contents: 57 | raise ValueError("Duplicate key '" + key + 58 | "' in section '" + section[0]) 59 | contents[key] = value 60 | 61 | def _find_first_heading(stream): 62 | for line in stream: 63 | if _is_heading(line): 64 | return line 65 | 66 | def section_generator(stream): 67 | """Iterator generator. Each call to the generated iterator will return the 68 | next section of a GS2 file. A section is represented as a two-element 69 | tuple, where the first element is the section heading and the second 70 | element is a dictionary. The dictionary holds the entries as key/value 71 | pairs. Each value is a list.""" 72 | line = _find_first_heading(stream) 73 | if line is None: 74 | return 75 | section = _make_section(line) 76 | for line in stream: 77 | if _is_heading(line): 78 | yield section 79 | section = _make_section(line) 80 | elif _is_entry(line): 81 | _add_key_value(section, line, stream) 82 | yield section 83 | 84 | def parse_file(path): 85 | """Parse a GS2 file and return a list where each element is a section in 86 | the GS2 file. See section_generator for info on the format of each 87 | section.""" 88 | with open(path, "r") as f: 89 | return [section for section in section_generator(f)] 90 | 91 | def parse_all_generator(pathfile): 92 | """Given a file containing a list of GS2 files, parse all the GS2 files one 93 | after the other.""" 94 | with open(pathfile) as paths: 95 | for path in paths: 96 | path = path[:-1] 97 | yield (path, parse_file(path)) 98 | 99 | 100 | if __name__ == '__main__': 101 | from unittest import main 102 | main(module='test_'+__file__[:-3]) 103 | -------------------------------------------------------------------------------- /sg/data/sintef/parse_gs2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/parse_gs2.pyc -------------------------------------------------------------------------------- /sg/data/sintef/plot_temp.py: -------------------------------------------------------------------------------- 1 | """Takes as input a list of gs2-files that has temperature data, reads 2 | them and plots them. 3 | Example: grep -l Grader *.exp | python path/to/plot_temp.py """ 4 | 5 | import string 6 | import sys 7 | import datetime 8 | 9 | import pandas as pd 10 | 11 | import sg.utils 12 | import sg.data.sintef.parse_gs2 as parse 13 | 14 | def collect_and_plot(files): 15 | TS = [] 16 | location = [] 17 | for f in files: 18 | temperatures = [ section[1] for section in parse.parse_file(f)[1:-1] if section[1]['Plant'] == ['tmp'] ] 19 | for t in temperatures: 20 | if t['Step'][0] != '0000-00-00.01:00:00': 21 | print 'Not hourly readings of temperature. Abort.' 22 | break 23 | start_time = datetime.datetime.strptime(t['Start'][0], "%Y-%m-%d.%H:%M:%S") 24 | dates = pd.date_range(start=start_time, periods=len(t['Value']), 25 | freq='H') 26 | data = [ float(value.rsplit('/')[0]) for value in t['Value'] ] 27 | TS.append(pd.Series(data=data, index=dates)) 28 | if location and t['Installation'][0] != location: 29 | print 'Location changed during reading of gs2 files. Probably some bad grouping of gs2 files.' 
30 | location = t['Installation'][0] 31 | if TS: 32 | all_series = pd.concat(TS).sort_index() 33 | all_series_no_duplicates = all_series.groupby(level=0).first() 34 | all_series_no_duplicates.dump('temp_data.pickle') 35 | sg.utils.plot_time_series([all_series_no_duplicates], ['b-'], [location]) 36 | else: 37 | print 'No temperature data.' 38 | 39 | if __name__ == "__main__": 40 | if not sys.stdin.isatty(): 41 | collect_and_plot([ s.rstrip('\n') for s in sys.stdin.readlines() ]) 42 | 43 | -------------------------------------------------------------------------------- /sg/data/sintef/plot_temp_misc.py: -------------------------------------------------------------------------------- 1 | """Takes as input a list of gs2-files that has temperature data, reads 2 | them and plots them. 3 | Example: grep -l Grader *.exp | python path/to/plot_temp.py """ 4 | 5 | import string 6 | import matplotlib.pyplot as plt 7 | import sg.data.sintef.parse_gs2 as parse 8 | import sys, os 9 | import scikits.timeseries as ts 10 | import scikits.timeseries.lib.plotlib as tpl 11 | import sg.utils 12 | import sg.data.eklima.parse_eklima_xml as xml 13 | 14 | def _collect_and_plot(files): 15 | TS = [] 16 | location = [] 17 | for f in files: 18 | temperatures = [ section[1] for section in parse.parse_file(f)[1:-1] if section[1]['Plant'] == ['tmp'] ] 19 | for t in temperatures: 20 | if t['Step'][0] != '0000-00-00.01:00:00': 21 | print 'Not hourly readings of temperature. Abort.' 22 | break 23 | dates = ts.date_array(start_date=ts.Date('H', t['Start'][0]), length=len(t['Value'])) 24 | data = [ float(value.rsplit('/')[0]) for value in t['Value'] ] 25 | TS.append(ts.TimeSeries(data=data, dates=dates)) 26 | if location and t['Installation'][0] != location: 27 | print 'Location changed during reading of gs2 files. Probably some bad grouping of gs2 files.' 28 | location = t['Installation'][0] 29 | if TS: 30 | path = '/Users/tidemann/Documents/NTNU/devel/data/eklima/Telemark/' 31 | for file in os.listdir(path): 32 | try: 33 | series = xml.parse(path + file) 34 | sg.utils.plot_time_series([ts.concatenate((TS)), series], ['b-','r-'], [location, file]) 35 | except: 36 | print file, 'had no data.' 37 | else: 38 | print 'No temperature data.' 
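# A pandas-based sketch of the same GS2 handling (mirrors plot_temp.py rather
# than the scikits.timeseries code above): turn every Time-series section of a
# GS2 file into a pandas Series. It assumes hourly steps
# (Step=0000-00-00.01:00:00) and 'value//status' readings, as in the sample
# files in this directory; gs2_time_series is an illustrative name only, and
# 'parse' is the parse_gs2 module imported at the top of this file.
import datetime

import pandas as pd

def gs2_time_series(path):
    """Return one pandas Series per Time-series section in a GS2 file."""
    series = []
    for heading, entries in parse.parse_file(path):
        if heading != "Time-series":
            continue
        start = datetime.datetime.strptime(entries['Start'][0],
                                           "%Y-%m-%d.%H:%M:%S")
        index = pd.date_range(start=start, periods=len(entries['Value']),
                              freq='H')
        values = [float(v.rsplit('/')[0]) for v in entries['Value']]
        series.append(pd.Series(values, index=index))
    return series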
39 | 40 | if __name__ == "__main__": 41 | if not sys.stdin.isatty(): 42 | _collect_and_plot([ s.rstrip('\n') for s in sys.stdin.readlines() ]) 43 | 44 | -------------------------------------------------------------------------------- /sg/data/sintef/test_parse_gs2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import StringIO 4 | 5 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 6 | _PATH_TO_GS2_TESTDATA = os.path.join(_PATH_TO_HERE, "testfile.gs2") 7 | _PATH_TO_GS2_TESTDATA_SHORT = os.path.join(_PATH_TO_HERE, "testfile_short.gs2") 8 | 9 | import parse_gs2 as gs2 10 | 11 | class TestGS2Parser(unittest.TestCase): 12 | def test__is_entry(self): 13 | self.assertTrue(gs2._is_entry("#Sum=0\n")) 14 | self.assertFalse(gs2._is_entry("")) 15 | self.assertFalse(gs2._is_entry(" something is wrong")) 16 | 17 | def test__is_heading(self): 18 | self.assertTrue(gs2._is_heading("##Heading")) 19 | self.assertFalse(gs2._is_heading("#Entry")) 20 | self.assertFalse(gs2._is_heading("Neither")) 21 | 22 | def _set_up_section(self, heading): 23 | line = "##%s\n" % heading 24 | return gs2._make_section(line) 25 | 26 | def test_make_section(self): 27 | heading = "Time-series" 28 | section = self._set_up_section(heading) 29 | self.assertEqual(section[0], heading) 30 | self.assertIs(type(section[1]), dict) 31 | 32 | def test_split_single_line(self): 33 | key, value = gs2._split_line("#No-of-values=168\n", None) 34 | self.assertEqual(key, "No-of-values") 35 | self.assertEqual(value, ["168"]) 36 | 37 | def test_split_multi_line(self): 38 | line = "#Value=<\n" 39 | stream = StringIO.StringIO(" 2.2//0\n 2.5//0\n 2.8//0\n>\n") 40 | key, value = gs2._split_line(line, stream) 41 | self.assertEqual(key, "Value") 42 | self.assertEqual(value, ["2.2//0", "2.5//0", "2.8//0"]) 43 | 44 | def test_split_raises(self): 45 | self.assertRaises(ValueError, gs2._split_line, 46 | "#No-of-values:168\n", None) 47 | 48 | def _set_up_section_generator(self, path): 49 | stream = open(path, "r") 50 | return [section for section in gs2.section_generator(stream)] 51 | 52 | def test_section_generator(self): 53 | parsed = self._set_up_section_generator(_PATH_TO_GS2_TESTDATA_SHORT) 54 | self.assertEqual(len(parsed), 4) 55 | self.assertEqual(parsed[0][0], "Start-message") 56 | self.assertEqual(parsed[1][0], "Time-series") 57 | self.assertEqual(parsed[2][0], "Time-series") 58 | self.assertEqual(parsed[3][0], "End-message") 59 | self.assertEqual(parsed[1][1]["Value"], 60 | ["0//0", "1.285//0", "0//0", "1.285//0", "0//0", 61 | "1.285//0", "0//0"]) 62 | 63 | def test_parse_gs2(self): 64 | parsed_manual = self._set_up_section_generator(_PATH_TO_GS2_TESTDATA) 65 | parsed_auto = gs2.parse_file(_PATH_TO_GS2_TESTDATA) 66 | self.assertEqual(parsed_manual, parsed_auto) 67 | self.assertEqual(len(parsed_auto), 29) 68 | self.assertEqual(parsed_auto[0][0], "Start-message") 69 | for i in range(1, 28): 70 | self.assertEqual(parsed_auto[i][0], "Time-series") 71 | self.assertEqual(parsed_auto[-1][0], "End-message") 72 | 73 | def test_find_first_heading(self): 74 | stream = StringIO.StringIO("##Start-message\n#Id=PD-gs2exp\n") 75 | line = gs2._find_first_heading(stream) 76 | self.assertEqual(line, "##Start-message\n") 77 | 78 | def test_find_first_heading_no_heading(self): 79 | stream = StringIO.StringIO("#Start-message\n#Id=PD-gs2exp\n") 80 | line = gs2._find_first_heading(stream) 81 | self.assertIs(line, None) 82 | 83 | def test_find_first_heading_empty_file(self): 84 | stream = 
StringIO.StringIO("") 85 | line = gs2._find_first_heading(stream) 86 | self.assertIs(line, None) 87 | 88 | if __name__ == '__main__': 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /sg/data/sintef/test_userloads.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import sg.utils.testutils as testutils 8 | 9 | from userloads import * 10 | 11 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | class TestUserLoads(testutils.ArrayTestCase): 14 | def setUp(self): 15 | pass 16 | 17 | def tearDown(self): 18 | pass 19 | 20 | def test_invalid_user(self): 21 | self.assertRaises(KeyError, tempfeeder_dup().__getitem__, -1) 22 | 23 | def test_users_equal(self): 24 | users_dup = tempfeeder_dup().user_ids 25 | users_nodup = tempfeeder_nodup().user_ids 26 | self.assertEqual(users_dup, users_nodup) 27 | 28 | def test_num_users(self): 29 | users_dup = tempfeeder_dup().user_ids 30 | self.assertEqual(len(users_dup), 2416) 31 | 32 | def test_getitem(self): 33 | user_id = 83169400 34 | ul = UserLoads(tempfeeder_dup().path) 35 | self.assertNotIn(user_id, ul.loads) 36 | user_loads = ul[user_id] 37 | self.assertIn(user_id, ul.loads) 38 | self.assertEqual(len(user_loads), 36602) 39 | self.assertIs(type(user_loads), pd.DataFrame) 40 | self.assertNaNArraysEqual(user_loads.ix[14077], np.array([1., np.nan])) 41 | self.assertArraysEqual(user_loads.ix[-1], np.array([1., 0.])) 42 | 43 | def test_get_set_get(self): 44 | user_id = 29605779 45 | idx = 15689 46 | ul = UserLoads(tempfeeder_nodup().path) 47 | user_loads = ul[user_id] 48 | user_loads.ix[idx] = np.array([123, 12]) 49 | user_loads.ix[-1] = np.array([124, 14]) 50 | self.assertArraysEqual(ul[user_id].ix[idx], np.array([123, 12])) 51 | self.assertArraysEqual(ul[user_id].ix[-1], np.array([124, 14])) 52 | ul.read(user_id) 53 | self.assertNaNArraysEqual(user_loads.ix[14077], np.array([0., np.nan])) 54 | self.assertArraysEqual(ul[user_id].ix[-1], np.array([3., 0.])) 55 | 56 | def test_pop(self): 57 | user_id = 448601 58 | ul = tempfeeder_dup() 59 | self.assertNotIn(user_id, ul.loads) 60 | loads = ul[user_id] 61 | self.assertIn(user_id, ul.loads) 62 | ul.pop(user_id) 63 | self.assertNotIn(user_id, ul.loads) 64 | 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | 69 | -------------------------------------------------------------------------------- /sg/data/sintef/test_userloads.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/test_userloads.pyc -------------------------------------------------------------------------------- /sg/data/sintef/testfile_short.gs2: -------------------------------------------------------------------------------- 1 | ##Start-message 2 | #Id=PD-gs2exp -> LogNo: 2724 3 | #Message-type=Settlement-data 4 | #Version=1.2 5 | #Time=2006-01-10.20:46:36 6 | #To=9999 7 | #From=981915550 8 | #GMT-reference=1 9 | #Description=Verdier fra 'MVS Buskerud Kraft Nett' 10 | 11 | 12 | ##Time-series 13 | #Start=2006-01-02.00:00:00 14 | #Stop=2006-01-08.24:00:00 15 | #Step=0000-00-00.01:00:00 16 | #Unit=kWh 17 | #Installation=16807 18 | #Plant=3 19 | #Meter-location=1 20 | #Value=< 21 | 0//0 22 | 1.285//0 23 | 0//0 24 | 1.285//0 25 | 0//0 26 | 1.285//0 27 | 0//0 28 | > 29 | #No-of-values=168 30 | 
#Sum=204.321 31 | #Description=4041 "ENERGI" 32 | 33 | ##Time-series 34 | #Start=2006-01-02.00:00:00 35 | #Stop=2006-01-08.24:00:00 36 | #Step=0000-00-00.01:00:00 37 | #Unit=kWh 38 | #Installation=282475249 39 | #Plant=3 40 | #Meter-location=1 41 | #Value=< 42 | 2.2//0 43 | 2.5//0 44 | 2.8//0 45 | 3.9//0 46 | > 47 | #No-of-values=168 48 | #Sum=464.5 49 | #Description=4041 "ENERGI" 50 | 51 | ##End-message 52 | #Id=PD-gs2exp -> LogNo: 2724 53 | 54 | -------------------------------------------------------------------------------- /sg/data/sintef/unique.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import collections 3 | import pprint 4 | 5 | tags = collections.defaultdict(int) 6 | for line in sys.stdin: 7 | tags[line[:-1]] += 1 8 | 9 | for key in tags: 10 | print key, ":", tags[key] 11 | 12 | -------------------------------------------------------------------------------- /sg/data/sintef/userloads.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/data/sintef/userloads.pyc -------------------------------------------------------------------------------- /sg/data/test_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import sg.utils.testutils as testutils 9 | import sg.data.bchydro as bchydro 10 | from dataset import * 11 | 12 | class DatasetTester(testutils.ArrayTestCase): 13 | def setUp(self): 14 | month_index = pd.period_range(start='2005-01-01', periods=12, freq='M') 15 | day_index = pd.period_range(start='2005-01-01', periods=365, freq='D') 16 | hour_index = pd.period_range(start='2005-01-01', periods=365*24, freq='H') 17 | self.months = pd.Series(np.arange(12), index=month_index) 18 | self.days = pd.Series(np.arange(365), index=day_index) 19 | self.hours = pd.Series(np.arange(365*24), index=hour_index) 20 | self.period = datetime.timedelta(days = 9) 21 | self.month_data = Dataset(self.months, self.period) 22 | self.day_data = Dataset(self.days, self.period) 23 | self.hour_data = Dataset(self.hours, self.period, 24 | datetime.timedelta(hours = 12)) 25 | 26 | def test_calculate_period(self): 27 | self.assertEqual(self.month_data._period_length, 1) 28 | self.assertEqual(self.day_data._period_length, 9) 29 | self.assertEqual(self.hour_data._period_length, 9 * 24) 30 | 31 | def test_number_of_periods(self): 32 | self.assertEqual(self.month_data.num_periods, 12) 33 | self.assertEqual(self.day_data.num_periods, 357) 34 | self.assertEqual(self.hour_data.num_periods, 356 * 2) 35 | 36 | def test_get_last_period(self): 37 | last_month = self.month_data.get_period(self.month_data.num_periods - 1) 38 | self.assertEqual(len(last_month), 1) 39 | self.assertEqual(last_month[0], 11) 40 | last_days = self.day_data.get_period(self.day_data.num_periods - 1) 41 | self.assertEqual(len(last_days), 9) 42 | self.assertArraysEqual(last_days, self.days[-9:]) 43 | hour_periods = self.hour_data.num_periods 44 | last_hours = self.hour_data.get_period(hour_periods - 1) 45 | self.assertEqual(len(last_hours), 9 * 24) 46 | self.assertArraysEqual(last_hours, self.hours[-9*24-12:-12]) 47 | 48 | def test_get_first_period(self): 49 | self.assertArraysEqual(self.month_data.get_period(0), self.months[0:1]) 50 | self.assertArraysEqual(self.day_data.get_period(0), self.days[0:9]) 51 | 
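        # Sanity note: the hour-based dataset uses a 9-day window stepped by 12
        # hours, so period i covers self.hours[i*12 : i*12 + 9*24]; with 365
        # days of hourly data that gives (365 - 9)*2 = 712 periods, matching
        # test_number_of_periods above and test_get_random_period below.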
self.assertArraysEqual(self.hour_data.get_period(0), self.hours[0:9*24]) 52 | 53 | def test_get_second_period(self): 54 | self.assertArraysEqual(self.month_data.get_period(1), self.months[1:2]) 55 | self.assertArraysEqual(self.day_data.get_period(1), self.days[1:10]) 56 | self.assertArraysEqual(self.hour_data.get_period(1), 57 | self.hours[12:9*24+12]) 58 | 59 | def test_get_random_period(self): 60 | for i in range(100): 61 | (ts, number) = self.month_data.get_random_period(True) 62 | self.assertArraysEqual(ts, self.months[number:number+1]) 63 | (ts, number) = self.day_data.get_random_period(True) 64 | self.assertArraysEqual(ts, self.days[number:number+9]) 65 | (ts, number) = self.hour_data.get_random_period(True) 66 | index = self.hour_data.index_of(number) 67 | self.assertEqual(index, number * 12) 68 | self.assertArraysEqual(ts, self.hours[index:index+9*24]) 69 | 70 | class MiscTester(testutils.ArrayTestCase): 71 | def test_remove_one_outlier(self): 72 | dataset = np.array([0, 1, 2, 0, 3, 4, 0, 5]) 73 | remove_outlier_set_previous(dataset, outlier_val=0) 74 | self.assertArraysEqual(dataset, np.array([0, 1, 2, 2, 3, 4, 4, 5])) 75 | 76 | def test_remove_consecutive_outliers(self): 77 | dataset = np.array([0, 1, 0, 0, 0, 4, 0, 5]) 78 | retset = remove_outlier_set_previous(dataset) 79 | self.assertArraysEqual(retset, np.array([0, 1, 1, 1, 1, 4, 4, 5])) 80 | 81 | def test_remove_other_outliers(self): 82 | dataset = np.array([0, 1, 2, 0, 3, 4, 0, 5]) 83 | remove_outlier_set_previous(dataset, outlier_val=2) 84 | self.assertArraysEqual(dataset, np.array([0, 1, 1, 0, 3, 4, 0, 5])) 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /sg/data/yr.no/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains data downloaded in XML and GRIB format from 2 | yr.no, cf. the guidelines at http://om.yr.no/verdata/xml/ and 3 | http://om.yr.no/verdata/grib/. 4 | 5 | ================================================== 6 | 7 | Preparation: 8 | ------------ 9 | 10 | The file noreg.txt was downloaded from yr.no: 11 | $ wget http://fil.nrk.no/yr/viktigestader/noreg.txt 12 | 13 | The file noreg_viktige.txt is a filtered version of noreg.txt in which 14 | all places with priority 99 have been removed, cf. the recommendation on 15 | yr.no: 16 | $ /store/gnu/bin/awk --field-separator='\t' '{ if ($4 != 99) print $0}' noreg.txt >noreg_viktige.txt 17 | 18 | Likewise, noreg_viktige_namn.txt contains only the place names and numbers: 19 | $ /store/gnu/bin/awk --field-separator='\t' '{print $1, $2}' noreg_viktige.txt >noreg_viktige_namn.txt 20 | 21 | The weather forecasts are collected in subdirectories of the directory "steder". 22 | Every directory that is to hold forecasts (in principle all the leaf nodes 23 | of the directory tree) must contain a file called "address.txt". This file 24 | must contain one (and only one) line from noreg.txt, which 25 | specifies the place and the URL of the forecast to be downloaded. 26 | 27 | For the "simple" places (no æøå in the name and no subdirectories), 28 | address.txt was generated automatically (the sed part removes DOS line breaks): 29 | $ for d in Bergen Drammen Oslo Stavanger Steinkjer Trondheim; do /store/gnu/bin/awk --field-separator='\t' "{ if (\$2 == \"$d\") print \$0}" noreg_viktige.txt | sed -e's/forecast\.xml.*$/forecast.xml/' >steder/$d/address.txt; done 30 | 31 | For the other places, address.txt was created manually using 32 | copy-and-paste technology.
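For reference, the only structural assumptions get-forecasts.sh makes about
address.txt are that the fields are tab-separated and that the last field is
the URL of the place's forecast.xml; the script extracts that last field with
awk and rewrites "forecast.xml" into "forecast_hour_by_hour.xml" before
downloading. Schematically, a line therefore has the form

  <number>	<place name>	...	<URL ending in forecast.xml>

where the remaining columns follow the noreg.txt layout and are not used by
the script.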
33 | 34 | ================================================== 35 | 36 | Downloading: 37 | ------------ 38 | 39 | The download itself is done by the script "get-forecasts.sh". 40 | 41 | It looks up all the files called address.txt and extracts the 42 | XML address from them. The address is changed to fetch the hourly forecast 43 | rather than the 6-hour one. The forecast and the wget log are stored in the 44 | respective directories. 45 | 46 | The GRIB file for northern Europe is then downloaded and stored in the 47 | directory "GRIB". 48 | -------------------------------------------------------------------------------- /sg/data/yr.no/crontab.txt: -------------------------------------------------------------------------------- 1 | # Don't redirect error messages -> get mail alert on failure 2 | 0 10 * * * $HOME/sg-shared/src/sg/data/yr.no/get-forecasts.sh >>sg-shared/data/yr.no/forecasts_log.sh 3 | 0 22 * * * $HOME/sg-shared/src/sg/data/yr.no/get-forecasts.sh >>sg-shared/data/yr.no/forecasts_log.sh 4 | 5 | -------------------------------------------------------------------------------- /sg/data/yr.no/get-forecasts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | find=/store/gnu/bin/find 4 | awk=/store/gnu/bin/awk 5 | sed=/store/gnu/bin/sed 6 | date=/store/gnu/bin/date 7 | wget=/opt/pkg/bin/wget 8 | 9 | BASE_DIR="$HOME/sg-shared/data/yr.no" 10 | GRIB_DIR="$BASE_DIR/GRIB" 11 | PLACES_DIR="$BASE_DIR/steder" 12 | ADDRESSES="address.txt" 13 | FORECAST_FILE="forecast_hour_by_hour.xml" 14 | NOW=`$date --iso-8601=hours` 15 | OUTPUT_FILE="${NOW}_forecast_hour_by_hour.xml" 16 | LOG_FILE="wget_log.txt" 17 | 18 | echo -n "Retrieving hourly forecasts for $NOW:" 19 | $find "$PLACES_DIR" -type f -name "$ADDRESSES" | while read ADDRESS_FILE; do 20 | PLACE_DIR=`dirname "$ADDRESS_FILE"` 21 | PLACE=`echo $PLACE_DIR | $awk --field-separator='/' '{print $NF}'` 22 | URL=`$awk --field-separator='\t' '{print $NF}' "$ADDRESS_FILE" | $sed -e"s/forecast.xml\$/$FORECAST_FILE/"` 23 | echo -n " $PLACE: " 24 | OUTPUT_PATH="$PLACE_DIR/$OUTPUT_FILE" 25 | $wget --output-document="$OUTPUT_PATH" "$URL" >>"$PLACE_DIR/$LOG_FILE" 2>&1 26 | if [ "$?" == 0 ]; then 27 | echo -n "Ok." 28 | else 29 | echo -n "FAILED!" 30 | rm -f "$OUTPUT_PATH" 31 | echo "Failed to retrieve forecasts for $PLACE from yr.no" >&2 32 | fi 33 | sleep 1 34 | done 35 | echo "" 36 | 37 | echo -n "Retrieving GRIB forecasts for $NOW..." 38 | OUTPUT_FILE="${NOW}_metno-neurope.grb" 39 | OUTPUT_PATH="$GRIB_DIR/$OUTPUT_FILE" 40 | URL="http://api.met.no/weatherapi/gribfiles/1.0/?area=north_europe;content=weather;content_type=application/octet-stream;" 41 | $wget --no-verbose --output-document="$OUTPUT_PATH" "$URL" >>"$GRIB_DIR/$LOG_FILE" 2>&1 42 | if [ "$?" == 0 ]; then 43 | echo "Ok." 44 | else 45 | echo "FAILED!"
46 | rm -f "$OUTPUT_PATH" 47 | echo "Failed to retrieve GRIB forecast from yr.no" >&2 48 | fi 49 | 50 | -------------------------------------------------------------------------------- /sg/globals.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 4 | SG_DATA_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "data") 5 | SG_PAPERS_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "papers") 6 | SG_SIM_PATH = os.path.join(_PATH_TO_HERE, "..", "..", "simulations") 7 | SG_MODELS_PATH = os.path.join(_PATH_TO_HERE, "models") 8 | -------------------------------------------------------------------------------- /sg/globals.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/globals.pyc -------------------------------------------------------------------------------- /sg/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/__init__.py -------------------------------------------------------------------------------- /sg/models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/__init__.pyc -------------------------------------------------------------------------------- /sg/models/demo_cleansing.py: -------------------------------------------------------------------------------- 1 | """Demonstrate the cleansing algorithm on datasets of varying length.""" 2 | 3 | import sys 4 | import time 5 | from datetime import timedelta as dt 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import sg.data.sintef.userloads as ul 12 | import spclean as cln 13 | from sg.utils.timer import SimpleTimer 14 | 15 | def _get_smoother(): 16 | # Set slow_smoother to True in order to see the actual time consumed by the 17 | # B-spline smoothing operation. If set to False, will use the default 18 | # smoother where the roughness matrices are cached. 19 | slow_smoother = True 20 | if slow_smoother: 21 | print "Using slow, analytic, non-caching smoother." 22 | return cln.BSplineAnalyticSmoother 23 | else: 24 | print "Using not quite so slow, caching smoother." 25 | return cln.BSplineSmoother 26 | 27 | # Load a dataset containing power load history. This set is divided into 28 | # training and test data, we only keep the traning part for now. 29 | dataset, _ = ul.total_experiment_load() 30 | 31 | # Set parameters for the B-spline smoother/cleanser 32 | smoothness = 10 33 | zscore = 0.5 34 | # Try smoothing/cleansing different time series lengths 35 | for hindsight_days in [1]: 36 | # Select data 37 | num_hours = 24 * hindsight_days 38 | data = dataset["Load"][-num_hours:].copy() 39 | # Some output and rough timing 40 | print "Cleansing %d hours of data with smoothness %.2f, z-score %.2f..." 
% \ 41 | (num_hours, smoothness, zscore) 42 | sys.stdout.flush() 43 | start_time = time.time() 44 | # This is the part that takes time 45 | smoother = _get_smoother()(data, smoothness) 46 | cleaner = cln.RegressionCleaner(smoother, zscore) 47 | cleaned, _ = cleaner.get_cleaned_data( 48 | method=cln.RegressionCleaner.replace_with_bound) 49 | # Wrap up and plot the result 50 | end_time = time.time() 51 | print "Done in %s." % SimpleTimer.period_to_string(start_time, end_time) 52 | 53 | print cleaned 54 | sys.stdout.flush() 55 | plt.figure() 56 | data.plot(style='r', label='Raw load') 57 | 58 | spline = pd.TimeSeries(data=smoother.splev(range(len(cleaned))), 59 | index=cleaned.index) 60 | spline.plot(style='g', label='Smoothing spline') 61 | 62 | # THE SAUSAGE! 63 | lower, upper = cleaner.get_confidence_interval() 64 | ax = plt.gca() 65 | ax.fill_between(cleaned.index, lower, upper, facecolor='g', alpha=0.1) 66 | 67 | cleaned.plot(style='b', label='Cleaned load') 68 | plt.legend(loc=3) 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /sg/models/esn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/esn.pyc -------------------------------------------------------------------------------- /sg/models/exp_cleaning.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import timedelta as dt 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | import sg.data.bchydro as bc 8 | import spclean as cln 9 | 10 | def clean_all_bc_data(period_days=7, step_days=6, 11 | smoothnesses=(0.1, 1, 3, 6, 10, 100)): 12 | """Clean the entire BC Hydro dataset period by period with the given 13 | smoothnesses. By default clean a week at a time with 1 day overlap (step 14 | length 6 days). 15 | 16 | Returns a dictionary keyed on the smoothness, where the values are lists of 17 | tuples, each tuple consisting of the period number and the outlier indices 18 | for all periods with outliers.""" 19 | 20 | dataset = bc.Dataset(period=dt(days=period_days), 21 | step_length=dt(days=step_days)) 22 | outliers_at = dict() 23 | for smoothness in smoothnesses: 24 | outliers_at[smoothness] = cln.clean_entire_dataset(dataset, smoothness) 25 | print "cleaned with smoothness", smoothness 26 | return (dataset, outliers_at) 27 | 28 | def clean_and_process_bc_data(period_days=7, step_days=6, 29 | smoothnesses=(0.1, 1, 3, 6, 10, 100)): 30 | """Clean BC data using clean_all_bc_data. 
Then plot the data to show the 31 | distribution of outliers per period and smoothness.""" 32 | data, outliers_at = clean_all_bc_data(period_days, step_days, smoothnesses) 33 | x = np.arange(data.num_periods) 34 | y_at = dict() 35 | for (smoothness, outliers) in outliers_at.iteritems(): 36 | y = np.zeros(data.num_periods) 37 | for (period, outlier_indices) in outliers: 38 | y[period] = len(outlier_indices) 39 | y_at[smoothness] = y 40 | plt.figure() 41 | plt.hold(True) 42 | plt.title("Number of cleaned points for various smoothnesses") 43 | axes = plt.gcf().gca() 44 | for (smoothness, y) in y_at.iteritems(): 45 | plt.figure() 46 | plt.plot(x, y, 'x') 47 | plt.title("Number of cleaned points for smoothness %.2f" % smoothness) 48 | axes.plot(x, y, 'x', label="Smoothness %.2f" % smoothness) 49 | plt.figure() 50 | plt.hist(y) 51 | plt.title("Histogram of number of cleaned points for " \ 52 | "smoothness %.2f" % smoothness) 53 | axes.legend() 54 | 55 | def show_max_cleaning(): 56 | week = 264 57 | dataset = bc.Dataset(period=dt(days=7), step_length=dt(days=6)) 58 | period = dataset.get_period(week) 59 | smoother = cln.BSplineSmoother(period, smoothness=3) 60 | cleaner = cln.RegressionCleaner(smoother, zscore=0.67) 61 | (clean_data, outliers) = cleaner.get_cleaned_data( 62 | cln.RegressionCleaner.replace_with_estimate) 63 | plt.figure() 64 | plt.hold(True) 65 | n = len(smoother.dataset) 66 | knots = smoother.knots 67 | t = np.linspace(knots[0], knots[-1], n * 25) 68 | y = smoother.splev(t) 69 | plt.hold(True) 70 | plt.plot(t, y) 71 | x = np.linspace(knots[0], knots[-1], n) 72 | plt.plot(x, smoother.dataset, 'mx') 73 | (lower, upper) = cleaner.get_confidence_interval() 74 | 75 | plt.plot(lower, 'g-') 76 | plt.plot(upper, 'g-') 77 | if len(outliers) > 0: 78 | print "Drawing %d outliers." % len(outliers) 79 | plt.plot(outliers, clean_data[outliers], 'r*', label="Cleaned data") 80 | else: 81 | print "No outliers!" 
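# A minimal sketch of how the dictionary returned by clean_all_bc_data is meant
# to be consumed (clean_and_process_bc_data above does essentially this when it
# builds the per-smoothness plots):
#
#   data, outliers_at = clean_all_bc_data(period_days=7, step_days=6,
#                                         smoothnesses=(1, 10))
#   for period, outlier_indices in outliers_at[10]:
#       print "period", period, "has", len(outlier_indices), "outliers"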
82 | 83 | if __name__ == "__main__": 84 | show_max_cleaning() 85 | print "Done cleaning, showing plot" 86 | plt.show() 87 | -------------------------------------------------------------------------------- /sg/models/filter-R-messages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | 6 | def rfilter(stream): 7 | messages = ["In log\(s2\) : NaNs produced", 8 | "^Warning message[s]*:", 9 | "[Ii]n arima\(x = loads, order = order, xreg = temp_hc\)", 10 | "non-stationary AR part from CSS", 11 | "possible convergence problem: optim gave code=", 12 | "Error in optim\(init\[mask\], armafn", 13 | "non-finite finite-difference value", 14 | "There were [0-9]* warnings \(use warnings\(\) to see them\)"] 15 | re_objs = [re.compile(msg) for msg in messages] 16 | for line in stream: 17 | do_filter = False 18 | for prog in re_objs: 19 | if prog.search(line) is not None: 20 | do_filter = True 21 | if not do_filter: 22 | print line[:-1] 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) == 1: 26 | rfilter(sys.stdin) 27 | else: 28 | for path in sys.argv[1:]: 29 | with open(path, "r") as f: 30 | rfilter(f) 31 | 32 | -------------------------------------------------------------------------------- /sg/models/ga.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/ga.pyc -------------------------------------------------------------------------------- /sg/models/genome_evaluator.py: -------------------------------------------------------------------------------- 1 | """Use this program to evaluate one genome at a time, read from standard 2 | input.""" 3 | 4 | import sys 5 | import ast 6 | import traceback 7 | import random 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | import sg.utils.pyevolve_utils as pu 12 | import sg.utils 13 | import ga 14 | import sg.data.sintef.userloads as ul 15 | import load_prediction as lp 16 | from load_prediction_ar import * 17 | from load_prediction_ar24 import * 18 | from load_prediction_arima import * 19 | from load_prediction_dshw import * 20 | from load_prediction_esn import * 21 | from load_prediction_esn24 import * 22 | try: 23 | from load_prediction_CBR import * 24 | from load_prediction_wavelet import * 25 | from load_prediction_wavelet24 import * 26 | except ImportError: 27 | print >>sys.stderr, "Genome evaluator can't import CBR/wavelet modules, probably some of the dependencies are not installed." 28 | 29 | options = None 30 | def get_options(): 31 | global options 32 | parser = lp.prediction_options() 33 | parser = lp.ga_options(parser) 34 | parser = lp.data_options(parser) 35 | parser.add_option("--model", dest="model", help="The model class that the genomes instantiate", default=None) 36 | parser.add_option("--test-set", dest="test_set", action="store_true", 37 | help="Test the genomes on the test set, rather than on the training set", default=False) 38 | parser.add_option("--plot", dest="plot", action="store_true", 39 | help="Make a plot (in combination with --test-set)", default=False) 40 | (options, args) = parser.parse_args() 41 | lp.options = options 42 | if options.model is None: 43 | print >>sys.stderr, "Model argument is required." 
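        # (Illustration only, not taken from the original source: a typical
        #  call would be something like
        #    python genome_evaluator.py --model ESNModelCreator --test-set
        #  after which genomes are typed on stdin as Python list literals,
        #  cf. read_next_genome_list below; the number and meaning of the loci
        #  depend on the chosen model class.)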
44 | sys.exit(1) 45 | 46 | def read_next_genome_list(): 47 | print "Enter genome to be evaluated: " 48 | line = sys.stdin.readline() 49 | if line == "": 50 | print "End of input, exiting." 51 | sys.exit(0) 52 | return ast.literal_eval(line) 53 | 54 | def next_indiv(): 55 | gl = read_next_genome_list() 56 | genome = pu.AllelesGenome() 57 | genome.setInternalList(gl) 58 | genome.setParams(num_trials=options.num_trials) 59 | return genome 60 | 61 | def gene_test_loop(model): 62 | while sys.stdin: 63 | ga._model = model 64 | indiv = next_indiv() 65 | if options.test_set: 66 | print "Evaluating genome on test set: ", indiv[:] 67 | sys.stdout.flush() 68 | try: 69 | (target, predictions) = lp.parallel_test_genome(indiv, model) if options.parallel else lp.test_genome(indiv, model) 70 | except Exception, e: 71 | print >>sys.stderr, "Exception raised, failed to evaluate genome." 72 | tb = " " + traceback.format_exc(limit=50)[:-1] 73 | print >>sys.stderr, tb.replace("\n", "\n ") 74 | continue 75 | error = sg.utils.concat_and_calc_error(predictions, target, model.error_func) 76 | print "Error on test phase: {}".format(error) 77 | if options.plot: 78 | sg.utils.plot_target_predictions(target, predictions) 79 | plt.show() 80 | else: 81 | print "Evaluating genome on training set: ", indiv[:] 82 | sys.stdout.flush() 83 | fitness = ga._fitness(indiv) 84 | print "Fitness:", fitness 85 | if fitness != 0: 86 | print "Error:", ga._fitness_to_error(fitness) 87 | else: 88 | print "Error not calculated for 0 fitness." 89 | 90 | def run(): 91 | """.""" 92 | get_options() 93 | prev_handler = np.seterrcall(lp.float_err_handler) 94 | prev_err = np.seterr(all='call') 95 | np.seterr(under='ignore') 96 | random.seed(options.seed) 97 | np.random.seed(options.seed) 98 | model_creator = eval(options.model + "(options)") 99 | model = model_creator.get_model() 100 | lp._print_sim_context(model._dataset) 101 | print "Number of training sequences: %d" % options.num_trials 102 | print "Start days of training sequences:", model._dataset.train_periods_desc 103 | gene_test_loop(model) 104 | ul.tempfeeder_exp().close() 105 | 106 | if __name__ == "__main__": 107 | run() 108 | 109 | -------------------------------------------------------------------------------- /sg/models/gridopt_load_prediction.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta as dt 2 | import math 3 | import random 4 | import os 5 | import sys 6 | import cPickle as pickle 7 | 8 | import numpy as np 9 | import Oger 10 | import mdp, mdp.nodes 11 | import matplotlib.pyplot as plt 12 | import scikits.timeseries as ts 13 | 14 | import esn 15 | import sg.utils 16 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 17 | import sg.data.sintef.userloads as ul 18 | import load_prediction 19 | 20 | def optimize(postfix): 21 | # sg.utils.redirect(sys.stdout, "gridopt_output_%s.txt" % postfix) 22 | 23 | user_id = 55864860 24 | (dataset, test) = load_prediction.prepare_datasets(user_id) 25 | 26 | day = 24 27 | freerun = day 28 | today = 4600 29 | 30 | # [len_data, res_size, leak, input, bias, spectral, 31 | # seed, ridge, tmp_sm, load_sm] 32 | train_hours = 336 33 | 34 | datas = \ 35 | [sg.utils.Normalizer(dataset[today-train_hours:today+day-freerun,:], axis=0) 36 | for today in (1000, 2000, 3000, 4000)] 37 | 38 | input_data = [] 39 | for data in datas: 40 | temps, loads = zip(*data.normalized) 41 | input_data.append([np.array((temps[24:], loads[:-24], loads[24:])).T]) 42 | 43 | 
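    # At this point each element of input_data is a one-element list holding an
    # (n, 3) array whose columns, for hour t, are: temperature at t, load at
    # t-24 and load at t. The external_input_range=np.array([0, 1]) passed to
    # the FreerunFlow below presumably marks the first two columns as external
    # inputs, so the last column (the current load) is the signal the flow
    # free-runs on during the final freerun steps.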
reservoir = Oger.nodes.LeakyReservoirNode(output_dim=400, 44 | leak_rate=1, 45 | input_scaling=0.5, 46 | bias_scaling=0.75, 47 | spectral_radius=1, 48 | reset_states=False) 49 | readout = Oger.nodes.RidgeRegressionNode(ridge_param = 0.001) 50 | flow = Oger.nodes.FreerunFlow(reservoir + readout, 51 | freerun_steps = freerun, 52 | external_input_range= \ 53 | np.array([0, 1])) 54 | 55 | # gridsearch_parameters = {reservoir: {'_instance': range(5), 56 | # 'spectral_radius': [0.6, 0.8, 1], 57 | # 'input_scaling': [0.1, 0.5, 0.9], 58 | # 'bias_scaling': [0.1, 0.5, 0.9], 59 | # 'leak_rate': [0.1, 0.5, 0.9]}, 60 | # readout: {'_instance': range(5), 61 | # 'ridge_param': [0.1, 0.5, 0.9]}} 62 | 63 | gridsearch_parameters = {reservoir: {'_instance': range(20)}, 64 | readout: {'ridge_param': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}} 65 | 66 | print "gridsearch_parameters = " + str(gridsearch_parameters) 67 | optimizer = Oger.evaluation.Optimizer(gridsearch_parameters, 68 | Oger.utils.nrmse) 69 | 70 | optimizer.grid_search([[], input_data], flow, 71 | cross_validate_function=Oger.evaluation.leave_one_out) 72 | 73 | return (optimizer, reservoir) 74 | 75 | def store_optimal_flow(optimizer, postfix): 76 | optflow = optimizer.get_optimal_flow(verbose=True) 77 | 78 | with open("gridopt_optimal_flow_%s.pickle" % postfix, "w") as f: 79 | pickle.dump(optflow, f) 80 | 81 | 82 | if __name__ == "__main__": 83 | #postfix = str(os.getpid()) 84 | postfix = "deleteme" 85 | optimizer, reservoir = optimize(postfix) 86 | store_optimal_flow(optimizer, postfix) 87 | optimizer.plot_results([(reservoir, '_instance')]) 88 | 89 | -------------------------------------------------------------------------------- /sg/models/gui.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/gui.pyc -------------------------------------------------------------------------------- /sg/models/lib_atlas/BsplineAnalyticSmoother.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BsplineAnalyticSmoother.h 3 | * 4 | * Created on: Dec 25, 2012 5 | * Author: Hasib 6 | */ 7 | 8 | #ifndef BSPLINEANALYTICSMOOTHER_H_ 9 | #define BSPLINEANALYTICSMOOTHER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #ifdef __cplusplus 15 | extern "C" 16 | { 17 | #endif 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #define N_ALIGN (size_t)64 31 | 32 | class BsplineAnalyticSmoother { 33 | 34 | private: 35 | int degree; 36 | int n_threads; 37 | unsigned int n_data; 38 | unsigned int n_knot; 39 | unsigned int n_coef; 40 | double smoothness; 41 | double zscore; 42 | double *knots; 43 | double *dataset; 44 | double *S; 45 | double *smoothed_data; 46 | double *cleaned_data; 47 | 48 | double bsplinebasis(unsigned int i, int p, double t); 49 | double bsplinebasis_deriv(int i, int p, int n, double t); 50 | double* get_phi(); 51 | double* get_roughness(); 52 | void calc_hatMatrix(); 53 | 54 | public: 55 | BsplineAnalyticSmoother(double *dataset, unsigned int n_data, double *knots, unsigned int n_knot, int degree, double smoothness, double zscore, int n_threads); 56 | virtual ~BsplineAnalyticSmoother(); 57 | void calc_smoothedData(); 58 | double* calc_cleanedData(); 59 | double *get_smoothedData(); 60 | }; 61 | 62 | #endif /* BSPLINEANALYTICSMOOTHER_H_ */ 63 | 
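// Usage sketch (an assumption based only on the public interface above; the
// Makefile builds this class into libspclean.so, which is presumably loaded
// from the Python side, cf. sg/models/spclean.py and spclean_wrapper.py):
//
//   BsplineAnalyticSmoother smoother(dataset, n_data, knots, n_knot,
//                                    degree, smoothness, zscore, n_threads);
//   smoother.calc_smoothedData();
//   double *cleaned = smoother.calc_cleanedData();
//   double *smoothed = smoother.get_smoothedData();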
-------------------------------------------------------------------------------- /sg/models/lib_atlas/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # source files 3 | #SRC = BsplineAnalyticSmoother.cpp demoCleansing.cpp 4 | SRC = BsplineAnalyticSmoother.cpp 5 | 6 | OBJ = $(SRC:.cpp=.o) 7 | 8 | PHYLIB = libspclean.so 9 | 10 | # #include directories 11 | #INCLUDES = -I. -I/usr/local/include -I/usr/local/include/atlas -I/usr/include 12 | #INCLUDES = -I. -I/usr/local/include -I/usr/local/include/atlas -I/home/hasib/software/mathlib/plasma-installer_2.5.0b1/install/include -I$(HOME)/include 13 | INCLUDES = -I. -I/usr/local/include -I/usr/include -I/usr/local/include/atlas -I$(HOME)/include 14 | 15 | #C++ compiler flags 16 | #CFLAGS = -O3 -fPIC -Wall #for lib 17 | CFLAGS = -O3 -fPIC -fopenmp -Wall #for lib 18 | #CFLAGS = -O3 -fopenmp #for executable 19 | #CFLAGS = -g -pg -O0 -Wall -fPIC # for debugging 20 | 21 | #compiler 22 | CC = g++ 23 | 24 | #library paths 25 | hostname := $(shell hostname) 26 | 27 | ifeq ($(hostname), rocks.hpc.ntnu.no) 28 | LIBS = -llapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -lgfortran 29 | else 30 | #LIBS = -L/home/hasib/software/mathlib/plasma-installer_2.5.0b1/install/lib -L/usr/lib64 -L/usr/local/lib64 31 | LIBS = -L/usr/local/lib64 -L$(HOME)/lib/ptlib -lptlapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 32 | #LIBS = -L/usr/local/lib64 -L$(HOME)/lib/ptlib -lptlapack -lptcblas -lptf77blas -latlas -lm -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 33 | endif 34 | 35 | default: $(PHYLIB) 36 | 37 | .cpp.o: 38 | $(CC) $(INCLUDES) $(CFLAGS) -c $< -o $@ 39 | 40 | $(PHYLIB):$(OBJ) 41 | $(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) 42 | 43 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcoreblas -lquark -lcblas -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lrt -lpthread -lgomp -L/usr/lib64/gcc/x86_64-suse-linux/4.6/ -lgfortran 44 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcblas -lcoreblas -lquark -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lpthread -lgomp 45 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -llapack -lptcblas -lptf77blas -latlas -lpthread -lgomp 46 | #$(CC) -o $(PHYLIB) $(OBJ) $(LIBS) -lplasma -lcoreblas -lquark -lcblas -llapacke -ltmg -llapack -lf77blas -latlas -lm -lrt -lhwloc -lrt -lpthread 47 | #$(CC) -o $(PHYLIB) $(OBJ) $(LIBS) -lpthread -lptcblas -latlas -llapack -llapacke -lquark -ltmg -lcoreblas -lplasma -lm -lrt 48 | #$(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lcblas -latlas -lgomp 49 | 50 | clean: 51 | rm -f *.o $(PHYLIB) 52 | 53 | -------------------------------------------------------------------------------- /sg/models/lib_mkl/BsplineAnalyticSmoother.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BsplineAnalyticSmoother.cpp 3 | * 4 | * Created on: Dec 25, 2012 5 | * Last Modified on: Feb 7, 2013 6 | * Feature: parallel BLAS and LAPACK 7 | * Author: Hasib 8 | * 9 | * */ 10 | 11 | #ifndef BSPLINEANALYTICSMOOTHER_H_ 12 | #define BSPLINEANALYTICSMOOTHER_H_ 13 | 14 | #include 15 | 16 | #ifdef __cplusplus 17 | extern "C" 18 | { 19 | #endif 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #define N_ALIGN (size_t)64 33 | 34 | class BsplineAnalyticSmoother { 35 | 36 | private: 37 | int degree; 38 | 
unsigned int n_data; 39 | unsigned int n_knot; 40 | unsigned int n_coef; 41 | double smoothness; 42 | double zscore; 43 | double *knots; 44 | double *dataset; 45 | double *S; 46 | double *smoothed_data; 47 | double *cleaned_data; 48 | 49 | double bsplinebasis(unsigned int i, int p, double t); 50 | double bsplinebasis_deriv(int i, int p, int n, double t); 51 | double* get_phi(); 52 | double* get_roughness(); 53 | void calc_hatMatrix(); 54 | 55 | public: 56 | BsplineAnalyticSmoother(double *dataset, unsigned int n_data, double *knots, unsigned int n_knot, int degree, double smoothness, double zscore); 57 | virtual ~BsplineAnalyticSmoother(); 58 | void calc_smoothedData(); 59 | double* calc_cleanedData(); 60 | void print_cleanedData(); 61 | double *get_smoothedData(); 62 | 63 | }; 64 | 65 | #endif /* BSPLINEANALYTICSMOOTHER_H_ */ 66 | -------------------------------------------------------------------------------- /sg/models/lib_mkl/Makefile: -------------------------------------------------------------------------------- 1 | 2 | #source files 3 | SRC = BsplineAnalyticSmoother.cpp 4 | 5 | OBJ = $(SRC:.cpp=.o) 6 | 7 | PHYLIB = libspclean.so 8 | 9 | #include directories 10 | INCLUDES = -I. -I$(MKLROOT)/include 11 | 12 | #C++ compiler flags 13 | CFLAGS = -O3 -fPIC -openmp 14 | 15 | #compiler 16 | CC = icpc 17 | 18 | #library paths 19 | LIBS = -L$(MKLROOT)/lib/intel64/ 20 | 21 | default: $(PHYLIB) 22 | 23 | .cpp.o: 24 | $(CC) $(INCLUDES) $(CFLAGS) -c $< -o $@ 25 | 26 | #create library 27 | $(PHYLIB):$(OBJ) 28 | $(CC) -shared -o $(PHYLIB) $(OBJ) $(LIBS) -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -lmkl_avx -lmkl_def 29 | 30 | 31 | clean: 32 | rm -f *.o $(PHYLIB) 33 | -------------------------------------------------------------------------------- /sg/models/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | -------------------------------------------------------------------------------- /sg/models/load_cleansing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from multiprocessing import Lock 3 | import sys 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import sg.models.spclean as cln 10 | from sg.utils.cache import ATimeCache 11 | from sg.utils.timer import SimpleTimer 12 | 13 | _smoother = None 14 | _max_cache_size = 10000 15 | _temp_mutex = Lock() 16 | _load_mutex = Lock() 17 | _temp_cache = ATimeCache(_max_cache_size) 18 | _load_cache = ATimeCache(_max_cache_size) 19 | 20 | def _get_dataset_hash(dataset): 21 | m = hashlib.md5() 22 | m.update(dataset) 23 | # Put the index in the hash, otherwise invalid datasets will be created 24 | # when we have no temperature data (different dates, same data -> 25 | # TimeSeries with lots of NaNs). 26 | m.update(str(dataset.index[0].value)) 27 | m.update(str(dataset.index[-1].value)) 28 | return m.digest() 29 | 30 | def bspline_clean_dataset(dataset, genome, loci, prediction_steps): 31 | """Clean a dataset containing temperatures and loads using cleaning 32 | parameters from the genome. The dataset is expected to contain NaNs in the 33 | last *prediction_steps* elements of the Load series""" 34 | # Having the smoother as a global is not nice, but it speeds up things A 35 | # LOT, because pickling the smoother caches takes a long time for large 36 | # matrices (long time series). 
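    # The module-level caches above are keyed on
    #   (md5 of the series values and of its first/last timestamps,
    #    smoothness gene, z-score gene)
    # -- see _get_dataset_hash -- so a cached cleaning result is only reused
    # when both the exact data window and the cleaning parameters match.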
37 | global _smoother, _temp_cache, _load_cache 38 | if _smoother is None: 39 | _smoother = cln.BSplineSmoother(dataset, smoothness=1) 40 | clean_data = dataset.copy() 41 | key = (_get_dataset_hash(dataset["Temperature"]), 42 | genome[loci.t_smooth], genome[loci.t_zscore]) 43 | try: 44 | _temp_mutex.acquire() 45 | clean_data['Temperature'] = _temp_cache[key].copy() 46 | # print "Got temp from cache: ", key[1], key[2] 47 | # sys.stdout.flush() 48 | except KeyError: 49 | _temp_mutex.release() 50 | # print "Storing temp to cache: ", key[1], key[2] 51 | # sys.stdout.flush() 52 | clean_data['Temperature'] = \ 53 | cln.bspline_clean(dataset['Temperature'], 54 | genome[loci.t_smooth], 55 | genome[loci.t_zscore], _smoother) 56 | _temp_mutex.acquire() 57 | _temp_cache[key] = clean_data['Temperature'].copy() 58 | _temp_mutex.release() 59 | key = (_get_dataset_hash(dataset["Load"]), 60 | genome[loci.l_smooth], genome[loci.l_zscore]) 61 | try: 62 | _load_mutex.acquire() 63 | clean_data['Load'][:-prediction_steps] = _load_cache[key].copy() 64 | # print "Got load from cache: ", key[1], key[2] 65 | # sys.stdout.flush() 66 | except KeyError: 67 | _load_mutex.release() 68 | # print "Storing load to cache: ", key[1], key[2] 69 | # sys.stdout.flush() 70 | clean_data['Load'][:-prediction_steps] = \ 71 | cln.bspline_clean(dataset['Load'][:-prediction_steps], 72 | genome[loci.l_smooth], 73 | genome[loci.l_zscore], _smoother) 74 | _load_mutex.acquire() 75 | _load_cache[key] = clean_data['Load'][:-prediction_steps].copy() 76 | _load_mutex.release() 77 | return clean_data 78 | 79 | def bspline_clean_dataset_no_cache(dataset, genome, loci, prediction_steps): 80 | """Clean a dataset containing temperatures and loads using cleaning 81 | parameters from the genome. The dataset is expected to contain NaNs in the 82 | last *prediction_steps* elements of the Load series""" 83 | # Having the smoother as a global is not nice, but it speeds up things A 84 | # LOT, because pickling the smoother caches takes a long time for large 85 | # matrices (long time series). 86 | global _smoother 87 | if _smoother is None: 88 | _smoother = cln.BSplineSmoother(dataset, smoothness=1) 89 | clean_data = dataset.copy() 90 | clean_data['Temperature'] = cln.bspline_clean(dataset['Temperature'], 91 | genome[loci.t_smooth], 92 | genome[loci.t_zscore], _smoother) 93 | clean_data['Load'][:-prediction_steps] = \ 94 | cln.bspline_clean(dataset['Load'][:-prediction_steps], 95 | genome[loci.l_smooth], 96 | genome[loci.l_zscore], _smoother) 97 | return clean_data 98 | 99 | def bspline_clean_dataset_fast(dataset, genome, loci, prediction_steps): 100 | """Clean a dataset containing temperatures and loads using cleaning 101 | parameters from the genome. 
The dataset is expected to contain NaNs in the 102 | last *prediction_steps* elements of the Load series""" 103 | clean_data = dataset.copy() 104 | clean_data['Temperature'] = cln.bspline_clean_fast( 105 | dataset['Temperature'], genome[loci.t_smooth], genome[loci.t_zscore]) 106 | clean_data['Load'][:-prediction_steps] = \ 107 | cln.bspline_clean_fast( 108 | dataset['Load'][:-prediction_steps], 109 | genome[loci.l_smooth], genome[loci.l_zscore]) 110 | return clean_data 111 | -------------------------------------------------------------------------------- /sg/models/load_cleansing.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/load_cleansing.pyc -------------------------------------------------------------------------------- /sg/models/load_prediction.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/load_prediction.pyc -------------------------------------------------------------------------------- /sg/models/load_prediction.py~: -------------------------------------------------------------------------------- 1 | """Initiates models, runs them through a genetic algorithm to find the 2 | optimal parameters, and tests the models in a production setting.""" 3 | 4 | import random 5 | from datetime import timedelta as dt 6 | import numpy as np 7 | import mdp, Oger, pdb 8 | import matplotlib.pyplot as plt 9 | import scipy 10 | import scikits.timeseries as ts 11 | import itertools as it 12 | 13 | from pyevolve import GAllele 14 | import sg.data.bchydro as bchydro 15 | import sg.utils 16 | from model import Model 17 | from ga import run_GA 18 | import esn 19 | 20 | def _load_prediction(): 21 | """This is where the models are defined. The models are passed to the GA 22 | engine for evolution of the optimal set of parameters. 
Afterwards, 23 | the models are tested, and performance is measured.""" 24 | 25 | dataset = bchydro.Dataset(period=dt(days=7*3)) 26 | train, test = dataset.split() 27 | 28 | alleles = GAllele.GAlleles() 29 | alleles.add(GAllele.GAlleleRange(24,1000)) # Data length 30 | alleles.add(GAllele.GAlleleRange(10, 250)) # Network size 31 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Leak rate 32 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Input scaling 33 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Bias scaling 34 | alleles.add(GAllele.GAlleleRange(0, 1, real=True)) # Spectral radius 35 | alleles.add(GAllele.GAlleleRange(0,100000)) # Seed 36 | 37 | ESN_feedforward = Model(genes = alleles, error_func = Oger.utils.nrmse, 38 | train_and_predict_func = esn.feedforward, 39 | dataset = train) 40 | 41 | for model in [ ESN_feedforward ]: 42 | run_GA(model) 43 | print model.genome 44 | day = model.genome.getParam('day') 45 | i = 1 46 | target = [] 47 | prediction = [] 48 | while day*i+model.genome[0] <= len(test.series): 49 | test_day = sg.utils.scale(test.series[day*(i-1):day*i+model.genome[0]].data) 50 | test_day.shape = [test_day.shape[0], 1] 51 | ytest = model.train_and_predict_func(test_day[:-day], 52 | model.genome) 53 | target.append(test_day[-day:]) 54 | prediction.append(ytest[-day:]) 55 | i += 1 56 | 57 | target = [ e[0] for e in it.chain(*target) ] 58 | prediction = [ e[0] for e in it.chain(*prediction) ] 59 | plt.figure() 60 | plt.title('Prediction through test phase, %i days, error = %3.2f'%\ 61 | (i,model.error_func(np.array(prediction),np.array(target)))) 62 | plt.plot(target, 'b', label='Target') 63 | plt.plot(prediction, 'r', label='Prediction') 64 | plt.legend(loc=3) 65 | plt.show() 66 | 67 | if __name__ == "__main__": 68 | _load_prediction() 69 | 70 | 71 | # Plot the best individual, with a red line showing when the 72 | # prediction started. 73 | # plt.figure() 74 | # plt.title('Target, test, real life plots') 75 | # plt.plot(top_dog.target, 'b', label='Target') 76 | # plt.plot(top_dog.xtest, top_dog.ytest, 77 | # 'g', label='%s, Error=%3.2f'%(top_dog.label,top_dog.score)) 78 | # plt.axvline(top_dog.xtest[0], plt.ylim()[0], plt.ylim()[1], color='r') 79 | # # Plot real life example, magenta line showing real life performance start. 80 | # plt.plot(top_dog.xreal, top_dog.yreal, 81 | # 'c', label='Real life prediction, Error=%3.2f' \ 82 | # %top_dog.getParam('error_func')(top_dog.yreal, top_dog.target[-24:])) 83 | # plt.plot(top_dog.xreal, top_dog.retrain, 84 | # 'k', label='Retrained, Error=%3.2f' \ 85 | # %top_dog.getParam('error_func')(top_dog.retrain, top_dog.target[-24:])) 86 | # plt.axvline(top_dog.xreal[0], plt.ylim()[0], plt.ylim()[1], color='m') 87 | # plt.legend(loc=3) 88 | # # Plot real life performance 89 | # plt.figure() 90 | # plt.title('Real life Error') 91 | # plt.errorbar(np.arange(0,len(real_avg),1), real_avg, yerr=real_std, label='Mean') 92 | # plt.plot(real_max, label='Max') 93 | # plt.plot(real_min, label='Min') 94 | # plt.legend() 95 | # # Scatter plot of tested error vs actual error 96 | # plt.figure() 97 | # plt.scatter([ I.score for I in ga.getPopulation() ], 98 | # [ I.getParam('error_func')(I.yreal, I.target[-24:]) for I in ga.getPopulation() ]) 99 | # plt.xlabel('Raw score') 100 | # plt.ylabel('Real life score') 101 | # plt.show() 102 | 103 | # We store the result for plotting purposes. 
104 | # chromosome.target = load_data 105 | # chromosome.ytest = ytest[-day:] 106 | # chromosome.xtest = scipy.arange(len(load_data)-day*2, len(load_data)-day, 1) 107 | -------------------------------------------------------------------------------- /sg/models/load_prediction_ar.py: -------------------------------------------------------------------------------- 1 | '''Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.''' 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | 13 | class ARModelCreator(load_prediction.ModelCreator): 14 | def _add_transform_genes(self): 15 | '''Sets up for evolution of the ARIMA model.''' 16 | self._alleles.add(pu.make_int_gene(1, 1, 8*24, 5)) 17 | self._alleles.add(pu.make_int_gene(1, 0, 8*24, 5)) 18 | self._loci_list += ['AR_order'] 19 | self._loci_list += ['EXO_order'] 20 | 21 | def _get_transform(self): 22 | return arima.ar_ga 23 | 24 | 25 | class ARBitmapModelCreator(load_prediction.ModelCreator): 26 | def _add_transform_genes(self): 27 | '''Sets up for evolution of the ARIMA model.''' 28 | self._alleles.add(pu.make_bitmap_gene(24*8)) 29 | self._alleles.add(pu.make_bitmap_gene(24*8)) 30 | self._loci_list += ['AR_lags', 'EXO_lags'] 31 | 32 | def _get_transform(self): 33 | return arima.bitmapped_ar_ga 34 | 35 | 36 | if __name__ == '__main__': 37 | load_prediction.run(ARModelCreator) 38 | #load_prediction.run(ARBitmapModelCreator()) 39 | -------------------------------------------------------------------------------- /sg/models/load_prediction_ar24.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | import load_prediction_ar 13 | 14 | class ARHourByHourModelCreator(load_prediction_ar.ARModelCreator): 15 | def _get_transform(self): 16 | return arima.hourbyhour_ar_ga 17 | 18 | 19 | class ARHourByHourBitmapModelCreator(load_prediction_ar.ARBitmapModelCreator): 20 | def _get_transform(self): 21 | return arima.bitmapped_hourbyhour_ar_ga 22 | 23 | 24 | if __name__ == "__main__": 25 | load_prediction.run(ARHourByHourModelCreator) 26 | #load_prediction.run(ARHourByHourBitmapModelCreator()) 27 | -------------------------------------------------------------------------------- /sg/models/load_prediction_arima.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and AR/ARIMA predictor.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | 6 | import sg.utils 7 | import sg.utils.pyevolve_utils as pu 8 | from model import Model 9 | import arima 10 | import load_cleansing 11 | import load_prediction 12 | 13 | class ARIMAModelCreator(load_prediction.ModelCreator): 14 | def _add_transform_genes(self): 15 | """Sets up for evolution of the ARIMA model.""" 16 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # 'AR' backshift (p) 17 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # 'I' backshift (d) 18 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # 'MA' backshift (q) 19 | self._loci_list += ['AR_order', 'I_order', 'MA_order'] 20 | 21 | def 
_get_transform(self): 22 | return arima.arima_with_weather 23 | 24 | 25 | class SeasonalARIMAModelCreator(load_prediction.ModelCreator): 26 | def _add_transform_genes(self): 27 | """Sets up for evolution of a seasonal ARIMA model.""" 28 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # 'AR' backshift (p) 29 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # 'I' backshift (d) 30 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # 'MA' backshift (q) 31 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Seasonal 'AR' backshift (p) 32 | self._alleles.add(pu.make_choice_gene(1, [0, 1, 2])) # Seasonal 'I' backshift (d) 33 | self._alleles.add(pu.make_choice_gene(1, [1, 2, 3])) # Seasonal 'MA' backshift (q) 34 | self._loci_list += ['AR_order', 'I_order', 'MA_order', 35 | 'ssn_AR_order', 'ssn_I_order', 'ssn_MA_order'] 36 | 37 | def _get_transform(self): 38 | return arima.seasonal_arima_with_weather 39 | 40 | 41 | class AutoARIMAModelCreator(load_prediction.ModelCreator): 42 | def _add_transform_genes(self): 43 | """Sets up for evolution of the ARIMA model.""" 44 | pass 45 | 46 | def _get_transform(self): 47 | return arima.auto_arima_with_weather 48 | 49 | 50 | if __name__ == "__main__": 51 | load_prediction.run(ARIMAModelCreator) 52 | -------------------------------------------------------------------------------- /sg/models/load_prediction_averagedaily.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and predictor as daily or 24-hour averages.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import load_prediction 7 | import load_prediction_averagehourly as lpah 8 | 9 | def daily_average(data, genome, loci, prediction_steps): 10 | start = -prediction_steps - genome[loci.hindsight] 11 | end = -prediction_steps 12 | return pd.TimeSeries(data=data["Load"][start:end].mean(), 13 | index=data.index[-prediction_steps:]) 14 | 15 | 16 | class DailyAverageModelCreator(lpah.HourlyAverageModelCreator): 17 | def _get_transform(self): 18 | return daily_average 19 | 20 | 21 | if __name__ == "__main__": 22 | load_prediction.run(DailyAverageModelCreator) 23 | -------------------------------------------------------------------------------- /sg/models/load_prediction_averagehourly.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and predictor as daily or 24-hour averages.""" 2 | 3 | from pyevolve import GAllele 4 | import Oger 5 | import numpy as np 6 | import pandas as pd 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import load_prediction 12 | 13 | def hourly_average(data, genome, loci, prediction_steps): 14 | assert(prediction_steps == 24) 15 | start = -prediction_steps - genome[loci.hindsight] 16 | end = -prediction_steps 17 | avg_data = pd.DataFrame({"Load": data["Load"][start:end].copy()}) 18 | avg_data["Hour of day"] = [i.hour for i in avg_data.index] 19 | means = avg_data.groupby(["Hour of day"]).mean()["Load"] 20 | return pd.TimeSeries(data=means.values, 21 | index=data.index[-prediction_steps:]) 22 | 23 | class HourlyAverageModelCreator(load_prediction.ModelCreator): 24 | def _add_transform_genes(self): 25 | """Sets up for evolution of the ARIMA model.""" 26 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.1)) # Dummy to make 1D crossover work in Pyevolve 27 | self._loci_list += ['crossover_dummy'] 28 | 29 | def _get_transform(self): 30 | 
return hourly_average 31 | 32 | 33 | if __name__ == "__main__": 34 | load_prediction.run(HourlyAverageModelCreator) 35 | -------------------------------------------------------------------------------- /sg/models/load_prediction_dshw.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and double seasonal Holt 2 | Winters predictor.""" 3 | 4 | from pyevolve import GAllele 5 | import Oger 6 | 7 | import sg.utils 8 | import sg.utils.pyevolve_utils as pu 9 | from model import Model 10 | import arima 11 | import load_cleansing 12 | import load_prediction 13 | 14 | class DSHWModelCreator(load_prediction.ModelCreator): 15 | def _add_transform_genes(self): 16 | """Sets up for evolution of the DSHW model.""" 17 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # alpha 18 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # beta 19 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # gamma 20 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # omega 21 | self._alleles.add(pu.make_real_gene(1, 0, 1, .1), weight=1) # phi 22 | self._loci_list += ['alpha', 'beta', 'gamma', 'omega', 'phi'] 23 | 24 | def _get_transform(self): 25 | return arima.dshw 26 | 27 | 28 | class AutoDSHWModelCreator(load_prediction.ModelCreator): 29 | def _add_transform_genes(self): 30 | """Sets up for evolution of the DSHW model.""" 31 | pass 32 | 33 | def _get_transform(self): 34 | return arima.auto_dshw 35 | 36 | 37 | if __name__ == "__main__": 38 | load_prediction.run(DSHWModelCreator) 39 | #load_prediction.run(AutoDSHWModelCreator()) 40 | -------------------------------------------------------------------------------- /sg/models/load_prediction_esn.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and ESN predictor.""" 2 | 3 | import numpy as np 4 | import Oger 5 | 6 | 7 | import sg.utils 8 | import sg.utils.pyevolve_utils as pu 9 | from model import Model 10 | import esn 11 | import load_cleansing 12 | import load_prediction 13 | 14 | class ESNModelCreator(load_prediction.ModelCreator): 15 | def _add_transform_genes(self): 16 | """Sets up for evolution of the ESN model.""" 17 | self._alleles.add(pu.make_int_gene(1, 10, 500, 25), weight=1) # Network size 18 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.05), weight=1) # Leak rate 19 | self._alleles.add(pu.make_real_gene(1, 0.1, 0.75, 0.05), weight=1) # Input scaling 20 | self._alleles.add(pu.make_real_gene(1, 0, 1, 0.05), weight=1) # Bias scaling 21 | self._alleles.add(pu.make_real_gene(1, 0.5, 2, 0.05), weight=1) # Spectral radius 22 | # We don't want too many seeds per evolutions, but we don't want to 23 | # always evolve on the same 5 networks either: 24 | self._alleles.add(pu.make_choice_gene( 25 | 1, np.random.random_integers(0, 2**16, 5)), weight=1) # Seed 26 | # Grid optimization showed that for a training length of 336, with 27 | # other params set based on previous gridopts and operating on the 28 | # total dataset rather than single AMS'es, optimal ridge was ~5. Scaled 29 | # thus 5/336=0.015. 
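# [Editor's note -- hedged clarification, not part of the original file.] The
# intent of "scaling" appears to be that the ridge penalty handed to the
# reservoir readout should stay comparable across different hindsight
# (training-window) lengths, so the gene stores a per-hour value that is
# presumably multiplied back up by the actual window length inside the ESN
# transform:
#
#   effective_ridge ~= scaled_ridge_gene * hindsight_hours
#   e.g. 0.015 per hour * 336 hours ~= 5, the optimum found by the grid search.
#
# How the 0.0001 constant used in the gene below relates to the 5/336 figure
# quoted above is not stated in the source.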
30 |         self._alleles.add(pu.make_choice_gene(
31 |             1, [0.0001/self._max_hindsight_hours]), weight=1) # Scaled ridge
32 |         self._loci_list += ['size', 'leak', 'in_scale',
33 |                             'bias_scale', 'spectral', 'seed', 'ridge' ]
34 | 
35 |     def _get_transform(self):
36 |         return esn.feedback_with_external_input
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     load_prediction.run(ESNModelCreator)
41 | -------------------------------------------------------------------------------- /sg/models/load_prediction_esn24.py: --------------------------------------------------------------------------------
1 | """Evolve a load predictor with BSpline data cleansing and an hour-by-hour ESN predictor."""
2 | 
3 | import sg.utils.pyevolve_utils as pu
4 | import esn
5 | import load_prediction
6 | import load_prediction_esn
7 | 
8 | class ESNHourByHourModelCreator(load_prediction_esn.ESNModelCreator):
9 |     def _add_transform_genes(self):
10 |         """Sets up for evolution of the hour-by-hour ESN model."""
11 |         # The 24 hour lags.
12 |         gene = pu.make_choice_gene(1, [i for i in self._hindsight_days])
13 |         self._alleles.add(gene, weight=1)
14 |         self._loci_list += ['lags']
15 |         load_prediction_esn.ESNModelCreator._add_transform_genes(self)
16 | 
17 |     def _get_transform(self):
18 |         return esn.hourbyhour_esn_feedback_with_external_input_ga
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     load_prediction.run(ESNHourByHourModelCreator)
23 | -------------------------------------------------------------------------------- /sg/models/load_prediction_identity.py: --------------------------------------------------------------------------------
1 | """Evolve a load predictor with BSpline data cleansing and naive identity/null predictors."""
2 | 
3 | from pyevolve import GAllele
4 | import Oger
5 | import pandas as pd
6 | 
7 | import sg.utils
8 | import sg.utils.pyevolve_utils as pu
9 | from model import Model
10 | import load_cleansing
11 | import load_prediction
12 | 
13 | def identity_transformer(data, genome, loci, prediction_steps):
14 |     """This prediction model assumes tomorrow will be the same as today."""
15 |     return data["Load"][-prediction_steps*2:-prediction_steps].tshift(prediction_steps)
16 | 
17 | def null_transformer(data, genome, loci, prediction_steps):
18 |     """This prediction model assumes tomorrow will be entirely flat."""
19 |     return pd.TimeSeries(data=data["Load"][:-prediction_steps].mean(),
20 |                          index=data.index[-prediction_steps:])
21 | 
22 | class IdentityModelCreator(load_prediction.ModelCreator):
23 |     def _add_transform_genes(self):
24 |         """Sets up for evolution of a system without transformer."""
25 |         pass
26 | 
27 |     def _get_transform(self):
28 |         return identity_transformer
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     load_prediction.run(IdentityModelCreator)
33 | -------------------------------------------------------------------------------- /sg/models/load_prediction_regul_ar.py: --------------------------------------------------------------------------------
1 | '''Evolve a load predictor with regularized vector AR predictor.'''
2 | 
3 | import functools
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | 
8 | import sg.utils.pyevolve_utils as pu
9 | import load_prediction
10 | import regul_ar
11 | import arima
12 | 
13 | 
14 | class LinearRegularizedVectorARModelCreator(load_prediction.ModelCreator):
15 |     def _add_transform_genes(self):
16 |         '''Sets up for evolution of the regularized vector AR model.'''
17 |         self._alleles.add(pu.make_int_gene(1, 1, 8*24, 5))
18 |         self._alleles.add(pu.make_int_gene(1, 0, 8*24, 5))
19 |         self._add_lambda_gene()
20 |         self._loci_list += ['AR_order']
21 |         self._loci_list += 
['EXO_order'] 22 | self._loci_list += ['lambda_cont'] 23 | 24 | def _add_lambda_gene(self): 25 | self._alleles.add(pu.make_real_gene(1, 0, 9, 0.2)) 26 | 27 | def _lambda_mapper(self, lc_gene_val): 28 | return lc_gene_val 29 | 30 | def _transform(self, data, genome, loci, prediction_steps): 31 | lags_2d = arima.lags_from_order_ga(data, genome, loci) 32 | lambda_cont = self._lambda_mapper(genome[loci.lambda_cont]) 33 | x_start = max(-len(data), -genome[loci.hindsight] - prediction_steps) 34 | svp = regul_ar.SmoothVectorARPredictor( 35 | data[x_start:-prediction_steps].values, 36 | num_models=prediction_steps, 37 | lags_2d=lags_2d, 38 | relative_lags=True, 39 | add_bias=True, 40 | out_cols=[data.columns.tolist().index('Load')]) 41 | svp.estimate(lambda_cont=lambda_cont) 42 | prediction = svp.predict( 43 | exo_series=np.atleast_2d(data['Temperature'].ix[-prediction_steps:].values).T, 44 | prediction_steps=prediction_steps) 45 | return pd.TimeSeries(data=prediction[:,0], index=data[-prediction_steps:].index) 46 | 47 | def _get_transform(self): 48 | return functools.partial(type(self)._transform, self) 49 | 50 | 51 | class LogRegularizedVectorARModelCreator(LinearRegularizedVectorARModelCreator): 52 | def _add_lambda_gene(self): 53 | self._alleles.add(pu.make_int_gene(1, 0, 1e6, 100)) 54 | 55 | def _lambda_mapper(self, lc_gene_val): 56 | return (np.power(10, lc_gene_val) - 1) / 1e3 57 | 58 | 59 | class RegularizedVanillaModelCreator(load_prediction.ModelCreator): 60 | def __init__(self, *args, **kwargs): 61 | load_prediction.ModelCreator.__init__(self, *args, **kwargs) 62 | self._warning_printed = False 63 | 64 | def _add_transform_genes(self): 65 | '''Sets up for evolution of the regularized vanilla benchmark model.''' 66 | self._alleles.add(pu.make_int_gene(1, 0, 1e6, 100)) 67 | self._loci_list += ['lambda_cont'] 68 | 69 | def _transform(self, data, genome, loci, prediction_steps): 70 | if not self._warning_printed: 71 | print 'Hindsight genome ignored, using all available data in Vanilla model.' 
72 | self._warning_printed = True 73 | svp = regul_ar.VanillaVectorPredictor(data[:-prediction_steps]) 74 | svp.estimate(lambda_cont=genome[loci.lambda_cont]) 75 | return svp.predict(data[-prediction_steps:]) 76 | 77 | def _get_transform(self): 78 | return functools.partial(type(self)._transform, self) 79 | 80 | 81 | if __name__ == '__main__': 82 | load_prediction.run(LogRegularizedVectorARModelCreator) 83 | -------------------------------------------------------------------------------- /sg/models/load_prediction_taohong.py: -------------------------------------------------------------------------------- 1 | import load_prediction 2 | import taohong 3 | 4 | class VanillaModelCreator(load_prediction.ModelCreator): 5 | def _add_transform_genes(self): 6 | pass 7 | 8 | def _get_transform(self): 9 | return taohong.vanilla 10 | 11 | 12 | if __name__ == '__main__': 13 | load_prediction.run(VanillaModelCreator) 14 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletModelCreator(load_prediction.ModelCreator): 16 | def _add_transform_genes(self): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Scale 21 | self._alleles.add(pu.make_choice_gene(1, [2])) # Aj, in the paper 2 gives best results. 22 | self._loci_list += ['scale', 'Aj'] 23 | 24 | def _get_transform(self): 25 | #return wavelet.linear_prediction 26 | #return wavelet.linear_vector 27 | #return wavelet.vector_multiscale_prediction 28 | #return wavelet.iterative_multiscale_prediction 29 | return wavelet.multiscale_prediction 30 | 31 | 32 | if __name__ == "__main__": 33 | load_prediction.run(WaveletModelCreator) 34 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet.py.orig: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.genemapper as gm 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletModelCreator(load_prediction.ModelCreator): 16 | def get_model(self, options): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | 21 | alleles = GAllele.GAlleles() 22 | alleles.add(gm.MappedAlleleList(range(1,11))) # Scale 23 | alleles.add(gm.MappedAlleleList([2])) # Aj, in the paper 2 gives best results. 24 | alleles.add(gm.MappedAlleleList([ 2**i for i in range(4,12)])) # Train length. 25 | 26 | # For ESN training of predictor. 
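# [Editor's note -- illustration only; see sg/utils/genemapper.py later in this
# listing for the actual implementation.] The gm.MappedAllele* classes take a
# real-valued gene, normalize it to [0, 1] over the genome's global range, and
# map it onto the allele's own range or option list:
#
#   mapped = begin + gene_norm * (end - begin)                    # 'linear' scaling
#   mapped = begin + exp(gene_norm * log(1 + end - begin)) - 1    # 'log' scaling
#
# List alleles pick options[int(gene_norm * len(options))], and integer range
# alleles round the mapped value to the nearest int.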
27 | alleles.add(gm.MappedAlleleRange(10, 500)) # Network size, 1 28 | alleles.add(gm.MappedAlleleRange(0, 2, real=True)) # Leak rate, 2 29 | alleles.add(gm.MappedAlleleRange(0.1, 0.75, real=True)) # Input scaling, 3 30 | alleles.add(gm.MappedAlleleRange(0, 1, real=True)) # Bias scaling, 4 31 | alleles.add(gm.MappedAlleleRange(0.5, 2, real=True)) # Spectral radius, 5 32 | bucket_seed = random.randrange(1, 2**16) 33 | alleles.add(gm.MappedAlleleRange(bucket_seed, bucket_seed + 5)) # Seed, 6 34 | alleles.add(gm.MappedAlleleList([0.0001/336])) # Scaled ridge, 7 35 | 36 | if not options.no_cleaning: 37 | alleles.add(gm.MappedAlleleRange(0.001, 800, real=True, scaling='log')) 38 | alleles.add(gm.MappedAlleleRange(0.001, 800, real=True, scaling='log')) 39 | alleles.add(gm.MappedAlleleRange(0.1, 3, real=True)) 40 | alleles.add(gm.MappedAlleleRange(0.1, 3, real=True)) 41 | 42 | loci = sg.utils.Enum('scale', 'Aj', 'hindsight', 43 | 'size', 'leak', 'in_scale', 44 | 'bias_scale', 'spectral', 'seed', 'ridge', 45 | 't_smooth', 'l_smooth', 't_zscore', 'l_zscore') 46 | 47 | return Model(genes=alleles, error_func=Oger.utils.nrmse, 48 | train_and_predict_func=wavelet.multiscale_prediction, 49 | clean_func=load_cleansing.bspline_clean_dataset, loci=loci) 50 | 51 | if __name__ == "__main__": 52 | load_prediction.run(WaveletModelCreator()) 53 | -------------------------------------------------------------------------------- /sg/models/load_prediction_wavelet24.py: -------------------------------------------------------------------------------- 1 | """Evolve a load predictor with BSpline data cleansing and a wavelet predictor.""" 2 | 3 | import random 4 | 5 | from pyevolve import GAllele 6 | import Oger 7 | 8 | import sg.utils 9 | import sg.utils.pyevolve_utils as pu 10 | from model import Model 11 | import wavelet 12 | import load_cleansing 13 | import load_prediction 14 | 15 | class WaveletHourByHourModelCreator(load_prediction.ModelCreator): 16 | def _add_transform_genes(self): 17 | """This is where the models are defined. The models are passed to the 18 | GA engine for evolution of the optimal set of parameters. Afterwards, 19 | the models are tested, and performance is measured.""" 20 | 21 | self._alleles.add(pu.make_int_gene(1, 1, 10, 1)) # Scale 22 | self._alleles.add(pu.make_choice_gene(1, [2])) # Aj, in the paper 2 gives best results. 
23 | gene = pu.make_choice_gene(1, [i for i in self._hindsight_days]) 24 | self._alleles.add(gene, weight=1) 25 | 26 | if options.no_cleaning: 27 | loci = sg.utils.Enum('scale', 'Aj') 28 | else: 29 | loci = sg.utils.Enum('scale', 'Aj', 't_smooth', 30 | 'l_smooth', 't_zscore', 'l_zscore') 31 | 32 | 33 | def _get_transform(self): 34 | return wavelet.hourbyhour_multiscale_prediction_ga 35 | 36 | 37 | if __name__ == "__main__": 38 | load_prediction.run(WaveletHourByHourModelCreator) 39 | -------------------------------------------------------------------------------- /sg/models/model.py: -------------------------------------------------------------------------------- 1 | class Model(object): 2 | """A class that holds all the properties necessary for a model to 3 | be employed in the GA search for optimal parameters.""" 4 | 5 | def __init__(self, name, genes, error_func, transformer, loci): 6 | self._name = name 7 | self._genes = genes 8 | self._error_func = error_func 9 | self._transformer = transformer 10 | self._loci = loci 11 | self._dataset = None 12 | self._day = None 13 | self._preprocessors = None 14 | self._postprocessors = None 15 | 16 | @property 17 | def name(self): 18 | return self._name 19 | 20 | def get_day(self): 21 | return self._day 22 | def set_day(self, day): 23 | self._day = day 24 | day = property(get_day, set_day) 25 | 26 | def get_loci(self): 27 | return self._loci 28 | def set_loci(self, loci): 29 | self._loci = loci 30 | loci = property(get_loci, set_loci) 31 | 32 | def get_genes(self): 33 | return self._genes 34 | def set_genes(self, genes): 35 | self._genes = genes 36 | genes = property(get_genes, set_genes) 37 | 38 | def get_error_func(self): 39 | return self._error_func 40 | def set_error_func(self, error_func): 41 | self._error_func = error_func 42 | error_func = property(get_error_func, set_error_func) 43 | 44 | def get_preprocessors(self): 45 | return self._preprocessors 46 | def set_preprocessors(self, preprocessors): 47 | self._preprocessors = preprocessors 48 | preprocessors = property(get_preprocessors, set_preprocessors) 49 | 50 | def get_transformer(self): 51 | return self._transformer 52 | def set_transformer(self, transformer): 53 | self._transformer = transformer 54 | transformer = property(get_transformer, set_transformer) 55 | 56 | def get_postprocessors(self): 57 | return self._postprocessors 58 | def set_postprocessors(self, postprocessors): 59 | self._postprocessors = postprocessors 60 | postprocessors = property(get_postprocessors, set_postprocessors) 61 | 62 | def get_genome(self): 63 | return self._genome 64 | def set_genome(self, genome): 65 | self._genome = genome 66 | genome = property(get_genome, set_genome) 67 | 68 | def get_dataset(self): 69 | return self._dataset 70 | def set_dataset(self, dataset): 71 | self._dataset = dataset 72 | dataset = property(get_dataset, set_dataset) 73 | -------------------------------------------------------------------------------- /sg/models/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/model.pyc -------------------------------------------------------------------------------- /sg/models/onemax_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | 5 | import numpy as np 6 | from mpi4py import MPI 7 | 8 | pop_size = 512*10 9 | genome_length = 100 10 | generations = 
250 11 | mutation_rate = 0.1 12 | 13 | comm = MPI.COMM_WORLD 14 | nhosts = comm.Get_size() 15 | rank = comm.Get_rank() 16 | 17 | 18 | def evolve(): 19 | population = np.random.randint(2, size=(pop_size, genome_length)).astype('float') 20 | rest_time = None 21 | for gen in range(generations): 22 | eval_start = time.time() 23 | fitnesses = evaluate(population) 24 | eval_time = time.time() - eval_start 25 | rest_start = time.time() 26 | print_stats(gen, population, fitnesses, eval_time, rest_time) 27 | reproduce(population, fitnesses) 28 | rest_time = time.time() - rest_start 29 | 30 | def eval_loop(): 31 | for gen in range(generations): 32 | evaluate(None) 33 | 34 | def eval_local(population): 35 | fitnesses = np.empty(len(population)) 36 | target = np.arange(1, genome_length+1) 37 | for idx in range(population.shape[0]): 38 | fitnesses[idx] = -np.abs((population[idx,:] - target)).sum() 39 | return fitnesses 40 | 41 | def evaluate_ndarray(population=None): 42 | indices = np.linspace(0, pop_size, nhosts+1).astype('int') 43 | displs = indices[:-1] 44 | sendcounts = indices[1:] - displs 45 | if rank == 0: 46 | sendbuf = (population, np.array(sendcounts) * genome_length, 47 | np.array(displs) * genome_length, MPI.DOUBLE) 48 | recvbuf = (np.empty(pop_size), sendcounts, displs, MPI.DOUBLE) 49 | else: 50 | sendbuf = None 51 | recvbuf = None 52 | 53 | indivs = np.empty(sendcounts[rank] * genome_length) 54 | fitnesses = np.empty(sendcounts[rank]) 55 | 56 | comm.Scatterv(sendbuf,indivs) 57 | indivs.shape = (len(indivs)/genome_length, genome_length) 58 | 59 | fitnesses = eval_local(indivs) 60 | 61 | comm.Gatherv(fitnesses, recvbuf) 62 | if rank == 0: 63 | return recvbuf[0] 64 | 65 | def evaluate_pickle(population=None): 66 | if rank == 0: 67 | indices = np.linspace(0, pop_size, nhosts+1).astype('int') 68 | starts = indices[:-1] 69 | ends = indices[1:] 70 | scattered = [population[s:e,:] for s,e in zip(starts, ends)] 71 | else: 72 | scattered = None 73 | indivs = comm.scatter(scattered) 74 | fitnesses = eval_local(indivs) 75 | all_fitnesses = comm.gather(fitnesses) 76 | if rank==0: 77 | return np.concatenate(all_fitnesses) 78 | 79 | evaluate = evaluate_ndarray 80 | #evaluate = evaluate_pickle 81 | 82 | def print_stats(gen, population, fitnesses, eval_time, rest_time): 83 | if rest_time is None: 84 | timetxt = "%.4f" % eval_time 85 | else: 86 | timetxt = "%.4s/%.4s" % (eval_time, rest_time) 87 | print "Generation %d in %s: Fitnesses %.2f/%.2f/%.2f. 
Best indiv:" \ 88 | % (gen, timetxt, fitnesses.min(), fitnesses.mean(), fitnesses.max()) 89 | print population[fitnesses.argmax(),:] 90 | 91 | def mutate(indiv): 92 | for i in range(len(indiv)): 93 | if random.random() < mutation_rate: 94 | indiv[i] = indiv[i] + 1 if random.random() < 0.5 else indiv[i] - 1 95 | 96 | def reproduce(population, fitnesses): 97 | best = population[fitnesses.argmax(),:] 98 | for idx in range(pop_size): 99 | population[idx] = best 100 | mutations = np.where(np.random.random((pop_size, genome_length)) < mutation_rate) 101 | mutvals = np.random.randint(low=-1, high=2, size=len(mutations[0])) 102 | population[mutations] += mutvals 103 | #mutate(population[idx]) 104 | 105 | if __name__ == "__main__": 106 | if rank == 0: 107 | evolve() 108 | else: 109 | eval_loop() 110 | 111 | -------------------------------------------------------------------------------- /sg/models/roughness.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/roughness.tex -------------------------------------------------------------------------------- /sg/models/run_experiments.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import os 3 | import socket 4 | 5 | from sg.data.sintef import tempfeeder_exp 6 | from sg.utils.timer import SimpleTimer 7 | 8 | import run_experiments_params as params 9 | 10 | def run_one_wrapper(arg): 11 | reload(params) 12 | params.run_one(arg) 13 | 14 | def make_runs(user_ids, num_runs): 15 | """Create a list of (user_id, run_number) pairs that can be sent via 16 | pool.map to the run_one function.""" 17 | return [(user, run) for user in user_ids for run in range(num_runs)] 18 | 19 | def run_simulations(runs): 20 | """Run all the simulations provided in runs by sending them on to the 21 | run_one function.""" 22 | num_parallel_processes = 12 23 | pool = Pool(processes=num_parallel_processes) 24 | pool.map(run_one_wrapper, runs, chunksize=1) 25 | 26 | if __name__ == "__main__": 27 | # if socket.gethostname() == "tanzenmusik.idi.ntnu.no": 28 | # user_ids = tempfeeder_exp().user_ids[25:50] 29 | # else: 30 | # user_ids = tempfeeder_exp().user_ids[0:25] 31 | 32 | user_ids = [tempfeeder_exp().user_ids[0]] 33 | num_runs = 12 34 | 35 | print "Master pid is %d " % os.getpid() 36 | timer = SimpleTimer(output_stream=None) 37 | tempfeeder_exp().close() 38 | runs = make_runs(user_ids, num_runs) 39 | run_simulations(runs) 40 | print "All simulations complete. %s" % timer.end() 41 | tempfeeder_exp().close() 42 | -------------------------------------------------------------------------------- /sg/models/run_experiments_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | from sg.globals import SG_MODELS_PATH 6 | from sg.utils.timer import SimpleTimer 7 | from sg.globals import SG_SIM_PATH 8 | 9 | # This function is defined in a separate file, so the main runner can reload it 10 | # before each launch. This allows us to adjust parameters "on the fly". 11 | def run_one(arg): 12 | """Run one evolution. Arg is a tuple containing user ID and run number.""" 13 | user_id, run_number = arg 14 | # Note that the PID printed below is the PID in which this function is 15 | # running, which is different from the PID of the evolution. 
16 | print "Launching evolution for user %d run %d (pid %d) at %s..." % \ 17 | (user_id, run_number, os.getpid(), time.asctime()) 18 | sys.stdout.flush() 19 | 20 | timer = SimpleTimer(output_stream=None) 21 | out_dir = os.path.join(SG_SIM_PATH, "id_%d" % user_id) 22 | model = os.path.join(SG_MODELS_PATH, "load_prediction.py") 23 | postfix = "run_%d" % run_number 24 | generations = 50 25 | pop_size = 400 26 | mutation = 0.05 27 | crossover = 0.5 28 | # NB Total-load sims: 29 | total = " --total-load" 30 | data_seed = 12 31 | 32 | stdout_path = os.path.join(out_dir, 33 | "output_run_%d.txt" % run_number) 34 | os.system("test -d %s || mkdir -p %s" % (out_dir, out_dir)) 35 | os.system("python %s " % model + \ 36 | " --userid=%d" % user_id + \ 37 | " --out-dir=%s --out-postfix=%s " % (out_dir, postfix) + \ 38 | " --generations=%d --pop-size=%d " % (generations, pop_size) + \ 39 | " --mutation=%f --crossover=%f " % (mutation, crossover) + \ 40 | " --no-show-plot --save-plot " + \ 41 | total + \ 42 | " --data-seed=%d " % data_seed + \ 43 | " >%s" % stdout_path) 44 | 45 | print "Evolution completed for user %d run %d. %s" \ 46 | % (user_id, run_number, timer.end()) 47 | sys.stdout.flush() 48 | -------------------------------------------------------------------------------- /sg/models/spclean.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/spclean.pyc -------------------------------------------------------------------------------- /sg/models/spclean_wrapper.py: -------------------------------------------------------------------------------- 1 | """Demonstrate the cleansing algorithm on datasets of varying length.""" 2 | 3 | import sys 4 | import time 5 | from datetime import timedelta as dt 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import sg.data.sintef.userloads as ul 12 | import spclean as cln 13 | from sg.utils.timer import SimpleTimer 14 | import splines as sp 15 | 16 | import array 17 | import ctypes 18 | 19 | from ctypes import cdll 20 | from ctypes import c_double 21 | 22 | # Load a dataset containing power load history. This set is divided into 23 | # training and test data, we only keep the traning part for now. 24 | 25 | def _get_smoother(): 26 | # Set slow_smoother to True in order to see the actual time consumed by the 27 | # B-spline smoothing operation. If set to False, will use the default 28 | # smoother where the roughness matrices are cached. 29 | slow_smoother = True 30 | if slow_smoother: 31 | #print "Using slow, analytic, non-caching smoother." 32 | return cln.BSplineAnalyticSmoother 33 | else: 34 | #print "Using not quite so slow, caching smoother." 
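# [Editor's note -- hedged sketch. spclean.py itself is not part of this
# listing, so this is an assumption about what "caching the roughness
# matrices" means, not the project's actual implementation.] The spline
# roughness (penalty) matrix depends only on the knot vector, i.e. effectively
# on the series length, so a caching smoother can memoize it per length and
# reuse it for every equally long window it cleans, roughly:
#
#   _roughness_cache = {}
#   def roughness_matrix(n_knots):                 # hypothetical helper
#       if n_knots not in _roughness_cache:
#           _roughness_cache[n_knots] = build_roughness_matrix(n_knots)
#       return _roughness_cache[n_knots]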
35 | return cln.BSplineSmoother 36 | 37 | ds_array = 0 38 | kn_array = 0 39 | 40 | class BsplineFastSmoother(object): 41 | def __init__(self, data, smoothness, zscore): 42 | #create knot vector 43 | knots = sp.get_uniform_knots_from_points(data, degree, knotrange=(0, len(data) - 1)) 44 | 45 | #determine datasize 46 | n_data = len(data) 47 | n_knot = len(knots) 48 | 49 | #create a pointer to the dataset 50 | ds = np.array(data) 51 | ds_type = c_double*n_data 52 | ds_array = ds_type(*ds) 53 | 54 | #create a pointer to the knots 55 | kn = np.array(knots) 56 | kn_type = c_double*n_knot 57 | kn_array = kn_type(*kn) 58 | 59 | #number of threads 60 | 61 | self._lib = cdll.LoadLibrary('lib_mkl/libspclean.so') 62 | self.obj = self._lib.Smoother_new(ds_array, n_data, kn_array, n_knot, degree, c_double(smoothness), c_double(zscore)) 63 | 64 | 65 | def __del__(self): 66 | self._lib.Smoother_delete(self.obj) 67 | 68 | def bsm_cleanData(self): 69 | return self._lib.bsm_cleanData(self.obj) 70 | 71 | def bsm_smoothedData(self): 72 | return self._lib.bsm_smoothedData(self.obj) 73 | 74 | 75 | # load data 76 | dataset, _ = ul.total_experiment_load() 77 | 78 | # Set parameters for the B-spline smoother/cleanser 79 | degree = 3 80 | smoothness = 100.0 81 | zscore = 1.0 82 | 83 | # Try smoothing/cleansing different time series lengths 84 | for hindsight_days in [1]: 85 | # Select data 86 | num_hours = 24 * hindsight_days 87 | data = dataset["Load"][-num_hours:].copy() 88 | 89 | #determine datasize 90 | n_data = len(data) 91 | 92 | # Some output and rough timing 93 | #print "Cleansing %d hours of data with smoothness %.2f, z-score %.2f..." % \ 94 | # (num_hours, smoothness, zscore) 95 | #sys.stdout.flush() 96 | start_time = time.time() 97 | 98 | # This is the part that takes time 99 | #smoother = _get_smoother()(data, smoothness) 100 | #cleaner = cln.RegressionCleaner(smoother, zscore) 101 | #cleaned, _ = cleaner.get_cleaned_data(method=cln.RegressionCleaner.replace_with_bound) 102 | 103 | #call cpp smpline object and get the result 104 | sm = BsplineFastSmoother(data, smoothness, zscore) 105 | res = sm.bsm_cleanData() 106 | 107 | # Wrap up and plot the result 108 | end_time = time.time() 109 | 110 | #convert the pointer to nparray 111 | # ArrayType = ctypes.c_double*n_data 112 | # array_pointer = ctypes.cast(res, ctypes.POINTER(ArrayType)) 113 | # cleaned_data = np.frombuffer(array_pointer.contents, dtype=np.double) 114 | 115 | # print "Done in %s." % SimpleTimer.period_to_string(start_time, end_time) 116 | # sys.stdout.flush() 117 | 118 | # res = sm.bsm_smoothedData() 119 | 120 | #convert the pointer to nparray 121 | # ArrayType = ctypes.c_double*n_data 122 | # array_pointer = ctypes.cast(res, ctypes.POINTER(ArrayType)) 123 | # print "Getting smoothed data..." 124 | # sys.stdout.flush() 125 | # smoothed_data = np.frombuffer(array_pointer.contents, dtype=np.double) 126 | print "Got smoothed data..." 127 | # sys.stdout.flush() 128 | 129 | # print data 130 | # print cleaned_data 131 | # print smoothed_data 132 | 133 | # plt.figure() 134 | # data.plot(style='b', label="Raw data") 135 | # print "Creating time series from smoothed data..." 136 | # sys.stdout.flush() 137 | # smoothed_series = pd.TimeSeries(data=smoothed_data, index=data.index) 138 | # print "Plotting smoothed series..." 139 | # sys.stdout.flush() 140 | # smoothed_series.plot(style='r', label="Smoothed data") 141 | # print "Done plotting smoothed series." 
142 | # sys.stdout.flush() 143 | # plt.legend() 144 | # plt.show() 145 | # data.plot(style='r', label='Raw load') 146 | # cleaned_data.plot(style='b', label='Cleaned load') 147 | # spline = pd.TimeSeries(data=smoother.splev(range(len(cleaned))), index=cleaned.index) 148 | # spline.plot(style='g', label='Smoothing spline') 149 | # plt.legend(loc=3) 150 | 151 | #plt.savefig('cfig.pdf') 152 | -------------------------------------------------------------------------------- /sg/models/splines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/splines.pyc -------------------------------------------------------------------------------- /sg/models/static.py: -------------------------------------------------------------------------------- 1 | import Oger, mdp, pdb 2 | import numpy as NP 3 | 4 | class StaticNode(Oger.nodes.ReservoirNode): 5 | """ Extends the Reservoir node for static classification by letting the inner dynamics of the reservoir settle before the final state of that timestep is 6 | stored. 7 | 8 | Note: in the original paper, the transfer function is not used. It is not clear why one shouldn't use the tanh function, 9 | however, this is not hardcoded. Use the identity function as an input parameter if this behaviour is desired. It seems to me that 10 | the network performs better when the tanh transfer function is used. 11 | 12 | Author: Axel Tidemann 13 | """ 14 | def _execute(self, x): 15 | """ Executes simulation with input vector x. 16 | """ 17 | steps = x.shape[0] 18 | 19 | # Pre-allocate the state vector, adding the initial state. All zeros. 20 | states = mdp.numx.zeros((steps, self.output_dim)) 21 | 22 | # A vector to store how many steps were needed to stabilize the reservoir. 23 | stabilize = mdp.numx.zeros(steps) 24 | 25 | # Loop over the input data and compute the reservoir states. 26 | for n in range(steps): 27 | # Let the reservoir stabilize before collection. 28 | previous_state = states[n,:] 29 | current_state = self.nonlin_func(mdp.numx.dot(self.w, states[n, :]) + mdp.numx.dot(self.w_in, x[n, :]) + self.w_bias) 30 | 31 | i = 0 32 | # We continue until a 0.1% change. Formula taken from Wikipedia for % difference (not percent error). 33 | while abs(NP.sum(previous_state - current_state))/max(abs(NP.sum(previous_state)), abs(NP.sum(current_state))) > 0.001: 34 | previous_state = current_state 35 | # Added flattening of previous_state in the following line, 2012-07-18. Somehow the transposing did not happen before, or 36 | # maybe MDP was more tolerant. 37 | current_state = self.nonlin_func(mdp.numx.dot(self.w, NP.ndarray.flatten(previous_state)) + mdp.numx.dot(self.w_in, x[n, :]) + self.w_bias) 38 | i += 1 39 | 40 | stabilize[n] = i 41 | states[n, :] = current_state 42 | self._post_update_hook(states, x, n) 43 | 44 | print 'StaticNode: Steps to stabilize the reservoir (avg std min max)', NP.average(stabilize), NP.std(stabilize), min(stabilize), max(stabilize) 45 | 46 | #print NP.max(states), NP.min(states), NP.average(states) 47 | 48 | return states 49 | 50 | 51 | ##### Testing ##### 52 | if __name__ == "__main__": 53 | 54 | #Generate random vectors 55 | NP.random.seed() 56 | x = NP.random.randn(100,20) # (number of cases, number of features) 57 | #Generate target vector - one vector for each case. 
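# [Editor's note -- descriptive comment.] NP.eye(100) below builds a 100x100
# identity matrix, i.e. one one-hot target row per random input case, so the
# reservoir + ridge readout is effectively trained as a 100-way classifier and
# the check at the bottom counts how often the most active output unit matches
# the correct case.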
58 | y = NP.eye(100) 59 | 60 | #Create ESN 61 | reservoir = StaticNode(input_dim = x.shape[1], output_dim = 20, spectral_radius = 0.55) #Too large reservoir -> trouble. 62 | readout = Oger.nodes.RidgeRegressionNode() 63 | 64 | flow = mdp.hinet.FlowNode(reservoir + readout) 65 | flow.train(x, y) 66 | flow.stop_training() 67 | 68 | ytest = flow(x) 69 | 70 | # See how well the classification works, e.g. if the highest activated output node is the correct one. 71 | c = 0 72 | for i in range(y.shape[0]): 73 | if NP.argmax(ytest[i,:]) == NP.argmax(y[i,:]): 74 | c += 1 75 | 76 | print 'Absolute error:', NP.mean(ytest - y), 'Classfication rate:', 100*c/y.shape[0], '%' 77 | 78 | -------------------------------------------------------------------------------- /sg/models/static.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/static.pyc -------------------------------------------------------------------------------- /sg/models/subset_runs/make-runfiles.sh: -------------------------------------------------------------------------------- 1 | bindir="$HOME/SmartGrid/src/sg/models" 2 | rundir="$bindir/subset_runs" 3 | outdir="$rundir/evo_output_files" 4 | 5 | generations=10 6 | popsize=100 7 | 8 | evocmd="python $bindir/load_prediction_arima.py --out-dir=$outdir --out-postfix=|postfix| --generations=$generations --pop-size=$popsize --mutation=0.2 --crossover=0.5 --mutation-sigma=10 --no-plot --elite=0 --num-trials=7 --env-replace=3 --data-seed=|dataseed| --no-cleaning --parallel --user-subset=|numberofusers|" 9 | 10 | max_subset_size=150 11 | runs_per_subset=10 12 | 13 | runfile_path_base="$rundir/run-subset-" 14 | rm ${runfile_path_base}* 15 | 16 | num_subset_files_created=0 17 | for (( subset=1; $subset<${max_subset_size}; subset=$subset+1 )); do 18 | runfile=${runfile_path_base}$subset.sh 19 | let num_subset_files_created=${num_subset_files_created}+1 20 | cat >$runfile <\$outputfile 58 | for (( run=0; \$run<${runs_per_subset}; run=\$run+1 )); do 59 | cmd="\`echo \$evocmd | sed -e\"s/|postfix|/subset_${subset}_/; s/|dataseed|/\$run/\; s/|numberofusers|/${subset}/"\`" 60 | echo "Launching \$cmd..." 61 | rmse=\`\$cmd 2>/dev/null | tail -n 3 | head -n 1 |awk '{print \$NF}'\` 62 | echo "\$cmd; $subset; \$run; \$rmse" >>\$outputfile 63 | echo "Done with run \$run for subset size $subset." 64 | done 65 | EOF 66 | chmod u+x $runfile 67 | done 68 | 69 | echo "Made ${num_subset_files_created} from 1 to ${max_subset_size} files with ${runs_per_subset} runs per subset, evolving a population of $popsize individuals over $generations generations." -------------------------------------------------------------------------------- /sg/models/taohong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def vanilla(data, genome, loci, prediction_steps, spinup=0): 5 | """ Tao Hong's Vanilla Benchmark method, as described in 6 | "A Naive Multipe Linear Regression Benchmark for Short Term Load 7 | Forecasting" (Hong, 2011) 8 | 9 | Note: this model is specifically built for hourly based predictions, 10 | and will not work properly otherwise.""" 11 | 12 | temps = data.Temperature 13 | 14 | num_params = 2 + 7*24 + 4*12 + 3*24 15 | print 'Created model with', num_params, 'parameters.' 
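# [Editor's note -- worked arithmetic for num_params above.] The count breaks
# down as 2 (intercept + linear trend) + 7*24 = 168 (day-of-week x hour-of-day
# dummies) + 4*12 = 48 (month dummies plus month-wise T, T^2, T^3 terms)
# + 3*24 = 72 (hour-wise T, T^2, T^3 terms), i.e. 2 + 168 + 48 + 72 = 290
# coefficients, which matches the design matrix `a` filled in below and solved
# with np.linalg.lstsq.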
16 | a = np.zeros((len(data), num_params)) 17 | 18 | for i in range(a.shape[0]): 19 | day, hour, month = temps.index[i].dayofweek, temps.index[i].hour, temps.index[i].month 20 | month -= 1 21 | tmp = temps[i] 22 | trend = (temps.index[i].value - temps.index[0].value)/(3600*10**9) + 1 23 | a[i, 0:2] = [ 1, trend] 24 | offset = 2 25 | a[i, offset + day*hour] = 1 26 | offset += 7*24 27 | a[i, offset + month] = 1 28 | offset += 12 29 | a[i, offset + month] = tmp 30 | offset += 12 31 | a[i, offset + month] = tmp**2 32 | offset += 12 33 | a[i, offset + month] = tmp**3 34 | offset += 12 35 | a[i, offset + hour] = tmp 36 | offset += 24 37 | a[i, offset + hour] = tmp**2 38 | offset += 24 39 | a[i, offset + hour] = tmp**3 40 | assert(offset + 24 == num_params) 41 | x,_,_,_ = np.linalg.lstsq(a[:-prediction_steps], data.Load[:-prediction_steps]) 42 | 43 | return pd.Series(data=np.dot(a[-prediction_steps:],x), 44 | index=data.index[-prediction_steps:]) 45 | 46 | -------------------------------------------------------------------------------- /sg/models/test_esn.py: -------------------------------------------------------------------------------- 1 | """Early attempt. Be patient.""" 2 | 3 | from datetime import timedelta as dt 4 | import math 5 | import random 6 | 7 | import numpy as np 8 | import Oger, mdp 9 | import matplotlib.pyplot as plt 10 | import scikits.timeseries as ts 11 | 12 | import esn 13 | import sg.utils 14 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 15 | import sg.data.sintef.userloads as ul 16 | import load_prediction 17 | 18 | user_id = 55864860 19 | 20 | (dataset, test) = load_prediction.prepare_datasets(user_id, True) 21 | 22 | day = 24 23 | today = random.randint(1000, dataset.shape[0]-day*2) 24 | today = 4600 25 | 26 | 27 | # [len_data, res_size, leak, input, bias, spectral, 28 | # seed, ridge, tmp_sm, load_sm] 29 | genome = [336, 500, 0.1, 0.5, 0.5, 0.9, 1000, 0.0001, 10, 10] 30 | genome = [168, 360, 0.1370736370770198, 1.322886484520891, 0.3211445098985698, 31 | 0.9492725784817237, 42979, 0.043436305850920925, 93, 52, 32 | 1.3053755202564812, 0.5905128791783507] 33 | 34 | alleles.loci = sg.utils.enum('hindsight', 'size', 'leak', 'in_scale', 35 | 'bias_scale', 'spectral', 'seed', 'ridge', 36 | 't_smooth', 'l_smooth', 't_zscore', 'l_zscore') 37 | 38 | test = sg.utils.Normalizer(dataset[today-genome[0]:today+day,:], axis=0) 39 | 40 | ytest = esn.feedback_with_external_input(test.normalized, genome, day) 41 | 42 | print Oger.utils.nrmse(ytest[-day:], test.normalized[-day:,1]) 43 | 44 | plt.figure() 45 | plt.plot(test.normalized[:,1], label="Input loads") 46 | offset = len(test.raw) - genome[0] 47 | plt.plot(range(offset, offset + len(ytest)), ytest, label="Prediction") 48 | plt.show() 49 | 50 | # ytest.shape = (len(ytest), 1) 51 | # ytest = test.expand(np.concatenate((ytest, ytest), axis=1))[:,1] 52 | 53 | # print sg.utils.mape(ytest[-day:], test.raw[-day:,1]) 54 | 55 | # out_series = ts.time_series(data=ytest, dates=loads[524:1000].dates) 56 | # sg.utils.plot_time_series([loads[524:1000], out_series], 57 | # ["r-", "g-"], ["Loads", "Prediction"]) 58 | 59 | -------------------------------------------------------------------------------- /sg/models/test_sequence_scan.py: -------------------------------------------------------------------------------- 1 | # The dumbest form of similar sequence retrieval: sequential scan. To see if there actually 2 | # are any similar sequences. 
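# [Editor's note -- a hedged, self-contained sketch of the same sequential-scan
# idea as a reusable helper; it is not part of the original script. _zscore and
# _nrmse are illustrative stand-ins for sg.utils.Normalizer and
# Oger.utils.nrmse used further down, not the project's implementations.]
import numpy as np

def _zscore(x):
    # Normalize a window to zero mean and unit variance so only shape matters.
    x = np.asarray(x, dtype=float)
    return (x - x.mean()) / x.std()

def _nrmse(pred, target):
    # Root-mean-square error scaled by the target's standard deviation.
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)
    return np.sqrt(np.mean((pred - target) ** 2)) / target.std()

def most_similar_window(series, candidate, window):
    # Scan every length-`window` slice of `series` and return the start index
    # of the slice whose normalized shape is closest to the normalized candidate.
    errors = [_nrmse(_zscore(series[i:i + window]), _zscore(candidate[:window]))
              for i in range(len(series) - window + 1)]
    return int(np.argmin(errors))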
3 | 4 | 5 | from datetime import timedelta as dt 6 | import math 7 | import random 8 | 9 | import numpy as np 10 | import Oger, mdp 11 | import matplotlib.pyplot as plt 12 | import scikits.timeseries as ts 13 | from rtree import index 14 | 15 | import pywt 16 | import sg.utils 17 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 18 | import sg.data.sintef.userloads as ul 19 | import load_prediction 20 | 21 | user_id = 55864860 22 | 23 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 24 | 25 | window = 256 26 | 27 | candidate = sg.utils.Normalizer(dataset[:window+24,1]).normalized 28 | 29 | sim = np.argmin([ Oger.utils.nrmse(sg.utils.Normalizer(test[i:i+window,1]).normalized, candidate[:-24]) for i in range(len(test)) if len(test)-i >= window ]) 30 | print 'Done.' 31 | plt.plot(candidate, label='target') 32 | most_similar = sg.utils.Normalizer(test[sim:sim+window+24,1]).normalized 33 | plt.plot(most_similar, label='most similar, NRMSE %f' % Oger.utils.nrmse(most_similar[-24:], candidate[-24:])) 34 | plt.legend() 35 | plt.show() 36 | 37 | -------------------------------------------------------------------------------- /sg/models/test_wavelet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing. 3 | Author: Axel Tidemann 4 | """ 5 | 6 | from datetime import timedelta as dt 7 | import math 8 | import random 9 | 10 | import numpy as np 11 | import Oger, mdp 12 | import matplotlib.pyplot as plt 13 | import scikits.timeseries as ts 14 | 15 | import pywt 16 | import sg.utils 17 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 18 | import sg.data.sintef.userloads as ul 19 | import load_prediction 20 | 21 | from static import StaticNode 22 | 23 | user_id = 55864860 24 | 25 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 26 | 27 | #day = 24 28 | #today = random.randint(1000, dataset.shape[0]-day*2) 29 | #today = 4600 30 | 31 | #See if we can predict 24 times based on instances, learned from the training set. 32 | 33 | data_raw = sg.utils.Normalizer(dataset, axis=0) 34 | 35 | data = data_raw.normalized[:2**14,1] 36 | 37 | # One year is 365*24 = 8760 datapoints. If we round down to 8192, we will get 38 | # the maximum amount of scales for the decomposition (13), i.e. math.pow(2,13) 39 | # The number of levels/scales determine how far we look back. 40 | level = 4 41 | 42 | coeffs = pywt.swt(data, 'haar', level=level) 43 | 44 | # Collect coeffecients for training. Aj = 2 is taken from the paper. 45 | 46 | Aj = 2 47 | 48 | # The first 2^level datapoints cannot be used to predict because of lack of history. 49 | # level+1 because of the smooth array. 50 | x = np.zeros((len(data) - 2**level, (level+1)*Aj)) 51 | 52 | for i in range(len(x)): 53 | row = [] 54 | # Collect coefficients for each level. cAn, i.e. the smoothest array. 55 | for k in range(1, Aj+1): 56 | row.append(coeffs[-1][0][2**level + i - 2**level*(k-1)]) 57 | # cD, the details. 58 | for j in range(1, level+1): 59 | for k in range(1, Aj+1): 60 | row.append(coeffs[j-1][1][2**level + i - 2**j*(k-1)]) 61 | 62 | x[i] = np.array(row) 63 | 64 | # Target 65 | y = data_raw.normalized[2**level:,1] 66 | y.shape = (len(y), 1) 67 | 68 | # Split into train/test sets 69 | x_train = x[:356*24] 70 | y_train = y[:356*24] 71 | 72 | print 'Start ESN training...' 
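# [Editor's note -- clarification of the feature rows built in the loop above,
# following the code's own indexing of `coeffs`.] With level = 4 and Aj = 2,
# each row x[i] holds (level+1)*Aj = 10 wavelet coefficients taken around the
# "current" position t = 2**level + i:
#   - two samples of the smooth array coeffs[-1][0], at t and t - 2**level,
#   - for each detail array coeffs[j-1][1] (j = 1..4), samples at t and t - 2**j.
# The target y[i] is the normalized load at the same index t, so the readout
# learns to map these multi-scale coefficients to the corresponding load value.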
73 | 74 | # Do 24hr predictions based on single day instances 75 | x_24 = x[::24] 76 | y_24 = np.zeros((len(y)/24,24)) 77 | for i in range(len(y_24)): 78 | y_24[i] = np.transpose(y[i*24:i*24+24]) 79 | x_24_train = x_24[:365] 80 | y_24_train = y_24[:365] 81 | 82 | flow_24 = mdp.hinet.FlowNode(Oger.nodes.LeakyReservoirNode(input_dim = x_24.shape[1], output_dim = 100, spectral_radius = 0.9) + Oger.nodes.RidgeRegressionNode()) 83 | flow_24.train(x_24_train, y_24_train) 84 | flow_24.stop_training() 85 | 86 | x_24_test = x_24[365:-1] # There is one more element than the y target due to rounding. 87 | y_24_target = y_24[365:] 88 | 89 | y_24_test = flow_24(x_24_test) 90 | print 'NRMSE 24hr:', Oger.utils.nrmse(np.ndarray.flatten(y_24_test), np.ndarray.flatten(y_24_target)) 91 | 92 | plt.figure() 93 | plt.plot(np.ndarray.flatten(y_24_target), label='24 hr target') 94 | plt.plot(np.ndarray.flatten(y_24_test), label='24 hr test') 95 | plt.legend() 96 | 97 | # Test with a classifier ESN 98 | #reservoir = StaticNode(input_dim = x.shape[1], output_dim = 2000, spectral_radius = 0.9) 99 | reservoir = Oger.nodes.LeakyReservoirNode(input_dim = x.shape[1], output_dim = 2000, spectral_radius = 0.9) 100 | readout = Oger.nodes.RidgeRegressionNode() 101 | 102 | flow = mdp.hinet.FlowNode(reservoir + readout) 103 | flow.train(x_train, y_train) 104 | flow.stop_training() 105 | 106 | x_test = x[356*24:] 107 | y_target = y[356*24:] 108 | 109 | y_test = flow(x_test) 110 | print 'NRMSE:', Oger.utils.nrmse(y_test, y_target) 111 | 112 | plt.figure() 113 | plt.plot(data, label="Input loads") 114 | plt.plot(coeffs[-1][0], label='Smooth array') 115 | i = 1 116 | for _,cD in coeffs: 117 | plt.plot(cD, label='cD%i'%i) 118 | i += 1 119 | plt.legend() 120 | 121 | plt.figure() 122 | plt.plot(y_target, label='Target') 123 | plt.plot(y_test, label='Prediction') 124 | plt.legend() 125 | plt.show() 126 | 127 | -------------------------------------------------------------------------------- /sg/models/test_wavelet_retrieve.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing. 3 | Author: Axel Tidemann 4 | """ 5 | 6 | from datetime import timedelta as dt 7 | import math 8 | import random 9 | 10 | import numpy as np 11 | import Oger, mdp 12 | import matplotlib.pyplot as plt 13 | import scikits.timeseries as ts 14 | from rtree import index 15 | 16 | import pywt 17 | import sg.utils 18 | from sg.data.sintef.create_full_temp_data import data as read_temperatures 19 | import sg.data.sintef.userloads as ul 20 | import load_prediction 21 | 22 | user_id = 55864860 23 | 24 | (dataset, test) = load_prediction.prepare_datasets(user_id, False) 25 | 26 | #See if we can predict 24 times based on instances, learned from the training set. 
27 | window = 512 28 | 29 | hours = [ sg.utils.Normalizer(dataset[i:i+window,1]).normalized for i in range(len(dataset)) if len(dataset)-i >= window+24 ] 30 | 31 | coeffs = [ pywt.wavedec(segment,'haar') for segment in hours ] 32 | 33 | # Grow tree 34 | p = index.Property() 35 | p.dimension = 20 36 | idx = index.Index(properties=p) 37 | 38 | i = 0 39 | for c in coeffs: 40 | key = [item for sublist in c for item in sublist ][:p.dimension] 41 | idx.insert(i, tuple(key)) 42 | i+=1 43 | 44 | def retrieve(query): 45 | query_key = [item for sublist in pywt.wavedec(query[:-24],'haar') for item in sublist ][:p.dimension] 46 | results = list(idx.nearest(tuple(query_key), 3)) 47 | print results 48 | plt.plot(query, label='query') 49 | for i in range(len(results)): 50 | candidate = sg.utils.Normalizer(dataset[results[i]:results[i]+window+24,1]).normalized 51 | plt.plot(candidate, label='candidate %i, NRMSE %f'%(i,Oger.utils.nrmse(candidate[-24:], query[-24:]))) 52 | plt.axvline(x=window, color='r', linewidth=1) 53 | plt.legend() 54 | plt.show() 55 | 56 | 57 | # Try to find 5 random points in the training dataset 58 | for test_point in np.random.permutation(range(len(dataset) - window - 24))[:5]: 59 | retrieve(sg.utils.Normalizer(dataset[test_point:test_point+window+24,1]).normalized) 60 | 61 | # Try 5 different random points in the test dataset 62 | for test_point in np.random.permutation(range(len(test) - window - 24))[:5]: 63 | retrieve(sg.utils.Normalizer(test[test_point:test_point+window+24,1]).normalized) 64 | -------------------------------------------------------------------------------- /sg/models/wavelet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/models/wavelet.pyc -------------------------------------------------------------------------------- /sg/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | Required packages to get the code running: 3 | * python 2.7 (or possibly another 2.x, see note for scikits below) 4 | * numpy, scipy and matplotlib 5 | * distribute (successor of setuptools) (not for macports?): http://pypi.python.org/pypi/distribute#installation-instructions 6 | * Pip (not macports?) http://www.pip-installer.org/en/latest/installing.html 7 | * scikits.timeseries from macports, pip or http://sourceforge.net/projects/pytseries/files/scikits.timeseries. Only for Python 2.6 in macports at the time of writing. 
8 | -------------------------------------------------------------------------------- /sg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | -------------------------------------------------------------------------------- /sg/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/__init__.pyc -------------------------------------------------------------------------------- /sg/utils/_test_template.py: -------------------------------------------------------------------------------- 1 | """This is a unit test skeleton, meant to be used as a template for the 2 | boilerplate code when creating a new unit test file.""" 3 | 4 | import os 5 | import sys 6 | import unittest 7 | 8 | import numpy as np 9 | 10 | import sg.utils.testutils as testutils 11 | 12 | from xxx import * 13 | 14 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | class Test(testutils.ArrayTestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def tearDown(self): 21 | pass 22 | 23 | def test_(self): 24 | """.""" 25 | pass 26 | 27 | class Test(unittest.TestCase): 28 | def setUp(self): 29 | pass 30 | 31 | def tearDown(self): 32 | pass 33 | 34 | def test_(self): 35 | """.""" 36 | pass 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | 42 | # if __name__ == "__main__": 43 | # from unittest import main 44 | # main(module="test_" + __file__[:-3]) 45 | 46 | -------------------------------------------------------------------------------- /sg/utils/analyze_gefcom_temp_genes.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous routines to import/extract/plot the evolution of the 2 | temperature genes in evolved forecasters for the GEFCom 2012 dataset.""" 3 | 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | 7 | def _plot_on_axis(means, station, ax): 8 | m = means['temp_{}'.format(station)] 9 | for i in range(30): 10 | try: 11 | m.ix[i].plot(ax=ax, color='r', legend=False) 12 | except: 13 | print "Trouble plotting run {}, station {}. Missing data?".format(i, station) 14 | plt.title('Temp station {}'.format(station)) 15 | 16 | def multi(means): 17 | """Create one plot for each temperature station in 'means'. Draw the 18 | evolution in each run as a separate line.""" 19 | for s in range(11): 20 | _plot_on_axis(means, s, plt.figure().gca()) 21 | 22 | def multi_sub(means, title=None): 23 | """Create one subplot for each temperature station in 'means'. 
Draw the 24 | evolution in each run as a separate line.""" 25 | fig = plt.figure() 26 | if title is not None: 27 | plt.suptitle(title) 28 | for s in range(11): 29 | _plot_on_axis(means, s, fig.add_subplot(3, 4, s+1)) 30 | 31 | # for s in range(11): 32 | # ax = fig.add_subplot(3, 4, s) 33 | # m = means['temp_{}'.format(s)] 34 | # for i in range(30): 35 | # m.ix[i].plot(ax=ax, color='b', alpha=0.3, legend=False) 36 | # plt.title('Temp station {}'.format(s)) 37 | 38 | def multi_2(means, stations=range(11), fig=None): 39 | """All runs and (the given) stations on the same plot""" 40 | if fig is None: 41 | fig = plt.figure() 42 | ax = fig.gca() 43 | columns = ['temp_{}'.format(s) for s in stations] 44 | lbls = ['Temperature Station {}'.format(s+1) for s in stations] 45 | for i in range(30): 46 | # means[columns].ix[i].plot(ax=ax, colormap='jet', alpha=1, legend=False) 47 | means[columns].ix[i].plot(ax=ax, color=['b', 'c', 'm', 'g', 'y', 'r'], alpha=1, legend=False) 48 | plt.legend(lbls, loc='right') 49 | # m.ix[0].plot(ax=ax, colormap='jet', alpha=0.05, legend=False) 50 | # for i in range(1,30): 51 | # m.ix[i].plot(ax=ax, colormap='jet', alpha=0.05, legend=False) 52 | # plt.title('Temp station {}'.format(t)) 53 | 54 | def import_from_csv(path): 55 | """Read the CSV file in 'path', output a pandas Dataframe with 11 56 | columns, one for each temperature gene, and 100 rows, one for each 57 | generation in each run. Each value is averaged across all 58 | individuals in all runs found in the CSV file. The CSV was typically 59 | made with a command similar to [...]/scripts/parse-logs-into-csv.sh output_*.txt.""" 60 | all = pd.read_csv(open(path, 'r')) 61 | cols = ['file', 'gen', 'fitn1', 'fitn2', 'hindsight', 'AR_order'] 62 | cols += [ 'temp_{}'.format(i) for i in range(11)] 63 | all.columns = cols 64 | grouped = all.groupby(['file', 'gen'], as_index='False') 65 | means = grouped.mean() 66 | stds = grouped.std() 67 | means = means.drop(['fitn1', 'fitn2', 'hindsight', 'AR_order'], axis=1) 68 | stds = stds.drop(['fitn1', 'fitn2', 'hindsight', 'AR_order'], axis=1) 69 | return means 70 | -------------------------------------------------------------------------------- /sg/utils/cache.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | class ATimeCache(object): 4 | """Cache class (dictionary) with a limited size, where only the 5 | 'max_entries' most recently added or accessed entries are stored.""" 6 | 7 | def __init__(self, max_entries): 8 | self._cache = OrderedDict() 9 | self._max_entries = max_entries 10 | 11 | def _shrink(self): 12 | while len(self._cache) > self._max_entries: 13 | self._cache.popitem(last=False) 14 | 15 | def get_max_entries(self): 16 | return self._max_entries 17 | 18 | def set_max_entries(self, value): 19 | self._max_entries = value 20 | self._shrink() 21 | 22 | max_entries = property( 23 | get_max_entries, set_max_entries, None, "Set or get the cache size") 24 | 25 | def has_key(self, key): 26 | return self._cache.has_key(key) 27 | 28 | def __eq__(self, other): 29 | try: 30 | return self._cache.__eq__(other._cache) 31 | except: 32 | return False 33 | 34 | def __len__(self): 35 | return self._cache.__len__() 36 | 37 | def __getitem__(self, key): 38 | value = self._cache.pop(key) 39 | self._cache[key] = value 40 | return value 41 | 42 | def __setitem__(self, key, value): 43 | if self._cache.has_key(key): 44 | self._cache.pop(key) 45 | self._cache.__setitem__(key, value) 46 | self._shrink() 47 | 48 | 
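# [Editor's note -- usage illustration; the import path is assumed to be
# sg.utils.cache.] Only the `max_entries` most recently added *or accessed*
# keys survive, and __getitem__ re-inserts the key it returns, so a read
# protects an entry from eviction:
#
#   cache = ATimeCache(max_entries=2)
#   cache['a'] = 1
#   cache['b'] = 2
#   _ = cache['a']      # 'a' becomes the most recently used entry
#   cache['c'] = 3      # evicts 'b', now the least recently used key
#   assert 'a' in cache and 'c' in cache and 'b' not in cache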
def __contains__(self, key): 49 | return self.has_key(key) 50 | 51 | def __str__(self): 52 | return self.cache.__str__() 53 | 54 | def __iter__(self): 55 | # Iterate directly on the underlying dict, rather than on this 56 | # class, in order to change the order of cached items (as 57 | # opposed to []/__getitem__, which will reinsert an item on top 58 | # of the stack whenever it is looked up. 59 | return iter(self._cache) 60 | 61 | if __name__ == "__main__": 62 | from unittest import main 63 | main(module="test_" + __file__[:-3]) 64 | 65 | -------------------------------------------------------------------------------- /sg/utils/genemapper.py: -------------------------------------------------------------------------------- 1 | """The genemappers map a real-valued gene to an allele.""" 2 | 3 | import math 4 | 5 | from pyevolve import GAllele, G1DList, Consts 6 | 7 | class _AlleleMapper(): 8 | def _get_normalized_gene(self, gene_val, gene_range): 9 | gene_norm = float(gene_val - gene_range[0]) / \ 10 | (gene_range[1] - gene_range[0]) 11 | if gene_norm < 0 or gene_norm > 1: 12 | raise ValueError("Gene value (%f) outside allowed range (%f - %f)." \ 13 | % (gene_val, gene_range[0], gene_range[1])) 14 | return gene_norm 15 | 16 | class MappedAlleleRange(GAllele.GAlleleRange, _AlleleMapper): 17 | """Subclass of GAllele.GAlleleRange that provides a way of mapping from a 18 | real-valued gene to a range allele gene.""" 19 | 20 | def __init__(self, begin=Consts.CDefRangeMin, 21 | end=Consts.CDefRangeMax, real=False, scaling='linear'): 22 | """See superclass for begin, end and real args. 'scaling' scales the 23 | mapping, and can be linear or log. If scaling is log, then begin < end 24 | must hold.""" 25 | GAllele.GAlleleRange.__init__(self, begin, end, real) 26 | self._scaling = scaling 27 | 28 | def map_to_allele(self, gene_val, gene_range): 29 | """Map a gene value in gene_range to the corresponding allele value.""" 30 | if len(self.beginEnd) != 1: 31 | raise NotImplementedError("The mapper can currently only handle " \ 32 | "alleles with a single range.") 33 | gene_norm = self._get_normalized_gene(gene_val, gene_range) 34 | beginEnd = self.beginEnd[0] 35 | to_range = (beginEnd[1] - beginEnd[0]) 36 | if self._scaling == 'log': 37 | to_range = math.log(1 + to_range) 38 | mapped_val = beginEnd[0] + math.exp(gene_norm * to_range) - 1 39 | elif self._scaling == 'linear': 40 | mapped_val = beginEnd[0] + gene_norm * to_range 41 | else: 42 | raise ValueError("Unknown scaling method: %s" % self._scaling) 43 | if not self.real: 44 | return int(round(mapped_val)) 45 | return max(beginEnd[0], min(beginEnd[1], mapped_val)) 46 | 47 | class MappedAlleleList(GAllele.GAlleleList, _AlleleMapper): 48 | """Subclass of GAllele.GAlleleList that provides a way of mapping from a 49 | real-valued gene to a list allele gene.""" 50 | 51 | def map_to_allele(self, gene_val, gene_range): 52 | gene_norm = self._get_normalized_gene(gene_val, gene_range) 53 | to_idx = int(gene_norm * len(self.options)) 54 | # In case gene_norm == 1: 55 | if to_idx == len(self.options): 56 | to_idx -= 1 57 | return self.options[to_idx] 58 | 59 | 60 | def map_to_alleles(genome): 61 | """Maps from a (real-valued G1DList) genome to a list of allele genes.""" 62 | alleles = genome.getParam("allele") 63 | genes = genome[:] 64 | gene_range = (genome.getParam("rangemin"), genome.getParam("rangemax")) 65 | return [alleles[i].map_to_allele(genes[i], gene_range) 66 | for i in range(len(genes))] 67 | 68 | 69 | 70 | if __name__ == "__main__": 71 | from 
unittest import main 72 | main(module="test_" + __file__[:-3]) 73 | 74 | -------------------------------------------------------------------------------- /sg/utils/genemapper.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/genemapper.pyc -------------------------------------------------------------------------------- /sg/utils/output.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/output.pyc -------------------------------------------------------------------------------- /sg/utils/plot_fitnesses.py: -------------------------------------------------------------------------------- 1 | import sqlite3 as sql 2 | import glob 3 | import numpy as np 4 | import sys 5 | import optparse 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | import sg.utils 10 | 11 | def fetch_one(db_path, exp_id="ex1"): 12 | """Fetch one evolution, identified by exp_id, from the database given in 13 | db_path. Returns an array holding: generation, min/avg/max fitness, and 14 | fitness std dev.""" 15 | with sql.connect(db_path) as conn: 16 | crs = conn.execute("select generation, rawMin, rawAve, rawMax, " 17 | "rawDev from statistics") 18 | return np.array(zip(*crs.fetchall())) 19 | 20 | def fetch_match(pattern, exp_id="ex1"): 21 | """Fetch all files matching pattern.""" 22 | # While tempting to test for isinstance(pattern, collections.iterable) in 23 | # order to support multiple patterns (e.g. "fetch_match(sys.argv[1:])"), a 24 | # string (such as the pattern) will also pass this test. 25 | return [fetch_one(path, exp_id) for path in glob.glob(pattern)] 26 | 27 | def _common_generations(evolutions): 28 | """Return the list of evolutions shortened to only common generations.""" 29 | stats_lengths = [evo.shape[1] for evo in evolutions] 30 | last_common_idx = min(stats_lengths) 31 | first_evo = evolutions[0] 32 | last_common_gen = first_evo[0, last_common_idx - 1] 33 | for evo in evolutions[1:]: 34 | for gen in range(last_common_idx): 35 | if evo[0,gen] != first_evo[0,gen]: 36 | last_common_gen = min(last_common_gen, gen) 37 | last_common_idx = gen 38 | break 39 | 40 | max_gen = max([evo[0, -1] for evo in evolutions]) 41 | print "max gen is", max_gen 42 | print "last_common_gen is", last_common_gen 43 | if last_common_gen < max_gen: 44 | print >>sys.stderr, "Some generations missing in at least one " \ 45 | "simulation. Plotting only the first generations 0-%d." \ 46 | % last_common_gen 47 | print >>sys.stderr, "Lengths of statistics for each evolution:" 48 | print >>sys.stderr, "\t", stats_lengths 49 | print >>sys.stderr, "Generation at last common index for each evolution:" 50 | print >>sys.stderr, "\t", [evo[0, last_common_idx-1] 51 | for evo in evolutions] 52 | if last_common_idx <= 0: 53 | raise ValueError("No overlapping generations (one empty evolution?).") 54 | return [evo[:,:last_common_idx] for evo in evolutions] 55 | 56 | def join(evolutions): 57 | """Join the output from several evolutions. Evolutions is a list where each 58 | evolution element is an array as returned from fetch_one. 59 | 60 | Return generation and averages. 
61 | """ 62 | evolutions = _common_generations(evolutions) 63 | return np.average(np.array(evolutions), axis=0) 64 | 65 | def plot_evols(evolutions, generations=None, axes=None, **plt_kwargs): 66 | if axes is None: 67 | axes = plt.axes() 68 | col = sg.utils.Enum("gen", "min", "avg", "max", "dev") 69 | mg = -1 if generations is None else generations + 1 70 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.min,0:mg], 71 | label="Minimum", **plt_kwargs) 72 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.avg,0:mg], 73 | label="Average", **plt_kwargs) 74 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.max,0:mg], 75 | label="Maximum", **plt_kwargs) 76 | axes.plot(evolutions[col.gen,0:mg], evolutions[col.dev,0:mg], 77 | label="Deviation", **plt_kwargs) 78 | return axes 79 | 80 | def _get_options(): 81 | parser = optparse.OptionParser() 82 | parser.usage = "[options] path_to_pyevolve.db [more_pyevolve.dbs]" 83 | parser.description = "Plot fitness averaged over evolutions from multiple Pyevolve sqlite3 databases" 84 | parser.add_option("--title", dest="title", default=None, help="Title for the plot") 85 | parser.add_option("--exp", dest="exp_id", default="ex1", help="Name identifying experiment in database") 86 | parser.add_option("--generations", dest="generations", type="int", default=None, help="Max number of generations to plot") 87 | parser.add_option("--ymin", dest="ymin", type="float", default=None, help="Fix Y axis to given min value") 88 | parser.add_option("--ymax", dest="ymax", type="float", default=None, help="Fix Y axis to given max value") 89 | return parser.parse_args() 90 | 91 | if __name__ == "__main__": 92 | options, args = _get_options() 93 | evolutions = [fetch_one(path, options.exp_id) for path in args] 94 | print "Plotting the average of %d evolutions."
% len(evolutions) 95 | average = join(evolutions) 96 | plot_evols(average, generations=options.generations) 97 | plt.legend(loc=(0.2, 0.2)) 98 | if options.ymin is not None: 99 | plt.ylim(ymin=options.ymin) 100 | if options.ymax is not None: 101 | plt.ylim(ymax=options.ymax) 102 | if options.title is not None: 103 | plt.title(options.title) 104 | plt.show() 105 | -------------------------------------------------------------------------------- /sg/utils/pyevolve_mpi.py: -------------------------------------------------------------------------------- 1 | """MPI variant of Pyevolve.""" 2 | 3 | import numpy as np 4 | import random 5 | import sys 6 | import collections 7 | 8 | from mpi4py import MPI 9 | import pyevolve 10 | from pyevolve.GPopulation import GPopulation, multiprocessing_eval 11 | from pyevolve.GSimpleGA import GSimpleGA 12 | 13 | import sg.utils 14 | import pyevolve_utils as pu 15 | from sg.utils.cache import ATimeCache 16 | 17 | class MPIPopulation(pu.SpecifiedPopulation): 18 | def __init__(self, ga, genome): 19 | self._ga = ga 20 | if isinstance(genome, pu.SpecifiedPopulation): 21 | if not isinstance(genome, MPIPopulation): 22 | raise RuntimeError("A non-MPI population has crept into the system!") 23 | pu.SpecifiedPopulation.__init__(self, genome) 24 | 25 | def _make_data_cache_key(self): 26 | key = [] 27 | train_iter = self._ga.model.dataset.train_data_iterator() 28 | for (data_in, data_out) in train_iter(): 29 | key += [data_in.index[0].value, data_in.index[-1].value, 30 | data_out.index[0].value, data_out.index[-1].value] 31 | return tuple(key) 32 | 33 | def _make_cache_key(self, indiv): 34 | genome = pu.raw_genes(indiv, True) 35 | return tuple(sg.utils.safe_deep_flatten(genome)) 36 | 37 | def evaluate(self, **args): 38 | if self._ga.rank == 0: 39 | cache = self._ga.caches[self._make_data_cache_key()] 40 | keys = [self._make_cache_key(indiv) for indiv in self.internalPop] 41 | uncached = [key not in cache for key in keys] 42 | uncached_indices = np.where(uncached)[0] 43 | cached_indices = np.where(np.logical_not(uncached))[0] 44 | unevaled_pop = [self.internalPop[index] for index in uncached_indices] 45 | pop_size = len(unevaled_pop) 46 | print "Cache size is {}, unevaluated population size is {}. Now scattering"\ 47 | .format(len(cache), pop_size) 48 | sys.stdout.flush() 49 | indices = np.linspace(0, pop_size, self._ga.nhosts+1).astype('int') 50 | scattered = [unevaled_pop[start:end] for (start, end) in \ 51 | zip(indices[:-1], indices[1:])] 52 | else: 53 | scattered = None 54 | indivs = self._ga.comm.scatter(scattered) 55 | fitnesses = np.array([multiprocessing_eval(indiv) for indiv in indivs]) 56 | # print "Evaluation of {} indivs complete on host {}".format(len(indivs), self._ga.rank) 57 | sys.stdout.flush() 58 | all_fitnesses = self._ga.comm.gather(fitnesses) 59 | if self._ga.rank == 0: 60 | # Fetch from cache before adding newly evaluated genomes, as 61 | # these may otherwise delete old cached entries before their 62 | # values are retrieved. 
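# (Concretely: each per-dataset cache is an ATimeCache with a fixed capacity,
# and it evicts the least recently touched entry on insertion, so writing the
# new scores first could push out the very entries read back just below.)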
63 | for index in cached_indices: 64 | self.internalPop[index].score = cache[keys[index]] 65 | for index, score in zip(uncached_indices, np.concatenate(all_fitnesses)): 66 | self.internalPop[index].score = score 67 | cache[keys[index]] = score 68 | self.clearFlags() 69 | 70 | 71 | class SimpleMPIGA(pu.SimpleGAWithFixedElitism): 72 | def __init__(self, model, genome, seed=None, interactiveMode=True): 73 | self._init_MPI() 74 | self._model = model 75 | self._caches = collections.defaultdict(lambda: ATimeCache(1000)) 76 | pu.SimpleGAWithFixedElitism.__init__(self, genome, seed, interactiveMode) 77 | 78 | def _init_MPI(self): 79 | self._comm = MPI.COMM_WORLD 80 | self._nhosts = self._comm.Get_size() 81 | self._rank = self._comm.Get_rank() 82 | 83 | def make_population(self, genome): 84 | return MPIPopulation(self, genome) 85 | 86 | def evolve(self, freq_stats=0): 87 | if not self.terminationCriteria.isEmpty(): 88 | raise RuntimeError("Termination criteria other than number of generations unsupported under MPI.") 89 | if self._rank != 0: 90 | raise RuntimeError("Evolve should only be called on rank 0 process.") 91 | return pu.SimpleGAWithFixedElitism.evolve(self, freq_stats) 92 | 93 | def eval_loop(self): 94 | stopFlagCallback = False 95 | for gen in range(self.nGenerations + 1): 96 | self.internalPop.evaluate() 97 | if not self.stepCallback.isEmpty(): 98 | for it in self.stepCallback.applyFunctions(self): 99 | stopFlagCallback = it 100 | if stopFlagCallback: 101 | break 102 | 103 | @property 104 | def model(self): 105 | return self._model 106 | 107 | @property 108 | def caches(self): 109 | return self._caches 110 | 111 | @property 112 | def comm(self): 113 | return self._comm 114 | 115 | @property 116 | def rank(self): 117 | return self._rank 118 | 119 | @property 120 | def nhosts(self): 121 | return self._nhosts 122 | -------------------------------------------------------------------------------- /sg/utils/queue_jobs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import sys 4 | import os 5 | 6 | class JobSubmitter(object): 7 | def __init__(self, stream): 8 | self.batch_size = 1 9 | self.max_in_queue = 10 10 | self.wait_time_secs = 5 11 | self._jobs = [job[:-1] for job in stream] 12 | self._user = os.environ['USER'] 13 | self._queue_status = "" 14 | self._slots_available = 0 15 | 16 | def _sys_cmd(self, cmd): 17 | #print "_sys_cmd with command:", cmd 18 | return subprocess.check_output(cmd, shell=True) 19 | 20 | def _log(self, *args): 21 | sys.stdout.write(*args) 22 | sys.stdout.write("\n") 23 | sys.stdout.flush() 24 | 25 | def _jobs_remaining(self): 26 | return len(self._jobs) > 0 27 | 28 | def _is_job_running(self, job): 29 | return self._queue_status.find(job) >= 0 30 | 31 | def _update_queue_status(self): 32 | self._queue_status = self._sys_cmd("qstat -f") 33 | 34 | def _resources_available(self): 35 | cmd = "qstat |grep %s |wc |awk '{print $1}'" % self._user 36 | in_queue = int(self._sys_cmd(cmd)[:-1]) 37 | self._slots_available = self.max_in_queue - in_queue 38 | return self._slots_available > 0 39 | 40 | def _submit_more_jobs(self): 41 | self._update_queue_status() 42 | submitted = 0 43 | max_submissions = min(self._slots_available, self.batch_size) 44 | while self._jobs and submitted < max_submissions: 45 | job = self._jobs.pop(0) 46 | if self._is_job_running(job): 47 | self._log("Skipping job, already running: " + job) 48 | else: 49 | self._sys_cmd("qsub " + job) 50 | self._log("Submitted job at " + 
time.strftime("%b. %d, %X: ") \ 51 | + job) 52 | submitted += 1 53 | return submitted 54 | 55 | def _wait(self, brief=False): 56 | if brief: 57 | time.sleep(15) 58 | else: 59 | time.sleep(self.wait_time_secs) 60 | 61 | def submit_jobs(self): 62 | self._log("Queueing " + str(len(self._jobs)) + " jobs for submission in " + \ 63 | "batches of " + str(self.batch_size) + ". Polling the queue " + \ 64 | "for free space every " + str(self.wait_time_secs/60.0) + " minutes.") 65 | while self._jobs_remaining(): 66 | brief_wait = False 67 | if self._resources_available(): 68 | num_submitted = self._submit_more_jobs() 69 | brief_wait = num_submitted < self._slots_available 70 | self._wait(brief=brief_wait) 71 | self._log("All jobs submitted. Bye for now.") 72 | 73 | def get_options(): 74 | """Add prediction-related options to the parser. If no parser is provided, one 75 | will be created.""" 76 | import optparse 77 | parser = optparse.OptionParser() 78 | parser.usage = "[options] [jobfile]" 79 | parser.description = "Send jobs in batches to the queueing system. The list of jobs can be sent to stdin or be stored in jobfile." 80 | parser.add_option("--wait", dest="wait", type="float", help="Wait time in minutes between each check of the queue", default=10) 81 | parser.add_option("--queued", dest="queued", type="long", help="Max number of jobs in the queue at once", default=10) 82 | parser.add_option("--batch", dest="batch", type="long", help="Number of jobs to submit each time", default=1) 83 | (options, args) = parser.parse_args() 84 | return options, args 85 | 86 | def make_submitter(path=None): 87 | if path is None: 88 | if sys.stdin.isatty(): 89 | sys.stderr.write("You must submit a path or cat jobs to stdin.") 90 | exit(1) 91 | print "Reading jobs from standard input." 92 | return JobSubmitter(sys.stdin) 93 | else: 94 | print "Reading jobs from " + path + "." 95 | with open(path, "r") as f: 96 | return JobSubmitter(f) 97 | 98 | if __name__ == "__main__": 99 | options, args = get_options() 100 | submitter = make_submitter(args[0] if args else None) 101 | submitter.wait_time_secs = options.wait * 60 102 | submitter.batch_size = options.batch 103 | submitter.max_in_queue = options.queued 104 | submitter.submit_jobs() 105 | -------------------------------------------------------------------------------- /sg/utils/scripts/best-genomes-found.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$0 4 | if test -n "`type rev 2>/dev/null`"; then 5 | NAME="`echo $0 | rev | cut -d '/' -f 1 | rev`"; 6 | fi 7 | 8 | NAME=$0 9 | if test -n "`type basename 2>/dev/null`"; then 10 | NAME="`basename $0`"; 11 | fi 12 | 13 | USAGE="Usage: 14 | $NAME [options] [ file [ file2 ...] ] 15 | 16 | Print the best genomes found (as alleles) by evolution for each of the files 17 | given on the command line, or from standard input if no files are given. The 18 | script works by finding the last line containing the word 'alleles', and then 19 | printing everything inside brackets on that line. 20 | 21 | Options: 22 | -a 23 | Print all generations, not just the last (or first). 24 | -r 25 | Print raw genes (instead of genes mapped to allele ranges) from old log files. 26 | -m 27 | Print mapped genes from old log files. 28 | -h 29 | Prints this help. 30 | -f 31 | Print the first generation rather than the last 32 | " 33 | 34 | KEYWORD="Best genome at generation [0-9]\+: \[" 35 | TAIL="tail -n 1" 36 | while getopts afmrh'?' 
opt 37 | do 38 | case $opt in 39 | a) 40 | TAIL="cat";; 41 | f) 42 | TAIL="head -n 1";; 43 | r) 44 | KEYWORD="raw genes";; 45 | m) 46 | KEYWORD="alleles";; 47 | h|'?'|?|*) 48 | echo "$USAGE" 49 | exit 2;; 50 | esac 51 | done 52 | shift `expr $OPTIND - 1` 53 | 54 | if [ $# -eq 0 ]; then 55 | INPUT='-' 56 | else 57 | INPUT="$@" 58 | fi 59 | 60 | for f in $INPUT; do 61 | extension="${f##*.}" 62 | # Likewise, filename="${f%.*}" 63 | if [ "$extension" == "bz2" ]; then 64 | GREP=bzgrep 65 | else 66 | GREP=grep 67 | fi 68 | $GREP "$KEYWORD" "$f" |$TAIL; 69 | done |sed -e's/.*\[//; s/\]$//' 70 | -------------------------------------------------------------------------------- /sg/utils/scripts/list-finished-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=arima 4 | datasets="total-load bc-data" 5 | match="Error on test phase for best genome found" 6 | for model in $models; do 7 | for data in $datasets; do 8 | for (( r=0; $r<30; r++ )); do 9 | base=output_${model}_run_${r}_${data}-noholidays_; 10 | for prep in "" subtract-daily-pattern_ subtract-weekly-pattern_; do 11 | f=${base}${prep}100_0.txt; 12 | have_txt=false 13 | fin_txt=false 14 | have_bz2=false 15 | fin_bz2=false 16 | if [[ -e "$f" ]]; then 17 | have_txt=true 18 | if [[ -n "`grep \"$match\" $f`" ]]; then 19 | fin_txt=true 20 | fi 21 | fi 22 | if [[ -e "$f.bz2" ]]; then 23 | have_bz2=true 24 | if [[ -n "`bzgrep \"$match\" $f.bz2`" ]]; then 25 | fin_bz2=true 26 | fi 27 | fi 28 | if [[ ${have_txt} == true && ${have_bz2} == true ]]; then 29 | if [[ ${fin_txt} == true && ${fin_bz2} == true ]]; then 30 | echo "Duplicate, both finished: $f/.bz2" 31 | elif [[ ${fin_txt} == true ]]; then 32 | echo "Duplicate, only .txt finished: $f/.bz2" 33 | elif [[ ${fin_bz2} == true ]]; then 34 | echo "Duplicate, only .bz2 finished: $f/.bz2" 35 | else 36 | echo "Duplicate, both incomplete: $f/.bz2" 37 | fi 38 | elif [[ ${have_txt} == true ]]; then 39 | if [[ ${fin_txt} == true ]]; then 40 | echo "Finished: $f" 41 | else 42 | echo "Incomplete: $f (no .bz2)" 43 | fi 44 | elif [[ ${have_bz2} == true ]]; then 45 | if [[ ${fin_bz2} == true ]]; then 46 | echo "Finished: $f.bz2" 47 | else 48 | echo "Incomplete: $f.bz2 (no .txt)" 49 | fi 50 | else 51 | echo "Both missing: $f/.bz2" 52 | fi 53 | done; 54 | done 55 | done 56 | done 57 | -------------------------------------------------------------------------------- /sg/utils/scripts/parse-logs-into-csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script takes a list of output_....txt evolution log files as 4 | # input, and parses them into one big csv file where the fitnesses 5 | # (scaled and raw) and genes of each individual in each generation is 6 | # stored. Each line is prepended with the run number and the generation 7 | # number. 8 | # 9 | # Input: List of files to parse, already bunzipped 10 | # Output: Huge CSV file 11 | 12 | inputs=$@ 13 | output=all.csv 14 | 15 | dirs="generated filed genes stripped" 16 | for d in $dirs; do 17 | if test -e $d; then 18 | echo "Temporary directory '$d' already exists. Exiting." 19 | exit 20 | fi 21 | done 22 | if test -e $output; then 23 | echo "Output file '$output' already exists. Exiting." 24 | exit 25 | fi 26 | 27 | mkdir -v $dirs 28 | 29 | echo "Appending generation number to each line of logs..." 
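# Each log line gets the current generation counter prepended, comma-separated,
# e.g. a population-dump line from generation 3 becomes "3 , [ ... ]".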
30 | for f in $inputs; do 31 | awk 'BEGIN{gen=0} /Best genome at generation [0-9]* had/{gen++} {print gen, ",", $0}' $f >generated/$f; 32 | done 33 | 34 | echo "Appending run number to each line in each file..." 35 | pushd generated/ 36 | for f in $inputs; do 37 | run=`echo $f | awk -F_ '{print $4}'`; 38 | awk "{print $run, \",\", \$0}" $f >../filed/$f; 39 | done 40 | popd 41 | 42 | echo "Extracting lines with genes and fitnesses (population dump) from logs" 43 | pushd filed/ 44 | for f in $inputs; do 45 | gsed -n -e '/^[][0-9,[:space:]e.+-]\+$/p' $f |gsed -n -e'/[][]\+/p' >../genes/$f; 46 | done 47 | popd 48 | echo "Removing braces from genes, comma-separating fields." 49 | pushd genes/ 50 | for f in $inputs; do 51 | sed -e's/[][]//g' $f |sed -e's/\([0-9]\) \([0-9]\)/\1, \2/g'; 52 | done >../$output 53 | popd 54 | 55 | echo "CSV saved to $output." 56 | 57 | echo "Removing temporary directories." 58 | rm -rf $dirs 59 | -------------------------------------------------------------------------------- /sg/utils/scripts/resubmit-jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$0 4 | if test -n "`type rev 2>/dev/null`"; then 5 | NAME="`echo $0 | rev | cut -d '/' -f 1 | rev`"; 6 | fi 7 | 8 | NAME=$0 9 | if test -n "`type basename 2>/dev/null`"; then 10 | NAME="`basename $0`"; 11 | fi 12 | 13 | submitter=$USER 14 | 15 | USAGE="Usage: 16 | $NAME jobscript [ more jobscripts...] 17 | 18 | Submit the job(s) given on the command line, as long as it/they aren't already 19 | present in the job queue. 20 | 21 | Options: 22 | -h 23 | Prints this help. 24 | -u user 25 | Check jobs submitted by user rather than jobs submitted by '$submitter'. 26 | " 27 | 28 | while getopts u:h'?' opt 29 | do 30 | case $opt in 31 | u) 32 | submitter=$OPTARG;; 33 | h|'?'|?|*) 34 | echo "$USAGE" 35 | exit 2;; 36 | esac 37 | done 38 | shift `expr $OPTIND - 1` 39 | 40 | if [ $# -eq 0 ]; then 41 | echo "$USAGE" 42 | exit 2 43 | fi 44 | 45 | for job in $@; do 46 | if [ -z "`qstat -f -u $submitter | grep "$job"`" ]; then 47 | echo "(Re)submitting job $job:" 48 | qsub "$job" 49 | else 50 | echo "Job '$job' already in queue, skipping." 51 | fi 52 | done 53 | -------------------------------------------------------------------------------- /sg/utils/scripts/split-test-validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Split the output from running a prediction model on the test set (i.e. the 4 | # output file from running a GA) into two halves: validation and test. 5 | # 6 | # In other words: filter the input, keeping only lines containing 7 | # $filter_pattern. Split the resulting set of lines in the middle, and average 8 | # the values found in the last field on each line in each half. 
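#
# Illustrative invocation (the file name below is just an example):
#   ./split-test-validate.sh output_esn_run_0_bc-data_100_0.txt.bz2
# prints the RMSE over the validation half and the test half of the filtered lines.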
9 | 10 | filter_pattern="^Error for test at" 11 | filter () { 12 | extension="${1##*.}" 13 | if [ "$extension" == "bz2" ]; then 14 | CAT=bzcat 15 | else 16 | CAT=cat 17 | fi 18 | $CAT $1 | sed -n -e"/$filter_pattern/p" 19 | } 20 | 21 | split_and_calc() { 22 | flines=`filter $1 | wc | awk '{print $1}'` 23 | filter $1 \ 24 | | awk "{ 25 | if (NR <= $flines/2) { 26 | valid += (\$NF)^2; 27 | vlines++; 28 | } else { 29 | test += (\$NF)^2; 30 | tlines++; 31 | } 32 | } 33 | END { 34 | print \"This script assumes RMSE for each day is based on the same number of predictions (24).\"; 35 | print \"RMSE on validation phase (\", vlines, \" lines): \", sqrt(valid/vlines); 36 | print \"RMSE on test phase (\", tlines, \" lines): \", sqrt(test/tlines); 37 | }" 38 | echo "" 39 | } 40 | 41 | split_and_calc $1 42 | -------------------------------------------------------------------------------- /sg/utils/scripts/summarize-simulation-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . $HOME/local/bin/query.sh 4 | 5 | NAME=$0 6 | if test -n "`type basename 2>/dev/null`"; then 7 | NAME="`basename $0`"; 8 | fi 9 | 10 | USAGE="Usage: 11 | $NAME [options] 12 | 13 | Summarize simulation results, with or without plotting fitness graphs 14 | 15 | Options: 16 | -g 17 | Show graphs 18 | " 19 | 20 | show_graphs="FALSE" 21 | 22 | while getopts 'gh?' opt 23 | do 24 | case $opt in 25 | g) 26 | show_graphs="TRUE";; 27 | h|'?'|?|*) 28 | echo "$USAGE" 29 | exit 2;; 30 | esac 31 | done 32 | shift `expr $OPTIND - 1` 33 | 34 | 35 | children() { 36 | ps -o pid,ppid,command | grep "[0-9][0-9]*[[:space:]]\+$$" | awk '{print $1}' 37 | } 38 | 39 | for dataset in _gef-data _gef-temp-data; do 40 | if [ -z "$dataset" ]; then 41 | ymax=0.7 42 | elif [ "$dataset" == "_total-load" ]; then 43 | ymax=0.07 44 | else 45 | ymax=0.009 46 | fi 47 | for model in ar; do 48 | clean="" #_no-cleaning # Axl: Everything is without cleaning. To avoid errors in the following loop (cleaning 49 | # and subtract can not be in the same parameter, since they both exist in the filename), 50 | # this is where the clean parameter loop should be. 51 | for subtract in _subtract-weekly-pattern ; do 52 | # Find the relevant log files. Using 'find' to ensure regexp search 53 | # rather than full wildcard matching on the '*' in the filename. 54 | GREP=bzgrep 55 | ext=txt.bz2 56 | pattern="./output_${model}_run_[0-9]*${dataset}${subtract}${clean}_100_0_zone_1" # Axl note: _100_0 was added. 57 | logs=`find . -regex "$pattern.${ext}"` 58 | if [ -z "$logs" ]; then 59 | echo -e "\nNo matches for $pattern.${ext}, trying non-compressed files." 60 | ext=txt 61 | logs=`find . -regex "$pattern.${ext}"` 62 | if [ -z "$logs" ]; then 63 | echo "No matches for $pattern.${ext} either." 
64 | continue 65 | fi 66 | GREP=grep 67 | fi 68 | #echo $logs 69 | # Use log file paths to find database paths 70 | dbs=`echo $logs | sed -e"s/output_/pyevolve_/g; s/\.${ext}/.db/g"` 71 | 72 | # Make a "nice" title 73 | if [ -z "$dataset" ]; then 74 | datasettxt="single-user" 75 | else 76 | datasettxt=$dataset 77 | fi 78 | if [ -z "$clean" ]; then 79 | cleantxt="with cleaning" 80 | else 81 | cleantxt=$clean 82 | fi 83 | nlogs=`ls -1 $logs 2>/dev/null | wc | awk '{print $1}'` 84 | ndbs=`ls -1 $dbs 2>/dev/null | wc | awk '{print $1}'` 85 | title="$model ${datasettxt} ${subtract} ${cleantxt} ($nlogs output logs, $ndbs databases)" 86 | 87 | # Calculate average prediction errors 88 | mm="python $HOME/local/bin/minmax.py" 89 | echo -e "\n $title" 90 | echo "Test set prediction error as stored in file (old and new runs use different measures):" 91 | $GREP "Error on test phase" $logs | awk '{print $12}' | $mm --header 92 | echo "Test set prediction error as mean of daily errors:" 93 | # for log in $logs; do 94 | # echo "$GREP 'Error for test at' $log" 95 | # $GREP 'Error for test at' $log | awk '{rows++; total += $NF}END{print total/rows}' 96 | # done 97 | for log in $logs; do 98 | $GREP 'Error for test at' $log | awk '{rows++; total += $NF}END{print total/rows}' 99 | done | $mm --header 100 | #echo "Fitnesses (not error) of last generation (rows=Min,ave,max,dev):" 101 | echo -n "Average maximum fitness (not error) of last generation: " 102 | # The sed selects the max fitness row, the awk selects the average column. 103 | for db in $dbs; do 104 | cat </dev/null & 118 | fi 119 | 120 | done 121 | echo "" 122 | done 123 | done 124 | 125 | if [ "${show_graphs}" == "TRUE" ]; then 126 | if query "Close plot windows?" "y"; then 127 | # Children will include the ps and grep processes, so redirect stderr. 
128 | kill $(children) 2>/dev/null 129 | exit 130 | fi 131 | fi 132 | 133 | -------------------------------------------------------------------------------- /sg/utils/test_cache.py: -------------------------------------------------------------------------------- 1 | """Test cache class(es).""" 2 | 3 | import os 4 | import unittest 5 | 6 | import sg.utils.testutils as testutils 7 | 8 | from cache import * 9 | 10 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | class TestCache(testutils.ArrayTestCase): 13 | def setUp(self): 14 | self.size = 10 15 | self.cache = ATimeCache(max_entries=self.size) 16 | 17 | def _overfill_cache(self): 18 | for i in range(self.size*2): 19 | self.cache[i] = i 20 | 21 | def test_max_size(self): 22 | """Check that the cache size doesn't exceed the given size.""" 23 | self._overfill_cache() 24 | self.assertEqual(self.size, len(self.cache)) 25 | 26 | def test_resize(self): 27 | self._overfill_cache() 28 | new_size = self.size - 2 29 | self.cache.max_entries = new_size 30 | self.assertEqual(new_size, len(self.cache)) 31 | self._overfill_cache() 32 | self.assertEqual(new_size, len(self.cache)) 33 | 34 | def test_store_retrieve(self): 35 | """Check that storage and retrieval works both on empty and full 36 | caches.""" 37 | self.cache[12] = 12 38 | self.assertEqual(self.cache[12], 12) 39 | self._overfill_cache() 40 | for i in range(self.size): 41 | self.cache[-i] = i*12 42 | for i in range(self.size): 43 | self.assertEqual(self.cache[-i], i*12) 44 | 45 | def test_retrieve_nonexisting(self): 46 | """Check that retrieval of a non-existing key fails.""" 47 | with self.assertRaises(KeyError): 48 | x = self.cache[0] 49 | 50 | def test_read_refreshes(self): 51 | """Check that a read refreshes the cache status.""" 52 | for i in range(100): 53 | self.cache[-12] = 1 54 | self.cache[i] = i 55 | with self.assertRaises(KeyError): 56 | x = self.cache[0] 57 | self.assertEqual(self.cache[-12], 1) 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | 62 | -------------------------------------------------------------------------------- /sg/utils/test_genemapper.py: -------------------------------------------------------------------------------- 1 | """Testing the gene mapper classes.""" 2 | 3 | import os 4 | import unittest 5 | 6 | from pyevolve import GAllele, G1DList 7 | 8 | from genemapper import * 9 | 10 | class TestGeneMapper(unittest.TestCase): 11 | def test_range_map(self): 12 | allele = MappedAlleleRange(10, 100) 13 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 10) 14 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 33) 15 | self.assertEqual(allele.map_to_allele(0, (-1, 1)), 55) 16 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 100) 17 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 18 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 19 | 20 | def test_float_range(self): 21 | allele = MappedAlleleRange(2, 4, real=True) 22 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 23 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2.5) 24 | self.assertEqual(allele.map_to_allele(0.4, (-1, 1)), 3.4) 25 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 4) 26 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 27 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 28 | 29 | def test_float_range_log(self): 30 | allele = MappedAlleleRange(2, 4, real=True, scaling='log') 31 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 32 | 
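# With scaling='log' the mapper computes
#   begin + exp(gene_norm * log(1 + (end - begin))) - 1,
# which for begin=2, end=4 is 1 + 3**gene_norm. A gene of -0.5 normalizes to
# 0.25 and maps to roughly 2.316 (not 2.5), and 0.4 maps to roughly 3.158
# (not 3.4), which is presumably why the linear expectations below are
# commented out.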
#self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2.5) 33 | #self.assertEqual(allele.map_to_allele(0.4, (-1, 1)), 3.4) 34 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 4) 35 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 36 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 37 | 38 | def test_long_list(self): 39 | allele = MappedAlleleList([2, 4, 12]) 40 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), 2) 41 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), 2) 42 | self.assertEqual(allele.map_to_allele(0.3, (-1, 1)), 4) 43 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), 12) 44 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 45 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 46 | 47 | def test_short_list(self): 48 | allele = MappedAlleleList([-12]) 49 | self.assertEqual(allele.map_to_allele(-1, (-1, 1)), -12) 50 | self.assertEqual(allele.map_to_allele(-0.5, (-1, 1)), -12) 51 | self.assertEqual(allele.map_to_allele(0.3, (-1, 1)), -12) 52 | self.assertEqual(allele.map_to_allele(1, (-1, 1)), -12) 53 | self.assertRaises(ValueError, allele.map_to_allele, -2, (-1, 1)) 54 | self.assertRaises(ValueError, allele.map_to_allele, 1.1, (-1, 1)) 55 | 56 | class TestGenomeMapper(unittest.TestCase): 57 | def setUp(self): 58 | self.alleles = GAllele.GAlleles() 59 | self.alleles.add(MappedAlleleRange(10, 100)) 60 | self.alleles.add(MappedAlleleRange(0, 2, real=True)) 61 | self.alleles.add(MappedAlleleList([2, 4, 12])) 62 | self.alleles.add(MappedAlleleList([-1])) 63 | self.genome = G1DList.G1DList(len(self.alleles)) 64 | self.genome.setParams(allele=self.alleles, rangemin=-1, rangemax=1) 65 | 66 | def test_map_genome(self): 67 | self.genome[:] = [-1, -0.5, 0.4, 1] 68 | mapped_genome = map_to_alleles(self.genome) 69 | self.assertEqual(mapped_genome, [10, 0.5, 12, -1]) 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | 74 | -------------------------------------------------------------------------------- /sg/utils/test_output.py: -------------------------------------------------------------------------------- 1 | """Test the output utilities.""" 2 | 3 | import os 4 | import unittest 5 | import copy 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import sg.utils.testutils as testutils 11 | from sg.globals import SG_DATA_PATH 12 | 13 | from output import * 14 | 15 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | class TestOutputMisc(testutils.ArrayTestCase): 18 | def setUp(self): 19 | test_file = "_test_series_bc_hydro_no_temperatures_esn_run_8_bc-data.pickle" 20 | test_path = os.path.join(_PATH_TO_HERE, test_file) 21 | self.dataset = load_pickled_prediction(test_path) 22 | 23 | def tearDown(self): 24 | pass 25 | 26 | def test_split_dataset(self): 27 | """Test splitting of the test dataset, which is known to have 262 28 | days.""" 29 | left_lengths = [26, 52, 78, 104, 131, 157, 183, 209, 235] 30 | for splits in zip(np.arange(0.1, 1, 0.1), left_lengths): 31 | (left, right) = split_dataset(self.dataset, splits[0]) 32 | self.assertEqual(len(left[1]), splits[1]) 33 | self.assertEqual(len(left[0]), splits[1] * len(self.dataset[1][0])) 34 | self.assertEqual(len(right[1]), 262 - splits[1]) 35 | self.assertArraysEqual(self.dataset[0], left[0].append(right[0])) 36 | for (whole_days, split_days) in zip(self.dataset[1], left[1] + right[1]): 37 | self.assertArraysEqual(whole_days, split_days) 38 | 39 | def test_sort_by_validation_error(self): 40 | """Test sorting by validation 
error by faking a number of datasets.""" 41 | datasets = [self.dataset] 42 | # Incrementally append copies with modified target signal 43 | for i in range(10): 44 | next_set = copy.deepcopy(datasets[i]) 45 | indices = np.random.random_integers(len(next_set), size=i+1) 46 | next_set[0][indices] *= 1.2 47 | datasets.append(next_set) 48 | # Permute to make sure they are not ordered on entry 49 | shuffled = [datasets[i] for i in np.random.permutation(len(datasets))] 50 | val_sorted = sort_data_by_validation_error(shuffled) 51 | def index_of(left, right): 52 | for i in range(len(datasets)): 53 | if np.all(datasets[i][0] == (left[0].append(right[0]))): 54 | return i 55 | for i, (error, (left, right)) in zip(range(len(val_sorted)), val_sorted): 56 | self.assertEqual(i, index_of(left, right)) 57 | 58 | def test_matching_paths(self): 59 | """Since the output of the matching_paths function depends on the 60 | contents of the working directory, the tests here may have to be 61 | updated when files are added to or removed from the relevant 62 | directory.""" 63 | # Use full path to ensure that it works also when running unit tests 64 | # from another directory. 65 | here_wc = os.path.join(_PATH_TO_HERE, "*") 66 | wildcards = [here_wc, "test", "py$", "output"] 67 | self.assertEqual(matching_paths(wildcards), 68 | [os.path.join(_PATH_TO_HERE, "test_output.py")]) 69 | wildcards = [here_wc, "__+", ".py$"] 70 | self.assertEqual(matching_paths(wildcards), 71 | [os.path.join(_PATH_TO_HERE, "__init__.py")]) 72 | there_wc = os.path.join(SG_DATA_PATH, "bchydro", "*") 73 | wildcards = [there_wc, "area", "[89]"] 74 | targets = [os.path.join(SG_DATA_PATH, "bchydro", fname) for fname in \ 75 | ("2008controlareaload.csv", "jandec2009controlareaload.csv")] 76 | self.assertEqual(matching_paths(wildcards), targets) 77 | 78 | 79 | 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | 84 | -------------------------------------------------------------------------------- /sg/utils/test_timer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import StringIO 3 | import time 4 | 5 | from timer import * 6 | 7 | class TimerTester(unittest.TestCase): 8 | def _wrapped_timing(self, stream): 9 | t = SimpleTimer(stream) 10 | time.sleep(0.1) 11 | 12 | def test_report_when_out_of_scope(self): 13 | stream = StringIO.StringIO() 14 | self._wrapped_timing(stream) 15 | output = stream.getvalue() 16 | self.assertIn("Started at", output) 17 | self.assertIn("Ended at", output) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /sg/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import cPickle as pickle 3 | import os 4 | import tempfile 5 | 6 | import numpy as np 7 | 8 | import testutils as testutils 9 | from utils import * 10 | 11 | _PATH_TO_HERE = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | class NormalizerTester(testutils.ArrayTestCase): 14 | def setUp(self): 15 | self._data = np.array([0., 1., 2., 3., 4., 5.]) 16 | self._norm = np.array([0., 1./5, 2./5, 3./5, 4./5, 1.]) 17 | self._shifted = np.array([-3., -2., -1., 0., 1., 2.]) 18 | self._2d_data = np.array([[0., 1., 2.], [3., 4., 5.]]) 19 | self._2d_norm = np.array([[0., 1./5, 2./5], [3./5, 4./5, 1.]]) 20 | 21 | def test_normalize(self): 22 | normalizer = Normalizer(self._data) 23 | self.assertArraysEqual(normalizer.normalized, self._norm) 
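# Judging from the fixtures above, Normalizer does min-max scaling against the
# data passed to its constructor: the minimum here is 0 and the range is 5, so
# e.g. 3. maps to 3/5 = 0.6 (see self._norm).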
24 | self.assertArraysEqual(normalizer.normalize(self._data), self._norm) 25 | 26 | def test_normalize_other_data(self): 27 | normalizer = Normalizer(self._data) 28 | self.assertArraysAlmostEqual(normalizer.normalize(self._shifted), 29 | self._norm - 3./5) 30 | 31 | def test_expand(self): 32 | normalizer = Normalizer(self._data) 33 | self.assertArraysEqual(normalizer.expand(self._norm), self._data) 34 | 35 | def test_expand_other_data(self): 36 | normalizer = Normalizer(self._data) 37 | shifted_norm = np.array([-0.6, -0.4, -0.2, 0., 0.2, 0.4]) 38 | expanded = normalizer.expand(shifted_norm) 39 | self.assertArraysEqual(expanded, self._shifted) 40 | 41 | def test_twodim_flatten(self): 42 | normalizer = Normalizer(self._2d_data) 43 | self.assertArraysEqual(normalizer.normalized, self._2d_norm) 44 | self.assertArraysEqual(normalizer.normalize(self._data), self._norm) 45 | 46 | def test_twodim_axis_0(self): 47 | normalizer = Normalizer(self._2d_data, axis=0) 48 | self.assertArraysEqual(normalizer.normalized, 49 | np.array([[0, 0, 0], [1, 1, 1]])) 50 | self.assertArraysEqual(normalizer.expand([[-1, 0, 2], [1, 0.5, 2]]), 51 | np.array([[-3, 1, 8], [3, 2.5, 8]])) 52 | 53 | def test_twodim_axis_1(self): 54 | normalizer = Normalizer(self._2d_data, axis=1) 55 | self.assertArraysEqual(normalizer.normalized, 56 | np.array([[0, 0.5, 1], [0, 0.5, 1]])) 57 | 58 | 59 | class MiscTester(testutils.ArrayTestCase): 60 | def _test_enum_values(self, enum): 61 | self.assertEqual(enum.ZERO, 0) 62 | self.assertEqual(enum.ONE, 1) 63 | self.assertEqual(enum.TWO, 2) 64 | self.assertEqual(enum.NOT_THREE, 4) 65 | with self.assertRaises(AttributeError) as cm: 66 | x = enum.NONEXISTING 67 | 68 | def _make_enum(self): 69 | return Enum('ZERO', 'ONE', 'TWO', NOT_THREE=4) 70 | 71 | def test_enum_create(self): 72 | numbers = self._make_enum() 73 | self._test_enum_values(numbers) 74 | 75 | def test_pickle_enum(self): 76 | numbers = self._make_enum() 77 | storage = tempfile.NamedTemporaryFile(prefix='_test_utils_deleteme_', 78 | dir=_PATH_TO_HERE) 79 | pickle.dump(numbers, storage) 80 | storage.flush() 81 | storage.seek(0) 82 | numbers2 = pickle.load(storage) 83 | self._test_enum_values(numbers2) 84 | 85 | def test_indicer_values(self): 86 | manual = dict((('one', 0), 87 | ('two', 1), 88 | ('three', 2))) 89 | indices = indicer('one', 'two', 'three') 90 | self.assertEqual(indices, manual) 91 | 92 | def test_bound(self): 93 | self.assertEqual(bound(0, 1, -1), 0) 94 | self.assertEqual(bound(0, 1, 2), 1) 95 | self.assertEqual(bound(0, 1, 0.5), 0.5) 96 | 97 | def test_flatten(self): 98 | lists = ((1, 2), (3, 4), (5, 6)) 99 | flats = [1, 2, 3, 4, 5, 6] 100 | sublists = (((1, 2), (3, 4)), ((5, 6))) 101 | subflats = [(1, 2), (3, 4), 5, 6] 102 | self.assertEqual(flatten(*lists), flats) 103 | self.assertEqual(flatten(*sublists), subflats) 104 | 105 | def test_safe_flatten(self): 106 | l = [[1, 2, 3], 9, [[11, 12], [13, 14]], 22, 24] 107 | shallow = [1, 2, 3, 9, [11, 12], [13, 14], 22, 24] 108 | deep = [1, 2, 3, 9, 11, 12, 13, 14, 22, 24] 109 | self.assertEqual(list(safe_shallow_flatten(l)), shallow) 110 | self.assertEqual(list(safe_deep_flatten(l)), deep) 111 | 112 | def test_diffinv_determined(self): 113 | x = np.arange(10) 114 | diffed = np.diff(x) 115 | self.assertArraysEqual(diffed, np.ones(len(x) - 1)) 116 | self.assertArraysEqual(diffinv(diffed, xi=0), x) 117 | diffed = np.diff(x, n=2) 118 | self.assertArraysEqual(diffinv(diffed, n=2, xi=[0, 1]), x) 119 | # Difference increases by one each step 120 | x = np.array([1, 2, 4, 7, 11, 
16, 22, 29, 37, 46]) 121 | self.assertArraysEqual(np.diff(x), np.arange(1, len(x))) 122 | self.assertArraysEqual( 123 | diffinv(np.diff(x, n=2), n=2, xi=[1, 2]), x) 124 | 125 | def test_diffinv_roundtrip(self): 126 | diffed = np.arange(10) 127 | for diff_order in range(10): 128 | xi = np.arange(diff_order) 129 | x = diffinv(diffed, n=diff_order, xi=xi) 130 | re_diff = np.diff(x, n=diff_order) 131 | re_x = diffinv(re_diff, n=diff_order, xi=xi) 132 | self.assertArraysEqual(diffed, re_diff) 133 | self.assertArraysEqual(x, re_x) 134 | 135 | if __name__ == "__main__": 136 | unittest.main() 137 | -------------------------------------------------------------------------------- /sg/utils/testutils.py: -------------------------------------------------------------------------------- 1 | """Unit testing utilities.""" 2 | 3 | import os 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | class ArrayTestCase(unittest.TestCase): 9 | """This class adds some extra assertions in order to simplify working with 10 | numpy matrices. Uses oddCamelCase in public method names to be consistent 11 | with the asserts in unittest.TestCase.""" 12 | 13 | def assertArraysAlmostEqual(self, first, second, 14 | places=7, msg=None, delta=None): 15 | """Assert that two or more numpy arrays have the same shape and contain 16 | elements that are almost equal.""" 17 | self._generic_multi_array_assert(self.assertAlmostEqual, first, second, 18 | places=places, msg=msg, delta=delta) 19 | 20 | def assertArraysEqual(self, first, second, msg=None): 21 | """Assert that two or more numpy arrays have the same shape and contain 22 | elements that are equal.""" 23 | self._generic_multi_array_assert(self.assertEqual, first, second, 24 | msg=msg) 25 | 26 | def assertNaNArraysEqual(self, first, second, msg=None): 27 | """Assert that two or more numpy arrays have the same shape and contain 28 | elements that are equal. This is the same as assertArraysEqual, but 29 | with the addition of NaN == NaN.""" 30 | nans_first = np.isnan(first) 31 | nans_second = np.isnan(second) 32 | self.assertArraysEqual(nans_first, nans_second, msg=msg) 33 | self.assertArraysEqual(first[np.where(nans_first == False)[0]], 34 | second[np.where(nans_second == False)[0]], msg=msg) 35 | 36 | def _assert_are_arrays(self, *arrays): 37 | """Check that all the arrays passed in are actually numpy arrays.""" 38 | for array in arrays: 39 | self.assertIsInstance(array, np.ndarray) 40 | 41 | def _assert_same_shape_arrays(self, *arrays): 42 | """Check that all the arrays passed in have the same shape.""" 43 | self.assertGreater(len(arrays), 1) 44 | shape1 = arrays[0].shape 45 | for ar in arrays[1:]: 46 | self.assertEqual(shape1, ar.shape) 47 | 48 | def _assert_are_similar_arrays(self, *arrays): 49 | """Check that the arrays passed in are "similar": they are all numpy 50 | arrays with the same shape.""" 51 | self._assert_are_arrays(*arrays) 52 | self._assert_same_shape_arrays(*arrays) 53 | 54 | def _generic_multi_array_assert(self, assertion, first, second, **kwargs): 55 | """Generic array assert for several arrays. 
Checks that the arrays are 56 | similar, then performs the assertion element-by-element.""" 57 | self._assert_are_similar_arrays(first, second) 58 | flats = [array.flatten() for array in (first, second)] 59 | for elements in zip(*flats): 60 | assertion(*elements, **kwargs) 61 | 62 | -------------------------------------------------------------------------------- /sg/utils/testutils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/testutils.pyc -------------------------------------------------------------------------------- /sg/utils/timer.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import sys 4 | from datetime import timedelta as dt 5 | 6 | class SimpleTimer(): 7 | """Basic timer class, initial code by Lester. 8 | 9 | Basic usage: 10 | timer = SimpleTimer() 11 | ...lost of slow code here... 12 | # Optional, will be called by destructor unless called manually: 13 | report = timer.end() 14 | print report 15 | 16 | Use cProfile for more in-depth profiling.""" 17 | 18 | def __init__(self, output_stream=sys.stdout): 19 | """Start timing (may be restarted by explicit calls to start()). Output 20 | printed by start() and end() will be printed to output_stream unless 21 | this is Null.""" 22 | self.times = [] 23 | self.labels = [] 24 | self._has_ended = False 25 | self._stream = output_stream 26 | self.start() 27 | 28 | def __del__(self): 29 | if (not self._has_ended) and self._stream is not None: 30 | print >>self._stream, self.end() 31 | 32 | def start(self): 33 | self.times = [time.time()] 34 | self.labels = ["start"] 35 | if self._stream is not None: 36 | print >>self._stream, "Started at", time.asctime() 37 | 38 | def end(self): 39 | self._has_ended = True 40 | self.lap("end") 41 | if self._stream is not None: 42 | print >>self._stream, "Ended at", time.asctime() 43 | return self.report() 44 | 45 | def lap(self,label): 46 | self.times.append(time.time()) 47 | self.labels.append(label) 48 | 49 | @staticmethod 50 | def seconds_to_string(seconds): 51 | whole_secs = int(seconds) 52 | micros = int((seconds - whole_secs) * 1000000) 53 | delta = dt(seconds=whole_secs, microseconds=micros) 54 | if micros == 0: 55 | return str(delta) 56 | else: 57 | return str(delta)[:-4] 58 | 59 | @staticmethod 60 | def period_to_string(start_time, end_time): 61 | seconds = (end_time - start_time) 62 | return SimpleTimer.seconds_to_string(seconds) 63 | 64 | def report(self): 65 | s = "Finished in %s: " % self.period_to_string(self.times[0], 66 | self.times[-1]) 67 | for i in range(1,len(self.labels)-1): 68 | s += "%s %s, " % (self.labels[i], 69 | self.period_to_string(self.times[i-1], 70 | self.times[i])) 71 | return s[:-2] + "." 
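# A lap-timing sketch (illustrative; the function names are placeholders):
#
#   timer = SimpleTimer()
#   load_data()
#   timer.lap("load")
#   train_model()
#   timer.lap("train")
#   print timer.end()   # e.g. "Finished in 0:00:07: load 0:00:02, train 0:00:05."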
72 | 73 | 74 | if __name__ == "__main__": 75 | from unittest import main 76 | main(module="test_" + __file__[:-3]) 77 | 78 | -------------------------------------------------------------------------------- /sg/utils/timer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/timer.pyc -------------------------------------------------------------------------------- /sg/utils/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axeltidemann/load_forecasting/d324a711a1a0c7ccd9587e0ecf9988a12214a1a3/sg/utils/utils.pyc --------------------------------------------------------------------------------