├── h5_data
    └── .gitignore
├── raw_data
    └── download_igra.sh
├── CHANGELOG.md
├── LICENSE-DATA
├── +GCSAL
    ├── +H5
    │   ├── recursive_load.m
    │   ├── create_and_write.m
    │   ├── fullpath.m
    │   └── load.m
    ├── +Map
    │   ├── world_map.m
    │   ├── find_nearest.m
    │   ├── multipatch.m
    │   ├── inpolygon2.m
    │   ├── map_stations_by_country.m
    │   └── find_in_lat_long_range.m
    ├── +IGRA
    │   ├── mat2h5.m
    │   ├── datafile2mat_dir.m
    │   ├── mat2h5_dir.m
    │   ├── datafile2mat.m
    │   ├── format_definitions.m
    │   └── Param.m
    └── GCSAL.m
├── LICENSE
├── CONTRIBUTING
├── IGRA_to_h5_example.m
├── CODE_OF_CONDUCT.md
├── GCSAL_ex2.m
├── README.md
└── GCSAL_ex1.m


/h5_data/.gitignore:
--------------------------------------------------------------------------------
1 | gcsal.h5
2 | gcsal.h5.info.mat
3 | 


--------------------------------------------------------------------------------
/raw_data/download_igra.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | #
 5 | # This source code is licensed under the MIT license found in the
 6 | # LICENSE file in the root directory of this source tree.
 7 | 
 8 | # Mirror the IGRA data-por directory with wget. The cd below relies on the
 9 | # mirror being stored under ./ftp.ncdc.noaa.gov/... in the current directory.
10 | wget --mirror "ftp://ftp.ncdc.noaa.gov/pub/data/igra/data/data-por/"
11 | 
12 | # Move into the downloaded directory; give up if it is not there
13 | cd "ftp.ncdc.noaa.gov/pub/data/igra/data/data-por" || exit
14 | 
15 | # Unzip every .zip found below this directory into this directory.
16 | # unzip -n skips files that already exist instead of overwriting them.
17 | find ./ -name '*.zip' -exec unzip -n '{}' ';'
18 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | May 25, 2019
 2 | - Updated license from BSD to MIT
 3 | 
 4 | Oct 19, 2018
 5 | - Added CODE_OF_CONDUCT.md and CHANGELOG.md files
 6 | - Updated copyright information in all source code files
 7 | 
 8 | Dec 5, 2017
 9 | - Added functionality to search by latitude range and for nearest N stations.
10 | - Made searches faster with optional new modes that suppress plotting, waitbars, and text output.
11 | - Revised example file and added new one.  
12 | - All changes are backward compatible.
13 | 
14 | Oct 27, 2017
15 | - Initial release
16 | 


--------------------------------------------------------------------------------
/LICENSE-DATA:
--------------------------------------------------------------------------------
 1 | For NOAA Integrated Global Radiosonde Archive (IGRA) data
 2 | 
 3 | World Meteorological Organization (WMO) Resolution 40 NOAA Policy
 4 | NCEI data and products that contain international data may have conditions
 5 | placed on their international commercial use. They can be used within the United
 6 | States or for noncommercial international activities without restriction.
 7 | Redistribution of these data by others must provide this same notification.
 8 | The non-U.S. data cannot be redistributed for commercial purposes. For details,
 9 | please consult the WMO policy.
10 | 


--------------------------------------------------------------------------------
/+GCSAL/+H5/recursive_load.m:
--------------------------------------------------------------------------------
 1 | function [  ] = recursive_load( h5_file, info )
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % recursive_load( h5_file, info )
 8 | %   Test helper that walks the group tree described by info and calls
 9 | %   GCSAL.H5.load(h5_file, node) on the first node of each branch whose
10 | %   Datasets field is not empty. Nothing is returned.
11 | %
12 | %   info must have the fields Datasets and Groups, where Groups is an
13 | %   array of structs with the same two fields.
14 | 
15 | 
16 | % A node that holds datasets is loaded as is; its own Groups are not visited
17 | if ~isempty(info.Datasets)
18 |     GCSAL.H5.load(h5_file, info);
19 |     return
20 | end
21 | 
22 | % No datasets at this level: descend into every child group
23 | for k = 1:length(info.Groups)
24 |     GCSAL.H5.recursive_load( h5_file, info.Groups(k));
25 | end
26 | 
27 | 
28 | end
29 | 


--------------------------------------------------------------------------------
/+GCSAL/+H5/create_and_write.m:
--------------------------------------------------------------------------------
 1 | function create_and_write(filename, datasetname, data)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % [] = create_and_write(filename, datasetname, data)
 8 | %   Creates the dataset datasetname in filename with h5create, using the
 9 | %   size and class of data, and then stores data in it with h5write.
10 | %   When data is empty nothing is created or written.
11 | 
12 | 
13 | % Nothing to store
14 | if isempty(data)
15 |     return
16 | end
17 | 
18 | % The dataset is created with exactly the shape and type of the data
19 | data_size  = size(data);
20 | data_class = class(data);
21 | h5create(filename, datasetname, data_size, 'Datatype', data_class)
22 | h5write( filename, datasetname, data)
23 | 
24 | end
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/world_map.m:
--------------------------------------------------------------------------------
 1 | function world_map(countries)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % world_map(countries)
 8 | %   Draws a cartesian map of the world in the current axes: one background
 9 | %   patch for the ocean and one or more patches per country.
10 | %
11 | %   countries should be a struct array in which every element has the
12 | %   fields Lat and Lon holding the latitude/longitude of the country
13 | %   borders in degrees.
14 | 
15 | 
16 | % Colors used for the map
17 | ocean_color  = [0 1 1];     % cyan
18 | land_color   = [1 1 0];     % yellow
19 | border_color = [.7 .7 .7];  % grey
20 | 
21 | % Ocean background: a single rectangle spanning all longitudes/latitudes
22 | lon_corners = [-180 -180 180 180];
23 | lat_corners = [-90 90 90 -90];
24 | patch(lon_corners, lat_corners, ocean_color)
25 | 
26 | % One call per country. multipatch is used instead of patch because the
27 | % borders may contain NaN to separate non-continuous pieces
28 | for k = 1:length(countries)
29 |     border = countries(k);
30 |     GCSAL.Map.multipatch(border.Lon, border.Lat, land_color, ...
31 |         'EdgeColor', border_color);
32 | end
33 | 
34 | % Equal scaling on both axes so the map is not distorted
35 | axis equal
36 | 
37 | % Clip the view to the valid longitude and latitude ranges
38 | axis([-180 180 -90 90])
39 | 
40 | end
41 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/mat2h5.m:
--------------------------------------------------------------------------------
 1 | function [ ] = mat2h5( mat_filename, h5_filename, station_id )
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % mat2h5( mat_filename, h5_filename, station_id )
 8 | %   Loads mat_filename and writes its contents to h5_filename.
 9 | %
10 | %   Every variable stored in the .mat file is expected to be a struct whose
11 | %   fields are GCSAL.IGRA.Param objects. The h5write method of each of
12 | %   those objects is called with h5_filename and the h5 path
13 | %   station_id/<variable name>.
14 | 
15 | 
16 | % Read every variable of the .mat file into one struct
17 | contents = load(mat_filename);
18 | var_names = fieldnames(contents);
19 | 
20 | for i = 1:length(var_names)
21 | 
22 |     % h5 path for this variable, rooted at the station id
23 |     h5path = GCSAL.H5.fullpath(station_id, var_names{i});
24 | 
25 |     % Struct holding the parameters of this variable
26 |     params = contents.(var_names{i});
27 |     param_names = fieldnames(params);
28 | 
29 |     % Let every parameter object write itself below h5path
30 |     for j = 1:length(param_names)
31 |         curr_param = params.(param_names{j});
32 |         curr_param.h5write(h5_filename, h5path);
33 |     end
34 | end
35 | 
36 | end
37 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/find_nearest.m:
--------------------------------------------------------------------------------
 1 | function [stations, arclen] = find_nearest(all_stations, lat, lon, n)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % [stations, arclen] = find_nearest(all_stations, lat, lon, n)
 8 | %
 9 | %   Returns the n stations of all_stations that are nearest to the
10 | %   specified lat/lon, ordered from nearest to farthest. Distances are
11 | %   computed with distance() on the WGS84 reference ellipsoid.
12 | %
13 | %   lat and lon must be single values each in degrees.
14 | %
15 | %   If n is larger than the number of stations, all stations are returned.
16 | %   If all_stations is empty, both outputs are empty.
17 | %
18 | % INPUTS
19 | %   all_stations - struct array, each element contains the fields lat and
20 | %                  lon in degrees
21 | %            lat - scalar defining reference latitude in degrees.
22 | %            lon - scalar defining reference longitude in degrees.
23 | %              n - number of nearest stations to return
24 | %
25 | % OUTPUTS
26 | %       stations - struct array, the (at most) n elements of all_stations
27 | %                  nearest to lat/lon, nearest first
28 | %         arclen - vector of distances from lat/lon to each returned
29 | %                  station, in the length unit of the ellipsoid (meters)
30 | 
31 | 
32 |     % Nothing to search: return empty results instead of indexing past
33 |     % the end of an empty distance vector
34 |     if isempty(all_stations)
35 |         stations = all_stations;
36 |         arclen = [];
37 |         return
38 |     end
39 | 
40 |     % Distance from the reference point to every station
41 |     lats = [all_stations(:).lat];
42 |     lons = [all_stations(:).lon];
43 |     E = referenceEllipsoid('wgs84');
44 |     dist = distance(lats, lons, lat, lon, E);
45 | 
46 |     % Nearest first
47 |     [dist, order] = sort(dist);
48 | 
49 |     % Never ask for more stations than exist
50 |     n_keep = min(n, numel(order));
51 | 
52 |     stations = all_stations(order(1:n_keep));
53 |     arclen = dist(1:n_keep);
54 | 
55 | end
56 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/multipatch.m:
--------------------------------------------------------------------------------
 1 | function patch_handles = multipatch( x, y, varargin )
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % patch_handles = multipatch( x, y, varargin )
 8 | %   Like built in matlab function patch but allows for NaN values in x and
 9 | %   y to separate multiple patches. x and y must have the same size and may
10 | %   be row or column vectors. varargin allows for any additional inputs to
11 | %   the patch function.
12 | %
13 | %   returns patch_handles, a vector of handles returned by each call to
14 | %   patch(). It is [] when no patch was drawn.
15 | 
16 | 
17 | if ~isequal(size(x), size(y))
18 |     error('x and y must be same size')
19 | end
20 | 
21 | % Indices where either x or y is NaN; these separate the patches.
22 | % find returns a column for column input, so force a row vector to keep
23 | % the horizontal concatenation below valid for both orientations.
24 | breaks = reshape(find(isnan(x) | isnan(y)), 1, []);
25 | 
26 | % breaks is used for start/stop indices with any index in breaks being
27 | % skipped over. To make the for loop smooth, add indices 0 and numel+1
28 | breaks = [0 breaks numel(x)+1];
29 | 
30 | % initialize patch_handles
31 | patch_handles = [];
32 | 
33 | % Loop through consecutive pairs of break indices
34 | for i = 2:length(breaks)
35 | 
36 |     % Indices strictly between the previous and the next break
37 |     idx = breaks(i-1)+1:breaks(i)-1;
38 | 
39 |     % Adjacent breaks (or a break at either end) leave nothing to draw
40 |     if ~isempty(idx)
41 |         % Create a new patch with values at idx
42 |         patch_handles(end+1) = patch(x(idx), y(idx), varargin{:}); %#ok<AGROW>
43 |     end
44 | end
45 | 
46 | end
47 | 


--------------------------------------------------------------------------------
/CONTRIBUTING:
--------------------------------------------------------------------------------
 1 | # Contributing to GCSAL
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Our Development Process
 6 | How changes are synced with internal changes to the project will be defined in
 7 | the future.
 8 | 
 9 | ## Pull Requests
10 | We actively welcome your pull requests.
11 | 
12 | 1. Fork the repo and create your branch from `master`.
13 | 2. If you've added code that should be tested, add tests.
14 | 3. If you've changed APIs, update the documentation.
15 | 4. Ensure the test suite passes.
16 | 5. Make sure your code lints.
17 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
18 | 
19 | ## Contributor License Agreement ("CLA")
20 | In order to accept your pull request, we need you to submit a CLA. You only need
21 | to do this once to work on any of Facebook's open source projects.
22 | 
23 | Complete your CLA here: <https://code.facebook.com/cla>
24 | 
25 | ## Issues
26 | We use GitHub issues to track public bugs. Please ensure your description is
27 | clear and has sufficient instructions to be able to reproduce the issue.
28 | 
29 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
30 | disclosure of security bugs. In those cases, please go through the process
31 | outlined on that page and do not file a public issue.
32 | 
33 | ## Coding Style
34 | * 2 spaces for indentation rather than tabs
35 | * 80 character line length
36 | 
37 | ## License
38 | By contributing to GCSAL, you agree that your contributions will be licensed
39 | under the LICENSE file in the root directory of this source tree.
40 | 


--------------------------------------------------------------------------------
/+GCSAL/+H5/fullpath.m:
--------------------------------------------------------------------------------
 1 | function out = fullpath(filepart1, varargin)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % F = fullpath(filepart1, filepart2, ..., filepartN) builds a full
 8 | %     path specification F from the parts specified using / for the
 9 | %     fileseparator regardless of operating system. At least two parts
10 | %     are required and all of them must be character arrays. The output
11 | %     of fullpath is equivalent to
12 | %
13 | %        F = [filepart1 / filepart2 / ... / filepartN]
14 | %
15 | %     except that, at each join, one trailing / of the left part and one
16 | %     leading / of the right part are dropped so the separator is not
17 | %     doubled.
18 | %
19 | %     Empty parts are accepted and contribute only their separator, e.g.
20 | %     fullpath('', 'a') returns '/a' and fullpath('a', '') returns 'a/'.
21 | 
22 | 
23 | % Error check for number of inputs
24 | if isempty(varargin)
25 |     error('fullpath expects at least 2 inputs')
26 | end
27 | 
28 | % Join the parts from left to right
29 | out = filepart1;
30 | for k = 1:numel(varargin)
31 |     next_part = varargin{k};
32 | 
33 |     % Check that inputs are strings
34 |     if ~ischar(out) || ~ischar(next_part)
35 |         error('fullpath expects string inputs')
36 |     end
37 | 
38 |     % Remove / from end of the path built so far if it exists. The
39 |     % isempty guard keeps (end) from erroring on an empty part.
40 |     if ~isempty(out) && out(end) == '/'
41 |         out(end) = [];
42 |     end
43 | 
44 |     % Remove / from start of the next part if it exists
45 |     if ~isempty(next_part) && next_part(1) == '/'
46 |         next_part(1) = [];
47 |     end
48 | 
49 |     % Concatenate with /
50 |     out = [out '/' next_part]; %#ok<AGROW>
51 | end
52 | 
53 | end
54 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/inpolygon2.m:
--------------------------------------------------------------------------------
 1 | function [in] = inpolygon2(x,y,xv,yv)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % in = inpolygon2(x,y,xv,yv)
 8 | %   Determines which of the points (x,y) lie inside the polygon (xv,yv).
 9 | %
10 | % INPUTS
11 | %   (xv,yv): polygon vertices. Row or column vectors are accepted. If the
12 | %            last vertex differs from the first one, the polygon is closed
13 | %            automatically.
14 | %   (x,y):   point coordinates specified as (x,y) pairs. x and y should be
15 | %            vectors of same size.
16 | % OUTPUTS:
17 | %   in : logical column vector with one element per point (0 means point
18 | %        is outside and 1 means point is inside). All elements are 0 when
19 | %        the polygon is empty.
20 | %
21 | % Implementation is based on the winding number algorithm explained in
22 | % http://geomalgorithms.com/a03-_inclusion.html
23 | %
24 | % Example
25 | %       xv = rand(6,1); yv = rand(6,1);
26 | %       xv = [xv ; xv(1)]; yv = [yv ; yv(1)];
27 | %       x = rand(1000,1); y = rand(1000,1);
28 | %       in = GCSAL.Map.inpolygon2(x,y,xv,yv);
29 | %       plot(xv,yv,x(in),y(in),'.r',x(~in),y(~in),'.b')
30 | 
31 | 
32 | % Work on column copies so the vertical concatenation used to close the
33 | % polygon below is valid for row-vector input as well
34 | xv = xv(:);
35 | yv = yv(:);
36 | 
37 | np=length(x); % number of points to be evaluated
38 | wn=zeros(np,1); % winding numbers, starts with all points outside
39 | 
40 | % An empty polygon contains no points (and xv(1) would be out of range)
41 | if isempty(xv)
42 |     in=logical(wn);
43 |     return
44 | end
45 | 
46 | % Close the polygon by repeating the first vertex if necessary
47 | if ((xv(1) ~= xv(end)) || (yv(1) ~= yv(end)))
48 |         xv = [xv ; xv(1)];
49 |         yv = [yv ; yv(1)];
50 | end
51 | 
52 | n=length(xv)-1; % number of polygon edges
53 | 
54 | for j=1:np
55 |     for i=1:n
56 |         if (yv(i) <=y(j))
57 |             % Edge starts at or below the point: count it if it crosses
58 |             % upward with the point strictly on its left
59 |             if (yv(i+1) > y(j))
60 |                 if (is_point_on_left(x(j),y(j),xv(i),yv(i),xv(i+1),yv(i+1))>0)
61 |                     wn(j)=wn(j)+1;
62 |                 end
63 |             end
64 |         else
65 |             % Edge starts above the point: count it if it crosses
66 |             % downward with the point strictly on its right
67 |             if (yv(i+1) <= y(j))
68 |                 if (is_point_on_left(x(j),y(j),xv(i),yv(i),xv(i+1),yv(i+1))<0)
69 |                     wn(j)=wn(j)-1;
70 |                 end
71 |             end
72 |         end
73 |     end
74 | end
75 | 
76 | in=logical(wn); % nonzero winding number means inside
77 | 
78 | 
79 | function isleft=is_point_on_left(px,py,p1Lx,p1Ly,p2Lx,p2Ly)
80 | % Determine if point (px,py) is on the left | right | on the line that
81 | % runs from (p1Lx,p1Ly) to (p2Lx,p2Ly).
82 | % isleft > 0 if p is on the left, isleft = 0 if p is along the line,
83 | % isleft < 0 if p is on the right. The value is the determinant of the
84 | % two vectors below, not a normalized -1/0/1 flag.
85 | 
86 | p1_to_p2=[p2Lx-p1Lx;p2Ly-p1Ly];
87 | p1_to_p=[px-p1Lx;py-p1Ly];
88 | isleft = det([p1_to_p2 p1_to_p]);
89 | end
90 | 
91 | end
92 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/datafile2mat_dir.m:
--------------------------------------------------------------------------------
 1 | function datafile2mat_dir(in_dir, out_dir, overwrite_mat, filespec)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % [] = datafile2mat_dir(in_dir, out_dir, overwrite_mat, filespec)
 8 | %   Calls GCSAL.IGRA.datafile2mat on every file in in_dir that matches
 9 | %   filespec, passing overwrite_mat and out_dir through to it, and prints
10 | %   progress and throughput information after each file.
11 | %
12 | %   All inputs are optional and will revert to a default value if not provided.
13 | %
14 | % INPUTS
15 | %          in_dir - Directory to look for IGRA data files.
16 | %                   Default: current working directory
17 | %         out_dir - Directory for the output .mat files
18 | %                   Default: current working directory
19 | %   overwrite_mat - Flag whether existing .mat files are overwritten or
20 | %                   skipped (forwarded to GCSAL.IGRA.datafile2mat)
21 | %                   Default: true
22 | %        filespec - filespec used to identify IGRA data files
23 | %                   Default: '*-data.txt'
24 | 
25 | 
26 | % Fill in defaults for trailing arguments that were not supplied
27 | if nargin < 1
28 |     in_dir = pwd;
29 | end
30 | 
31 | if nargin < 2
32 |     out_dir = pwd;
33 | end
34 | 
35 | if nargin < 3
36 |     overwrite_mat = true;
37 | end
38 | 
39 | if nargin < 4
40 |     filespec = '*-data.txt';
41 | end
42 | 
43 | % List the files in in_dir that match filespec
44 | listing = dir(fullfile(in_dir, filespec));
45 | 
46 | % Total size of all files in MB, used for the progress estimate
47 | total_MB = sum([listing.bytes])/1e6;
48 | N_files = length(listing);
49 | 
50 | fprintf('Reading %d files totalling %.1f MB in %s\n', N_files, total_MB, in_dir);
51 | 
52 | % Running totals
53 | read_MB = 0;
54 | time_so_far = 0;
55 | t_all = tic;
56 | 
57 | % Convert the files one at a time
58 | for i = 1:N_files
59 |     t_file = tic;
60 |     curr_name = listing(i).name;
61 |     curr_MB = listing(i).bytes/1e6;
62 |     fprintf('%04d: Reading %s, %5.1f MB', i, curr_name, curr_MB);
63 | 
64 |     GCSAL.IGRA.datafile2mat( fullfile(in_dir, curr_name), overwrite_mat, out_dir );
65 | 
66 |     % Update totals
67 |     read_MB = read_MB + curr_MB;
68 |     curr_time = toc(t_file);
69 |     time_so_far = time_so_far + curr_time;
70 | 
71 |     % Progress is measured in bytes processed; the projected total time
72 |     % extrapolates the elapsed time linearly
73 |     prct_complete = read_MB/total_MB;
74 |     total_time = time_so_far/prct_complete;
75 | 
76 |     % Throughput of this file and of all files so far
77 |     curr_rate = curr_MB/curr_time;
78 |     avg_rate = read_MB/time_so_far;
79 | 
80 |     fprintf(', %.0f/%.0f MB, %5.2f%% %.1f curr MB/s, %.1f avg MB/s, %.1f/%.1f seconds\n', ...
81 |         read_MB, total_MB, prct_complete*100,  curr_rate, avg_rate, time_so_far, total_time);
82 | end
83 | 
84 | % Display the overall elapsed time
85 | toc(t_all)
86 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/mat2h5_dir.m:
--------------------------------------------------------------------------------
 1 | function mat2h5_dir(in_dir, h5_filename, append, filespec)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % [] = mat2h5_dir(in_dir, h5_filename, append, filespec)
 8 | %   Calls GCSAL.IGRA.mat2h5 on the .mat files in in_dir that match
 9 | %   filespec, smallest file first, to write their data to h5_filename.
10 | %   The flag append controls whether an existing h5_filename is kept and
11 | %   extended or deleted so that the file is rebuilt from scratch. When
12 | %   appending, stations whose id is already a top level group of
13 | %   h5_filename are skipped.
14 | %
15 | %   The station id is the file name without its last 13 characters, which
16 | %   is the length of the default suffix '-data.txt.mat'.
17 | %
18 | %   All inputs are optional and will revert to a default value if not provided.
19 | %
20 | % INPUTS
21 | %          in_dir - Directory to look for .mat data files.
22 | %                   Default: current working directory
23 | %     h5_filename - File to write data to
24 | %                   Default: 'gcsal.h5' inside in_dir
25 | %          append - Flag whether to append to existing h5 file or start new
26 | %                   Default: true
27 | %        filespec - filespec used to identify .mat files
28 | %                   Default: '*-data.txt.mat'
29 | 
30 | 
31 | % Fill in defaults for trailing arguments that were not supplied
32 | if nargin < 1
33 |     in_dir = pwd;
34 | end
35 | 
36 | if nargin < 2
37 |     h5_filename = fullfile(in_dir, 'gcsal.h5');
38 | end
39 | 
40 | if nargin < 3
41 |     append = true;
42 | end
43 | 
44 | if nargin < 4
45 |     filespec = '*-data.txt.mat';
46 | end
47 | 
48 | % List the files in in_dir that match filespec
49 | listing = dir(fullfile(in_dir, filespec));
50 | 
51 | % Process the smallest files first
52 | [~, order] = sort([listing.bytes], 'ascend');
53 | listing = listing(order);
54 | 
55 | N_files = length(listing);
56 | 
57 | % Names of the groups already stored in the h5 file. Stays empty when the
58 | % file does not exist or is deleted to start from scratch.
59 | names = {};
60 | h5_found = exist(h5_filename, 'file') ~= 0;
61 | if h5_found && append
62 |     info = h5info(h5_filename);
63 |     names = {info.Groups.Name};
64 | elseif h5_found
65 |     delete(h5_filename)
66 | end
67 | 
68 | fprintf('Processing %d files in %s\n', N_files, in_dir);
69 | 
70 | % Running totals
71 | time_so_far = 0;
72 | t_all = tic;
73 | 
74 | % Number of trailing characters removed from a file name to get the
75 | % station id, i.e. the length of '-data.txt.mat'
76 | suffix_len = 13;
77 | 
78 | % Convert the files one at a time
79 | for i = 1:N_files
80 |     t_file = tic;
81 |     curr_name = listing(i).name;
82 |     fprintf('Reading %s %04d/%04d ', curr_name, i, N_files);
83 | 
84 |     % Station id doubles as the top level h5 group name
85 |     station_id = ['/' curr_name(1:end-suffix_len)];
86 | 
87 |     % Only write stations that are not already in the h5 file
88 |     if ~ismember(station_id, names)
89 |         mat_filename = fullfile(in_dir, curr_name);
90 |         GCSAL.IGRA.mat2h5( mat_filename, h5_filename, station_id )
91 |     end
92 | 
93 |     % Progress is measured in files processed; the projected total time
94 |     % extrapolates the elapsed time linearly
95 |     curr_time = toc(t_file);
96 |     time_so_far = time_so_far + curr_time;
97 |     prct_complete = i/N_files;
98 |     total_time = time_so_far/prct_complete;
99 |     avg_rate = time_so_far/i; % average seconds per file
100 |     fprintf('%3.0f%% Curr: %.2f sec, Avg: %.2f sec, %4.0f/%.0f sec\n', ...
101 |         prct_complete*100, curr_time, avg_rate, time_so_far, total_time);
102 | end
103 | 
104 | % Display the overall elapsed time
105 | toc(t_all)
106 | 
107 | end
108 | 


--------------------------------------------------------------------------------
/IGRA_to_h5_example.m:
--------------------------------------------------------------------------------
 1 | % Copyright (c) Facebook, Inc. and its affiliates.
 2 | %
 3 | % This source code is licensed under the MIT license found in the
 4 | % LICENSE file in the root directory of this source tree.
 5 | %
 6 | % This example script takes the GCSAL process from downloading the source
 7 | % data from the NOAA website through to creating a GCSAL Matlab object.
 8 | %
 9 | % The steps of this example script are only necessary if you want to
10 | % download and re-build the GCSAL library from the original source data.
11 | % Typically this is not necessary as you can use the provided
12 | % .h5 file available on the website. Reasons you might want to do this:
13 | %   1. Want to update the data with the latest measurements
14 | %   2. Want to make a change to the way data is stored in the .h5 file
15 | 
16 | 
17 | %% Set up paths and constants - Change the paths below as necessary
18 | clear all; close all; clc;
19 | 
20 | %%%%%%%%%%%%%% CHANGE THESE %%%%%%%%%%%%%%%%%%%%%%%%
21 | % Base directory. Text files should be in a directory called txt in this
22 | % folder
23 | base_dir = './raw_data/';
24 | 
25 | % Directory to save .h5 file
26 | h5_dir = './h5_data/';
27 | 
28 | % Directory containing the code. The package folder +GCSAL should be
29 | % directly inside this directory
30 | codebase_dir = './';
31 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
32 | 
33 | % Directory containing IGRA .txt files
34 | txt_dir = fullfile(base_dir, 'txt');
35 | 
36 | % filespec is used to find the source .txt files.
37 | filespec = '*-data.txt'; % Use this to process all matching .txt files
38 | % filespec = 'BC*-data.txt'; % Use this for testing on a small subset of files
39 | 
40 | % Directory to store .mat files
41 | mat_dir = fullfile(base_dir, 'mat');
42 | 
43 | % Full path to .h5 file
44 | h5_file = fullfile(h5_dir, 'gcsal.h5');
45 | 
46 | % Set up Matlab path
47 | addpath(genpath(codebase_dir))
48 | 
49 | %% Step 1: Download IGRA data from NOAA
50 | 
51 | % Run "bash ./raw_data/download_igra.sh" on the command line from this directory
52 | % This will take a while as you are downloading > 70 gb of data
53 | 
54 | 
55 | %% Step 2: Convert IGRA txt file to .mat
56 | 
57 | % Set overwrite_mat true if you want to start from scratch and overwrite any
58 | % .mat files that have already been made and exist in mat_dir
59 | overwrite_mat = true; %false;
60 | GCSAL.IGRA.datafile2mat_dir(txt_dir, mat_dir, overwrite_mat, filespec);
61 | 
62 | %% Step 3: Convert .mat to .h5
63 | 
64 | % Set append_flag false if you want to start from scratch and delete the .h5
65 | % file if it has previously been made and exists at h5_file
66 | append_flag = false; %true;
67 | GCSAL.IGRA.mat2h5_dir(mat_dir, h5_file, append_flag, [filespec '.mat']);
68 | 
69 | %% Step 4: Load GCSAL object from h5 file
70 | 
71 | % The first time you create the object by pointing to the h5_file. This
72 | % will create a .mat file which can be used after the first time
73 | g = GCSAL.GCSAL(h5_file);
74 | 
75 | 
76 | %%
77 | % Now you can look in GCSAL_ex1.m and GCSAL_ex2.m to learn about how to use
78 | % the GCSAL object
79 | 
80 | % Search for stations. NOTE(review): no filter arguments are passed here - confirm what station_search() returns by default
81 | stations = g.station_search();
82 | 
83 | % Time a call to counts() for the 'wspd' parameter over the stations found
84 | t = tic;
85 | [N, entries] = g.counts(stations, 'wspd');
86 | toc(t)
87 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/map_stations_by_country.m:
--------------------------------------------------------------------------------
 1 | function countries = map_stations_by_country(all_stations, countries_data)
 2 | % Copyright (c) Facebook, Inc. and its affiliates.
 3 | %
 4 | % This source code is licensed under the MIT license found in the
 5 | % LICENSE file in the root directory of this source tree.
 6 | %
 7 | % countries = map_stations_by_country(all_stations, countries_data)
 8 | %   For each country in countries_data, finds the stations in all_stations
 9 | %   that GCSAL.Map.inpolygon2 reports as inside the borders of the country,
10 | %   based on the latitude and longitude of the stations and countries.
11 | %
12 | %   Returns a struct array countries where each element contains the
13 | %   country name, its borders and the ids of the stations found in it.
14 | %
15 | %   If countries_data is not input, the function loads the variable
16 | %   worldData from ne_110m_admin_0_countries.mat if that file exists on the
17 | %   path. If it does not, a message is printed and an empty struct is
18 | %   returned.
19 | %
20 | % INPUTS
21 | %     all_stations - struct array, each element should contain the fields
22 | %                    lat and lon of the station in degrees and id for the
23 | %                    identifier of the station. The ids are stacked with
24 | %                    vertcat, so they must all have the same length.
25 | %   countries_data - struct array, each element should contain Lat and Lon of
26 | %                    the borders of the country in degrees as well as name
27 | %                    (or NAME) to identify the country
28 | %
29 | % OUTPUTS
30 | %    countries - struct array, each element contains name, Lat, Lon, and
31 | %                stations. name is a string. Lat and Lon are the borders
32 | %                of the country in degrees. stations is a string matrix
33 | %                with each row an id for a station contained in that
34 | %                country.
35 | 
36 | 
37 | % If countries_data not given, try to load it from the default .mat file
38 | if nargin < 2
39 |     fname = 'ne_110m_admin_0_countries.mat';
40 | 
41 |     % Without the file there is nothing to map against
42 |     if ~exist(fname, 'file')
43 |         fprintf('Could not load country data. %s file not found\n', fname)
44 |         countries = struct([]);
45 |         return
46 |     end
47 | 
48 |     fprintf('Loading countries data from %s\n', which(fname))
49 |     loaded = load(fname);
50 |     countries_data = loaded.worldData;
51 | end
52 | 
53 | 
54 | % Some data sets use name and others use NAME; name takes precedence
55 | name_field = '';
56 | if isfield(countries_data, 'name')
57 |     name_field = 'name';
58 | elseif isfield(countries_data, 'NAME')
59 |     name_field = 'NAME';
60 | end
61 | 
62 | if isempty(name_field)
63 |     error('countries_data is missing the name field')
64 | end
65 | 
66 | names = {countries_data.(name_field)};
67 | 
68 | % Initialize output with Lat, Lon, and name fields from countries_data.
69 | % stations starts out empty for every country.
70 | countries = struct('Lat', {countries_data.Lat}, ...
71 |                    'Lon', {countries_data.Lon}, ...
72 |                    'name', names, ...
73 |                    'stations', '');
74 | 
75 | 
76 | % Collect the station coordinates and ids into arrays
77 | stations_lat = [all_stations.lat];
78 | stations_long = [all_stations.lon];
79 | stations_id = vertcat(all_stations.id);
80 | 
81 | % Test all stations against the borders of one country at a time
82 | for k = 1:length(countries)
83 | 
84 |     border = countries(k);
85 | 
86 |     % Logical array marking the stations found inside this country
87 |     inside = GCSAL.Map.inpolygon2(stations_long, stations_lat, ...
88 |         border.Lon, border.Lat);
89 | 
90 |     % Keep the id rows of those stations
91 |     countries(k).stations = stations_id(inside,:);
92 | 
93 | end
94 | 
95 | 
96 | end
97 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at <opensource-conduct@fb.com>. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 
78 | 


--------------------------------------------------------------------------------
/+GCSAL/+H5/load.m:
--------------------------------------------------------------------------------
  1 | function [ out ] = load( filename, info )
  2 | % Copyright (c) Facebook, Inc. and its affiliates.
  3 | %
  4 | % This source code is licensed under the MIT license found in the
  5 | % LICENSE file in the root directory of this source tree.
  6 | %
  7 | % [ out ] = load( filename, info )
  8 | %   Load and uncompress the data stored in h5 file filename under the group
  9 | %   info. info is a specific Group found in the struct returned by h5info()
 10 | 
 11 | % Dataset names that may appear in a compressed group
 12 | flds = {'data', 'len', 'idx', 'i_unique'};
 13 | 
 14 | % Names of the datasets actually present in this group
 15 | dsets = {info.Datasets.Name};
 16 | 
 17 | % Error check for datasets outside the expected list
 18 | unexpected_flds = setdiff(dsets, flds);
 19 | if ~isempty(unexpected_flds)
 20 |     msg = sprintf('  %s\n', unexpected_flds{:});
 21 |     error(['Unexpected datasets encountered:\n%s' ...
 22 |     'Expected datasets are data, len, idx, and i_unique.'], msg) %#ok<SPERR>
 23 | end
 24 | 
 25 | % Read the expected datasets that are present. Starting from an empty struct
 26 | % lets a group with none of them reach the error raised by uncompress_data
 27 | compressed_data = struct();
 28 | for i = 1:length(flds)
 29 |     if ismember(flds{i}, dsets)
 30 |         compressed_data.(flds{i}) = h5read(filename, GCSAL.H5.fullpath(info.Name, flds{i}));
 31 |     end
 32 | end
 33 | 
 34 | % Uncompress the data based on the datasets that were found
 35 | out = uncompress_data(compressed_data);
 36 | end
 37 | 
 38 | 
 39 | function out = uncompress_data(in)
 40 | % Expands the compressed representation held in the struct in and returns
 41 | % the uncompressed array. Which fields are present decides how.
 42 | 
 43 | % When i_unique is present, index data with it to expand data, then drop
 44 | % the field so only data/idx/len remain for the dispatch below
 45 | if isfield(in, 'i_unique')
 46 |     in.data = in.data(in.i_unique,:);
 47 |     in = rmfield(in, 'i_unique');
 48 | end
 49 | 
 50 | % Sorted names of the remaining fields, joined into a single key
 51 | names = sort(fieldnames(in));
 52 | key = strjoin(names', ',');
 53 | 
 54 | % Dispatch on the combination of fields that was found
 55 | switch key
 56 |     case 'data,idx,len'
 57 |         % Values together with the rows they occupy in a len-row output
 58 |         out = uncompress_data_idx_len(in.data, in.idx, in.len);
 59 | 
 60 |     case 'data,len'
 61 |         % Values together with the expected number of rows
 62 |         out = uncompress_data_len(in.data, in.len);
 63 | 
 64 |     case 'data'
 65 |         % Only data is given, so there is nothing to expand
 66 |         out = in.data;
 67 | 
 68 |     case 'len'
 69 |         % Only the length is given: return a NaN column of that length
 70 |         out = NaN(in.len, 1);
 71 | 
 72 |     otherwise
 73 |         msg = sprintf('  %s\n', names{:});
 74 |         error('Unexpected combination of datasets found:\n%s', msg)
 75 | end
 76 | 
 77 | end
 78 | 
 79 | function out = uncompress_data_idx_len(data, idx, len)
 80 | % Rebuilds the len-row array described by data, idx, and len. Rows that
 81 | % idx does not address stay NaN.
 82 | 
 83 |     % Row and column counts of the compressed values
 84 |     [n_rows, n_cols] = size(data);
 85 | 
 86 |     % Output starts out as NaN with len rows
 87 |     out = NaN(len, n_cols);
 88 | 
 89 |     % Expand the compressed index and note how many rows it addresses
 90 |     row_idx = GCSAL.IGRA.Param.uncompress_idx(idx, len);
 91 |     n_idx = length(row_idx);
 92 | 
 93 |     % A single row of data is shared by every indexed row, so replicate it
 94 |     if n_rows == 1
 95 |         data = repmat(data, n_idx, 1);
 96 |     end
 97 | 
 98 |     % After replication there has to be one data row per indexed row
 99 |     if size(data, 1) ~= n_idx
100 |         error('Numer of rows in data does not match length of idx')
101 |     end
102 | 
103 |     % Place the values into their rows of the output
104 |     out(row_idx,:) = data;
105 | end
106 | 
107 | function out = uncompress_data_len(data, len)
108 | % Returns the array described by data and its expected row count len.
109 | %
110 | % Single-row data is handed back as is; it is not replicated to len rows.
111 | % Any other row count has to equal len, otherwise an error is raised.
112 | 
113 |     n_rows = size(data, 1);
114 | 
115 |     % Only data with a row count other than one is checked against len
116 |     if n_rows ~= 1
117 |         if n_rows ~= len
118 |             error('Number of rows in data does not match len')
119 |         end
120 |     end
121 | 
122 |     % In every non-error case data already is the result
123 |     out = data;
124 | end
125 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/datafile2mat.m:
--------------------------------------------------------------------------------
  1 | function [ out ] = datafile2mat( filename, overwrite_mat, output_directory )
  2 | % Copyright (c) Facebook, Inc. and its affiliates.
  3 | %
  4 | % This source code is licensed under the MIT license found in the
  5 | % LICENSE file in the root directory of this source tree.
  6 | %
  7 | % [ out ] = datafile2mat( filename, overwrite_mat [opt], output_directory [opt] )
  8 | %  Parses IGRA data in filename and saves the data to a .mat file in
  9 | %  output_directory. Additionally returns the data in out.
 10 | %
 11 | %  If a .mat file with a matching name is found on the path and
 12 | %  overwrite_mat is set to false, this function returns an empty vector and
 13 | %  does not modify the existing .mat file or create a new one.
 14 | %
 15 | % INPUTS
 16 | %          filename - filename for IGRA data text file to be parsed
 17 | %     overwrite_mat - flag for whether to overwrite or ignore if a matching
 18 | %                     .mat file is found to already exist
 19 | %                     Default: true
 20 | %  output_directory - directory to save .mat file
 21 | %                     Default: current working directory
 22 | %
 23 | % OUTPUTS
 24 | %               out - struct with fields header and entries, or [] when an
 25 | %                     existing .mat file was left in place
 26 | 
 27 | % Set default values
 28 | if ~exist('overwrite_mat', 'var')
 29 |     overwrite_mat = true;
 30 | end
 31 | 
 32 | if ~exist('output_directory', 'var')
 33 |     output_directory = pwd;
 34 | end
 35 | 
 36 | % Format name of .mat file
 37 | [~, file_no_path, ext] = fileparts(filename);
 38 | mat_filename = fullfile(output_directory, [file_no_path ext '.mat']);
 39 | 
 40 | % If the .mat file already exists and overwriting is disabled, leave it
 41 | % alone and return empty
 42 | if exist(mat_filename, 'file') && ~overwrite_mat
 43 |     out = [];
 44 |     return
 45 | end
 46 | 
 47 | % Get formatting definitions
 48 | defs = GCSAL.IGRA.format_definitions( );
 49 | 
 50 | % open the file
 51 | [fid, msg] = fopen(filename);
 52 | 
 53 | % Throw error on bad file
 54 | if fid == (-1)
 55 |     error(message('MATLAB:fileread:cannotOpenFile', filename, msg));
 56 | end
 57 | 
 58 | % Read text file as uint8. Working directly in uint8 is more efficient for
 59 | % operations on large datasets. Also we are safe to assume that the IGRA
 60 | % data files contain only single-byte (ASCII) characters so all characters
 61 | % can be represented with 1 byte instead of the 2 bytes of a char
 62 | %
 63 | % Additionally the entire text file is read in one line for speed. This is
 64 | % significantly faster than a while loop with fgetl().
 65 | try
 66 |     % read file
 67 |     orig_txt = fread(fid,'char=>uint8');
 68 | catch exception
 69 |     % close file before passing the error on with its original stack
 70 |     fclose(fid);
 71 |     rethrow(exception);
 72 | end
 73 | 
 74 | % close file
 75 | fclose(fid);
 76 | 
 77 | % Lines of text associated with header information begin with #
 78 | i_hash = find(orig_txt == uint8('#'));
 79 | 
 80 | % Form indices into orig_txt for the location of header text characters.
 81 | % The start of each row of header_txt is given by i_hash and the width of
 82 | % each row of header text is fixed
 83 | header_idx = bsxfun(@plus, 0:defs.header.row_width, i_hash);
 84 | header_txt = orig_txt(header_idx);
 85 | 
 86 | % Find non-header text by simply removing the header text from the original
 87 | % text array
 88 | no_header_txt = orig_txt;
 89 | no_header_txt(header_idx) = [];
 90 | 
 91 | % Now reshape the non-header text so that each row is a line of text with
 92 | % fixed width.
 93 | if mod(length(no_header_txt), defs.entries.row_width+2) == 0
 94 |     entries_txt = reshape(no_header_txt, defs.entries.row_width+2, [])';
 95 | else
 96 |     error('File length not expected. Check for interrupted data')
 97 | end
 98 | 
 99 | % Parse header and entries text
100 | out.header =  txt2params(defs.header, header_txt);
101 | out.entries = txt2params(defs.entries, entries_txt);
102 | 
103 | % Save to mat file. Use -v6 for faster loading (v6 option does not compress
104 | % data so you get larger files but faster loading)
105 | save(mat_filename, '-v6', '-struct', 'out')
106 | 
107 | end
108 | 
109 | 
110 | function out = txt2params(def, text_mat)
111 | % Builds a struct with one GCSAL.IGRA.Param per field of def.params, each
112 | % constructed from that field's definition and the text matrix text_mat.
113 | 
114 | param_names = fieldnames(def.params);
115 | for k = 1:length(param_names)
116 |     curr_name = param_names{k};
117 |     out.(curr_name) = GCSAL.IGRA.Param(def.params.(curr_name), text_mat);
118 | end
119 | end
120 | 


--------------------------------------------------------------------------------
/GCSAL_ex2.m:
--------------------------------------------------------------------------------
  1 | % Copyright (c) Facebook, Inc. and its affiliates.
  2 | %
  3 | % This source code is licensed under the MIT license found in the
  4 | % LICENSE file in the root directory of this source tree.
  5 | %
  6 | % This example file uses the Global Climate Statistical Analysis Library
  7 | % class to find the stations nearest to a location and to look at the wind
  8 | % speed and wind direction statistics gathered from those stations.
  9 | 
 10 | %% Step 1 is to download the source .h5 file and set up your paths correctly
 11 | 
 12 | %clear;
 13 | close all; clc;
 14 | % Skip the setup when a GCSAL.GCSAL object g already exists in the workspace
 15 | if (~exist('g','var') || ~isa(g,'GCSAL.GCSAL'))
 16 |     % The gcsal.h5 and gcsal.h5.info.mat files are available for download from the
 17 |     % website and should be placed in the h5_data directory.
 18 |     
 19 |     %%%%%%%%%%%%%% CHANGE THESE %%%%%%%%%%%%%%%%%%
 20 |     % Set this to wherever you put the gcsal.h5 file and gcsal.h5.info.mat
 21 |     % files downloaded from dropbox
 22 |     h5_dir = './h5_data/';
 23 |     
 24 |     % Directory to code. The +GCSAL package folder should be located
 25 |     % in this directory
 26 |     codebase_dir = './';
 27 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 28 |     
 29 |     % Full paths to the .h5 file and to the .mat file with h5 info
 30 |     h5_file = fullfile(h5_dir, 'gcsal.h5');
 31 |     h5_mat_file = [h5_file '.info.mat'];
 32 |     
 33 |     % Set up Matlab path
 34 |     addpath(genpath(codebase_dir))
 35 |     
 36 |     %% Load GCSAL object from .mat file
 37 |     
 38 |     % This requires about 6 gb of RAM just to hold all of the header
 39 |     % information in memory
 40 |     
 41 |     % Normally you should load the GCSAL object from the .mat file but if it
 42 |     % doesn't exist on your path you can use the .h5 file. After using the .h5
 43 |     % file a .mat file will be created automatically for subsequent use
 44 |     if ~exist(h5_mat_file, 'file')
 45 |         g = GCSAL.GCSAL(h5_file);
 46 |     else
 47 |         g = GCSAL.GCSAL(h5_mat_file);
 48 |         g.h5_fname = h5_file;
 49 |     end
 50 | end
 51 | 
 52 | % Find the 20 stations nearest to Cape Town, South Africa
 53 | 
 54 | loc_lat = -33.938655;
 55 | loc_lon = 18.63863;
 56 | nsamples = 20;
 57 | stations = g.station_search('Nearest', [loc_lat loc_lon], 'Number', nsamples);
 58 | 
 59 | % Of the 20 nearest stations, grab the least number such that
 60 | % at each altitude between 18 and 25 km, there are at least
 61 | % 300 wind samples per month. For each month the counts N are summed over
 62 | % their second dimension and interpolated at 18:25 on stats.x.
 63 | tooFew = true;
 64 | i = 0;
 65 | % Add one station per pass; stop when enough samples or all stations used
 66 | while tooFew
 67 |     i = i + 1;
 68 |     sttns = stations(1:i);
 69 |     lows = zeros(12,1);
 70 |     for k = 1:12
 71 |         [N, entries, stats] = g.counts2(sttns, 'gph', 'wspd', ...
 72 |             'FilterFields', {'month'}, 'FilterRanges', {k}, ...
 73 |             'Verbose', false, 'Plot', false);
 74 |         lows(k) = min(interp1(stats.x, sum(N,2), [18:25]));
 75 |     end
 76 |     if ((min(lows) >= 300) || (i == nsamples))
 77 |         tooFew = false;
 78 |     end
 79 | end
 80 | % Keep only the first i stations, i.e. the ones used in the last pass
 81 | stations = stations(1:i);
 82 | fprintf('Need %d stations\n', length(stations));
 83 | % Plot arclen of each kept station (presumably meters, shown in km - confirm)
 84 | figure;
 85 | plot([stations(:).arclen]/1e3,'-x');grid;
 86 | xlabel('Station Index');
 87 | ylabel('Station Distance [km]');
 88 | title('Station Distance from Capetown');
 89 | 
 90 | % Get wind speed/dir stats over all months for the kept stations
 91 | [N1, entries1, stats1] = g.counts2(stations, 'gph', 'wspd', 'Verbose', false);
 92 | [N2, entries2, stats2] = g.counts2(stations, 'gph', 'wdir', 'Verbose', false);
 93 | 
 94 | % stats2.x is gph vector
 95 | % stats2.y is wdir vector
 96 | % NOTE(review): column 23 of pdf is assumed to be the 22.5 km bin - confirm
 97 | % At constant altitude
 98 | figure;plot(stats1.y,stats1.pdf(:,23));grid;
 99 | title('Yearly Wind Speed PDF At 22.5 km');
100 | figure;plot(stats2.y,stats2.pdf(:,23));grid;
101 | title('Yearly Wind Direction PDF At 22.5 km');
102 | % Draw 1e4 wind speeds from stats1.y, weighted by the PDF, and histogram them
103 | s = stats1.y;
104 | p = stats1.pdf(:,23)';
105 | wspds = datasample(s, 1e4, 'Weights', p, 'Replace', true);
106 | figure;h1 = histogram(wspds,'BinMethod','sturges');grid;
107 | title('Sampled Yearly Wind Speed PDF At 22.5 km');
108 | % Same weighted sampling for wind direction
109 | s = stats2.y;
110 | p = stats2.pdf(:,23)';
111 | wdirs = datasample(s, 1e4, 'Weights', p, 'Replace', true);
112 | figure;h2 = histogram(wdirs,'BinMethod','sturges');grid;
113 | title('Sampled Yearly Wind Direction PDF At 22.5 km');
114 | 
115 | %% Reproduce distribution
116 | 
117 | % y1max = max(stats1.y);
118 | % y1min = min(stats1.y);
119 | %
120 | % y2max = max(stats2.y);
121 | % y2min = min(stats2.y);
122 | %
123 | % n = 1000;
124 | % wind = zeros(n,1);
125 | % dir = zeros(n,1);
126 | % nsamples = 5;
127 | % for j = 1:n
128 | %     p = rand(nsamples,1);      % sample from uniform dist
129 | %     y1 = p * (y1max - y1min) + y1min;   % Get wind values from sample
130 | %     probs = interp1(stats1.y,stats1.pdf(:,23),y1);   % Probability of each wind
131 | %     [~,idx] = sort(probs);  % Find max probability
132 | %     wind(j) = y1(idx(end));
133 | %
134 | %     p = rand(nsamples,1);      % sample from uniform dist
135 | %     y2 = p * (y2max - y2min) + y2min;   % Get wind values from sample
136 | %     probs = interp1(stats2.y,stats2.pdf(:,23),y2);   % Probability of each wind
137 | %     [~,idx] = sort(probs);  % Find max probability
138 | %     dir(j) = y2(idx(end));
139 | % end
140 | % figure;histogram(wind);figure;histogram(dir);
141 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Global Climate Statistical Analysis Library (GCSAL)
  2 | GCSAL is a software package in MATLAB that allows the user to
  3 | - automatically download the Integrated Global Radiosonde Archive (IGRA) raw
  4 | text data from the NOAA website
  5 | - efficiently process and save the data in a h5 hierarchical file
  6 | - quickly access the data, aggregate statistics, and generate plots
  7 | 
  8 | ## Examples
  9 | The following example files are provided in the root directory.  
 10 | * **GCSAL_ex1.m**, **GCSAL_ex2.m**, etc. go through the most commonly used functions
 11 | of the Global Climate Statistical Analysis Library class. These examples include
 12 | analysis of the probability distributions of different atmospheric data based on
 13 | location and time of day or time of year.
 14 | * **IGRA_to_h5_example.m** takes the GCSAL process from downloading the source
 15 | data from NOAA website through to creating a GCSAL Matlab object.  These steps
 16 | are only necessary if you want to download and re-build the GCSAL library from
 17 | the original source data. Typically this is not necessary as you can use the
 18 | provided .h5 file on the website. Reasons you might want to do this are 1) want
 19 | to update the data with the latest measurements or 2) want to make a change to the
 20 | way the data is stored in the .h5 file.
 21 | 
 22 | ## Requirements
 23 | GCSAL requires MATLAB 2017 and runs on Mac, Linux, or Windows OS
 24 | 
 25 | ## Building GCSAL
 26 | No building required.
 27 | 
 28 | ## Installing GCSAL
 29 | No installation is required.  The user needs to download two data files,
 30 | *gcsal.h5* and *gcsal.h5.info.mat*, and put them in the *h5_data* directory.
 31 | - gcsal.h5: https://www.dropbox.com/s/2m3glr0drhds33l/gcsal.h5?dl=0
 32 | - gcsal.h5.info.mat: https://www.dropbox.com/s/ks9fs3xombb9xqs/gcsal.h5.info.mat?dl=0
 33 | 
 34 | ## How GCSAL works
 35 | GCSAL consists of 2 main groups of functions.  The first group is for processing
 36 | the text files efficiently and storing the data in the h5 hierarchical file
 37 | format.  The second group of functions is for querying the H5 data file and
 38 | creating maps and statistical plots.
 39 | 
 40 | ## Full documentation
 41 | The Global Climate Statistical Analysis Library (GCSAL) allows one to view
 42 | climate statistics formulated from over 60 years of radiosonde data from weather
 43 | balloons launched at more than 3000 locations around the world! The GCSAL efficiently
 44 | processes and compresses the 80 GB of raw text data into 17 GB of data in the h5
 45 | hierarchical file format. It provides a simple MATLAB interface to access the
 46 | vast quantities of climate data. One can view statistical distributions and
 47 | perform statistical operations on the following quantities from sea level to
 48 | 30 km altitude: wind speed, wind direction, temperature, pressure, dewpoint
 49 | depression, and relative humidity.
 50 | 
 51 | ### List of functions
 52 | - Text to Mat to H5
 53 |   - IGRA
 54 |     - format_definitions
 55 |     - datafile2mat
 56 |     - datafile2mat_dir
 57 |     - mat2h5
 58 |     - mat2h5_dir
 59 |     - Param
 60 |     - Methods
 61 |       - read_columns
 62 |       - h5write
 63 |       - h5write_param
 64 |       - Static
 65 |         - pad_left
 66 |         - convert_to_min_int
 67 |         - txt2data
 68 |         - compare_bytes_unique
 69 |         - str2int
 70 |         - char2numerals
 71 |         - str2float
 72 |         - bits2ints
 73 |         - ints2bits
 74 |         - compress_idx
 75 |         - uncompress_idx
 76 |       - Static (Private)
 77 |         - unique_inverse
 78 |         - compress_txt
 79 |         - remove_bad_vals
 80 |   - H5
 81 |     - create_and_write
 82 |     - fullpath
 83 |     - load
 84 |     - recursive_load
 85 | - Query H5 and histograms
 86 |   - GCSAL
 87 |     - Methods
 88 |       - counts
 89 |       - counts2
 90 |       - countsN
 91 |       - query
 92 |       - station_search
 93 |       - plot_world_map
 94 |       - find_countries
 95 |       - find_stations
 96 |       - find_headers
 97 |       - find_def
 98 |       - clear_entries_cache
 99 |     - Methods (Private)
100 |       - add_header_params_to_entries
101 |       - stations_from_latlong
102 |       - stations_from_countries
103 |       - stations_from_regex
104 |       - load_all_headers
105 |       - load_from_stations
106 |       - load_group
107 |       - load_param
108 |       - read_from_cached_entries
109 |       - cache_param
110 |       - add_entry_idx_to_headers
111 |     - Static
112 |       - filter_data_by_range
113 |       - histcounts
114 |       - histcounts2
115 |       - histcountsN
116 |       - counts2pdf
117 |       - get_bin_centers
118 |       - find_keys
119 |       - h5info_find_children
120 |       - plot_stations
121 |       - count_and_plot_entries
122 |       - default_bin_edges
123 |       - get_label
124 |       - description_from_filters
125 |       - stations_intersect
126 |       - stations_union
127 |       - stations_setxor
128 |     - Static (Private)
129 |       - struct_set_operation
130 |       - initialize_stations
131 |       - get_entry_idx_in_range
132 |       - header_to_entry_idx
133 |       - station_id_str
134 |   - Map
135 |     - find_in_lat_long_range
136 |     - find_nearest
137 |     - map_stations_by_country
138 |     - multipatch
139 |     - world_map
140 |     - inpolygon2
141 | 
142 | 
143 | ## Join the GCSAL community
144 | * POC: Greg Katz (<gregbkatz@gmail.com>) and David Liu (<zliu@fb.com>)
145 | 
146 | ## License
147 | GCSAL is MIT-licensed.
148 | 
149 | NOAA Integrated Global Radiosonde Archive (IGRA) data is licensed under the
150 | World Meteorological Organization (WMO) Resolution 40 NOAA Policy NCEI data and
151 | products.
152 | 


--------------------------------------------------------------------------------
/+GCSAL/+Map/find_in_lat_long_range.m:
--------------------------------------------------------------------------------
  1 | function [stations, latbox, longbox] = ...
  2 |     find_in_lat_long_range(all_stations, latrange, longrange)
  3 | % Copyright (c) Facebook, Inc. and its affiliates.
  4 | %
  5 | % This source code is licensed under the MIT license found in the
  6 | % LICENSE file in the root directory of this source tree.
  7 | %
  8 | % [stations, latbox, longbox] = ...
  9 | %     find_in_lat_long_range(all_stations, latrange, longrange)
 10 | %
 11 | %   Returns an array of station structs that fall within the
 12 | %   box defined by latrange and longrange from the list of stations in
 13 | %   all_stations. all_stations is a struct array with each element
 14 | %   containing lat and lon fields.
 15 | %
 16 | %   Additionally returns latbox and longbox which can be used to plot
 17 | %   the search box that was used.
 18 | %
 19 | %   latrange and longrange must be two element vectors and are
 20 | %   in degrees. This function accounts for angle wraparound. So latrange
 21 | %   could be [-45 45] to find stations in latitudes between -45 and 45
 22 | %   degrees or it could be [45 -45] to find stations with latitude above 45
 23 | %   deg or below -45.  The same goes for longrange
 24 | %
 25 | % INPUTS
 26 | %   all_stations - struct array, each element contains lat and lon in
 27 | %                  degrees; may be a row or a column array
 28 | %       latrange - two element vector defining latitude limits in degrees.
 29 | %                  Angle wraparound is OK.
 30 | %      longrange - two element vector defining longitude limits in degrees.
 31 | %                  Angle wraparound is OK.
 32 | %
 33 | % OUTPUTS
 34 | %       stations - struct array, subset of all_stations located within the
 35 | %                  lat/long search box
 36 | %         latbox - vector of latitude values in degrees that can be used to
 37 | %                  make a plot representing the search box used. This
 38 | %                  handles wraparound nicely for a map plot by inserting
 39 | %                  NaNs for a discontinuous plot line.
 40 | %        longbox - see latbox
 41 | 
 42 | % Handle angle wraparound
 43 | % After this function lat/long ranges may have multiple rows for multiple
 44 | % boxes to search. Multiple boxes occur when a lat/long range spans across
 45 | % an edge of the map where the angle wraps around.
 46 | [latranges, longranges] = latlongwrap(latrange, longrange);
 47 | 
 48 | % Extract lat/long values from all_stations struct array
 49 | all_stations_lat = [all_stations.lat];
 50 | all_stations_long = [all_stations.lon];
 51 | 
 52 | % Logical mask, sized from the extracted latitudes (not from all_stations)
 53 | % so the OR below stays element-wise when all_stations is a column array
 54 | i_stations = false(size(all_stations_lat));
 55 | 
 56 | % Initialize latbox and longbox output vectors
 57 | latbox = [];
 58 | longbox = [];
 59 | 
 60 | % Loop through the wrapped lat/long ranges
 61 | for i = 1:size(latranges, 1)
 62 |     % Get idx and lat/long box for the current range
 63 |     [idx_curr, latbox_curr, longbox_curr] = evaluate_range(...
 64 |         all_stations_lat, all_stations_long, latranges(i,:), longranges(i,:));
 65 | 
 66 |     % Accumulate the stations found in any of the boxes
 67 |     i_stations = i_stations | idx_curr;
 68 | 
 69 |     % Append lat/long box
 70 |     latbox = [latbox latbox_curr]; %#ok<AGROW>
 71 |     longbox = [longbox longbox_curr]; %#ok<AGROW>
 72 | end
 73 | 
 74 | % Index into all_stations
 75 | stations = all_stations(i_stations);
 76 | 
 77 | end
 78 | 
 79 | function [i_stations, latbox, longbox] = evaluate_range(...
 80 |     all_stations_lat, all_stations_long, latrange, longrange)
 81 | % Evaluates a single search box. i_stations is a logical index of the
 82 | % stations inside the box; latbox and longbox are the vectors returned by
 83 | % latlongbox for drawing that box.
 84 | 
 85 |     % Outline of the box, formatted for plotting
 86 |     [latbox, longbox] = latlongbox(latrange, longrange);
 87 | 
 88 |     % A station is inside the box only when both its latitude and its
 89 |     % longitude fall within the respective range
 90 |     lat_ok = find_in_range(all_stations_lat, latrange);
 91 |     long_ok = find_in_range(all_stations_long, longrange);
 92 |     i_stations = lat_ok & long_ok;
 93 | 
 94 | end
 95 | 
 96 | function in_range = find_in_range(val, range)
 97 | % Logical mask marking the elements of val that lie within the closed
 98 | % interval from range(1) to range(2)
 99 | 
100 | in_range = (range(1) <= val) & (val <= range(2));
101 | end
102 | 
103 | function [lat, long] = latlongwrap(lat, long)
104 | % Splits a lat/long search range that may wrap around the map edges into
105 | % ordinary, non-wrapping search boxes.
106 | %
107 | % lat and long come in as two element vectors in degrees. They are
108 | % returned as N-by-2 matrices whose rows hold the [lower upper] limits of
109 | % each box: N is 1 when nothing wraps, 2 when only lat or only long wraps,
110 | % and 4 when both wrap. A range counts as wrapped when, after both inputs
111 | % are normalized by dmodpi, its second element is <= its first.
112 | %
113 | % Example: lat = [45 -45] and long = [10 20] return
114 | %          lat = [45 90; -90 -45] and long = [10 20; 10 20]
115 | 
116 | % Both inputs have to be vectors
117 | if ~(isvector(lat) && isvector(long))
118 |     error('lat and long must be vectors')
119 | end
120 | 
121 | % Work with row vectors from here on
122 | lat = lat(:)';
123 | long = long(:)';
124 | 
125 | % Each of them must hold exactly two elements
126 | if ~(length(lat) == 2 && length(long) == 2)
127 |     error('lat and long must be length 2')
128 | end
129 | 
130 | % Normalize both ranges with dmodpi
131 | lat =  dmodpi(lat);
132 | long = dmodpi(long);
133 | 
134 | % Latitude outside of -90..90 is rejected
135 | if any(lat < -90 | lat > 90)
136 |     error('lat must be between -90 and 90')
137 | end
138 | 
139 | % Latitude segments: a wrapped range becomes one segment up to the top
140 | % edge of the map (90) and one starting at the bottom edge (-90)
141 | if lat(2) <= lat(1)
142 |     lat_segs = [lat(1) 90; -90 lat(2)];
143 | else
144 |     lat_segs = lat;
145 | end
146 | 
147 | % Longitude segments: a wrapped range becomes one segment up to the right
148 | % edge of the map (180) and one starting at the left edge (-180)
149 | if long(2) <= long(1)
150 |     long_segs = [long(1) 180; -180 long(2)];
151 | else
152 |     long_segs = long;
153 | end
154 | 
155 | % Pair every latitude segment with every longitude segment. With both
156 | % ranges wrapped the rows are ordered (lat1,long1), (lat1,long2),
157 | % (lat2,long1), (lat2,long2), i.e. the four corners of the map
158 | n_lat = size(lat_segs, 1);
159 | n_long = size(long_segs, 1);
160 | 
161 | % All (long, lat) index pairs, with the long index changing fastest
162 | [i_long, i_lat] = ndgrid(1:n_long, 1:n_lat);
163 | 
164 | % One row of limits per search box
165 | lat = lat_segs(i_lat(:), :);
166 | long = long_segs(i_long(:), :);
167 | end
168 | 
169 | function ang_wrapped = dmodpi(ang_deg)
170 | % Wraps angles given in degrees into the half-open interval [-180, 180)
171 | 
172 | shifted = mod(ang_deg + 180, 360);
173 | ang_wrapped = shifted - 180;
174 | end
175 | 
176 | function [lat_box, long_box] = latlongbox(latrange, longrange)
177 | % Builds the vectors used to draw the search box given by latrange and
178 | % longrange on a map.
179 | %
180 | % The four edges of the box are generated one after the other, each
181 | % followed by a NaN so the edges plot as separate line segments. An edge
182 | % whose longitude is constant at -180 or 180, or whose latitude is constant
183 | % at -90 or 90, has that coordinate replaced by NaN. This hides edges that
184 | % lie on the map boundary and shows that the box wraps to the other side.
185 | 
186 | % Corner points of the box; the first corner is repeated at the end to
187 | % close the circuit
188 | corner_long = [longrange(1) longrange(2) longrange(2) longrange(1) longrange(1)];
189 | corner_lat =  [latrange(1)  latrange(1)  latrange(2)  latrange(2)  latrange(1)];
190 | 
191 | % One cell per edge, concatenated after the loop
192 | n_edges = 4;
193 | long_parts = cell(1, n_edges);
194 | lat_parts = cell(1, n_edges);
195 | 
196 | for k = 1:n_edges
197 |     % End points of edge k, blanked out if the edge lies on a map boundary
198 |     edge_long = NaN_if_all_same_extreme(corner_long(k:k+1), [-180 180]);
199 |     edge_lat =  NaN_if_all_same_extreme(corner_lat(k:k+1),  [-90 90]);
200 | 
201 |     % Trailing NaN separates this edge from the next one
202 |     long_parts{k} = [edge_long NaN];
203 |     lat_parts{k} =  [edge_lat  NaN];
204 | end
205 | 
206 | long_box = [long_parts{:}];
207 | lat_box = [lat_parts{:}];
208 | end
209 | 
function vals = NaN_if_all_same_extreme(vals, extremes)
% Replace vals with NaN when every element of vals is the same value and
% that value is one of the entries in extremes. Otherwise vals is returned
% untouched.
%
% INPUTS
%   vals     - vector of values to test (at least two elements are needed
%              for a replacement to happen)
%   extremes - values that count as "extreme"

n_vals = length(vals);
anchor = vals(1);

% Compare every other element against the first one
same_as_anchor = (anchor == vals(2:n_vals));

% Nothing to compare, or at least one element differs: leave vals alone
if isempty(same_as_anchor) || ~all(same_as_anchor(:))
    return
end

% All elements agree; blank them if the shared value is an extreme
if any(anchor == extremes)
    vals(1:n_vals) = NaN;
end

end
222 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/format_definitions.m:
--------------------------------------------------------------------------------
  1 | function defs = format_definitions( )
  2 | % Copyright (c) Facebook, Inc. and its affiliates.
  3 | %
  4 | % This source code is licensed under the MIT license found in the
  5 | % LICENSE file in the root directory of this source tree.
  6 | %
  7 | % [ defs ] = igra_format_definitions( )
  8 | % Returns a struct containing the format definitions for igra data and
  9 | % stations list text files
 10 | %
 11 | % See the following references for more information on the IGRA text file
 12 | % format.
 13 | % Ref: https://www1.ncdc.noaa.gov/pub/data/igra/data/igra2-data-format.txt
 14 | % Ref: https://www1.ncdc.noaa.gov/pub/data/igra/igra2-list-format.txt
 15 | 
 16 | 
 17 | % Load up definitions in cell matrix format
 18 | headers_cell =  headers_format_definition_as_cell_matrix;
 19 | entries_cell =  entries_format_definition_as_cell_matrix;
 20 | stations_cell = stations_format_definition_as_cell_matrix;
 21 | 
 22 | % Convert definitions to structs
 23 | defs.header    = cell2struct(headers_cell);
 24 | defs.entries   = cell2struct(entries_cell);
 25 | defs.stations  = cell2struct(stations_cell);
 26 | 
 27 | 
 28 | end
 29 | 
 30 | function def = cell2struct(cell_matrix)
 31 | 
 32 | %%%% Convert cell arrays to structs
 33 | % field names for each column in the above cell matrices
 34 | columns = {'varname', 'type', 'col_idx', 'bad_vals', ...
 35 |            'function_handle', 'units', 'description'};
 36 | 
 37 | % Loop through the rows in the cell array
 38 | def.row_width = 0; % initialize row_width
 39 | for i_var = 1:size(cell_matrix, 1)
 40 |     curr_varname = cell_matrix{i_var, 1};
 41 | 
 42 |     % Loop through the columns
 43 |     for i_col = 1:length(columns)
 44 |         curr_col_name = columns{i_col};
 45 | 
 46 |         % Assign struct field to cell array element
 47 |         def.params.(curr_varname).(curr_col_name) = cell_matrix{i_var, i_col};
 48 |     end
 49 | 
 50 |     % Get row_widht by finding the maximum col_idx value
 51 |     def.row_width = max(def.row_width, def.params.(curr_varname).col_idx(end));
 52 | end
 53 | 
 54 | end
 55 | 
 56 | function out = scale_func(x, scale_factor)
 57 | % Helper function for applying a scale_factor and converting to single
 58 | % This is defined her for use in the function_handle column of the
 59 | % definitions below
 60 | 
 61 |     out = single(x)*scale_factor;
 62 | 
 63 | end
 64 | 
 65 | function def = headers_format_definition_as_cell_matrix()
 66 | % Source: https://www1.ncdc.noaa.gov/pub/data/igra/data/igra2-data-format.txt
 67 | % ---------------------------------
 68 | % Variable   Columns  Type
 69 | % ---------------------------------
 70 | % HEADREC       1-  1  Character
 71 | % ID            2- 12  Character
 72 | % YEAR         14- 17  Integer
 73 | % MONTH        19- 20  Integer
 74 | % DAY          22- 23  Integer
 75 | % HOUR         25- 26  Integer
 76 | % RELTIME      28- 31  Integer
 77 | % NUMLEV       33- 36  Integer
 78 | % P_SRC        38- 45  Character
 79 | % NP_SRC       47- 54  Character
 80 | % LAT          56- 62  Integer
 81 | % LON          64- 71  Integer
 82 | % ---------------------------------
 83 | def = {
 84 |     'id',         'char',   2:12,  {},     [], '', 'Station ID';
 85 |     'year',       'uint16', 14:17, {},     [], '', 'Year';
 86 |     'month',      'uint8',  19:20, {},     [], '', 'Month';
 87 |     'day',        'uint8',  22:23, {},     [], '', 'Day';
 88 |     'hour',       'uint8',  25:26, {},     [], '', 'Hour';
 89 |     'reltime_hr', 'uint8',  28:29, {'99'}, [], '', 'Release Time Hour';
 90 |     'reltime_min','uint8',  30:31, {'99'}, [], '', 'Release Time Minute';
 91 |     'numlevs',    'uint32', 33:36, {},     [], '', '# of levels in the sounding';
 92 |     'p_src',      'char',   38:45, {''},   [], '', 'Data Source Code for Pressure Levels';
 93 |     'np_src',     'char'    47:54, {''},   [], '', 'Data Source Code for Non-pressure Levels';
 94 |     'lat',        'int32',  56:62, {},     @(x)scale_func(x,1e-4), 'deg', 'Latittude';
 95 |     'lon',        'int32',  64:71, {},     @(x)scale_func(x,1e-4), 'deg', 'Longitude';
 96 |     };
 97 | end
 98 | 
 99 | function def = entries_format_definition_as_cell_matrix()
100 | % Source: https://www1.ncdc.noaa.gov/pub/data/igra/data/igra2-data-format.txt
101 | % -------------------------------
102 | % Variable        Columns Type
103 | % -------------------------------
104 | % LVLTYP1         1-  1   Integer
105 | % LVLTYP2         2-  2   Integer
106 | % ETIME           4-  8   Integer
107 | % PRESS          10- 15   Integer
108 | % PFLAG          16- 16   Character
109 | % GPH            17- 21   Integer
110 | % ZFLAG          22- 22   Character
111 | % TEMP           23- 27   Integer
112 | % TFLAG          28- 28   Character
113 | % RH             29- 33   Integer
114 | % DPDP           35- 39   Integer
115 | % WDIR           41- 45   Integer
116 | % WSPD           47- 51   Integer
117 | % -------------------------------
118 | % defs.entries = {
119 | %     'lvltyp1',   'uint8',  1,     {},                 [], '';
120 | %     'lvltyp2',   'uint8',  2,     {},                 [], '';
121 | %     'etime',     'int32',  4:8,   {'-8888', '-9999'}, [], '';
122 | %     'press',     'uint32', 10:15, {'-8888', '-9999'}, [], 'Pa';
123 | %     'pflag',     'char',   16,    {''},               [], '';
124 | %     'gph',       'uint16', 17:21, {'-8888', '-9999'}, @(x)scale_func(x, 1e-3), 'km';
125 | %     'zflag',     'char',   22,    {''},               [], '';
126 | %     'temp',      'uint16', 23:27, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), 'deg C';
127 | %     'tflag',     'char',   28,    {''},               [], '';
128 | %     'rh',        'uint16', 29:33, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), '%';
129 | %     'dpdp',      'uint16', 35:39, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), 'deg C'
130 | %     'wdir',      'uint16', 41:45, {'-8888', '-9999'}, [], 'deg from North';
131 | %     'wspd',      'uint16', 47:51, {'-8888', '-9999'}, @(x)scale_func(x,0.1), 'm/s';
132 | %     };
133 | 
134 | def = {
135 |     'lvltyp1',   'uint8',  1,     {},                 [], '', '';
136 |     'lvltyp2',   'uint8',  2,     {},                 [], '', '';
137 |     'etime_min', 'int32',  4:6,   {'-88', '-99'},     [], '', '';
138 |     'etime_sec', 'int32',  7:8,   { '88',  '99'},     [], '', '';
139 |     'press',     'int32',  10:15, {'-8888', '-9999'}, [], 'PA', 'Pressure';
140 |     'pflag',     'char',   16,    {''},               [], '', 'Pressure Flag';
141 |     'gph',       'int32',  17:21, {'-8888', '-9999'}, @(x)scale_func(x, 1e-3), 'km', 'Altitude';
142 |     'zflag',     'char',   22,    {''},               [], '', 'Altitude Flag';
143 |     'temp',      'int16',  23:27, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), 'deg C', 'Temperature';
144 |     'tflag',     'char',   28,    {''},               [], '', 'Temperature Flag';
145 |     'rh',        'int16',  29:33, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), '%', 'Relative Humidity';
146 |     'dpdp',      'int16',  35:39, {'-8888', '-9999'}, @(x)scale_func(x, 0.1), 'deg C', 'Dewpoint Depresesion';
147 |     'wdir',      'int16',  41:45, {'-8888', '-9999'}, [], 'deg from North', 'Wind direction (90 = East)';
148 |     'wspd',      'int16',  47:51, {'-8888', '-9999'}, @(x)scale_func(x,0.1), 'm/s', 'Wind Speed';
149 |     };
150 | end
151 | 
function def = stations_format_definition_as_cell_matrix()
% Format definition for one line of the IGRA station list file.
%
% Source: https://www1.ncdc.noaa.gov/pub/data/igra/igra2-list-format.txt
% ------------------------------
% Variable   Columns   Type
% ------------------------------
% ID            1-11   Character
% LATITUDE     13-20   Real
% LONGITUDE    22-30   Real
% ELEVATION    32-37   Real
% STATE        39-40   Character
% NAME         42-71   Character
% FSTYEAR      73-76   Integer
% LSTYEAR      78-81   Integer
% NOBS         83-88   Integer
% ------------------------------
%
% OUTPUTS
%   def - cell matrix with one row per variable. The columns are:
%         varname, type, col_idx, bad_vals, function_handle, units,
%         description

def = {
    'id',      'char',   1:11,  {},            [], '', 'Station ID';
    'lat',     'single', 13:20, {'-98.8888'},  [], '', 'Latitude';
    'long',    'single', 22:30, {'-998.8888'}, [], '', 'Longitude';
    'elev',    'single', 32:37, {'-998.8'},    [], '', 'Elevation';
    'state',   'char',   39:40, {},            [], '', 'U.S. State';
    'name',    'char',   42:71, {},            [], '', 'Name';
    'fstyear', 'int16',  73:76, {},            [], '', 'First Year';
    'lstyear', 'int16',  78:81, {},            [], '', 'Last Year';
    'nobs',    'int32',  83:88, {},            [], '', '# of Observations';
    };
end
180 | 


--------------------------------------------------------------------------------
/GCSAL_ex1.m:
--------------------------------------------------------------------------------
  1 | % Copyright (c) Facebook, Inc. and its affiliates.
  2 | %
  3 | % This source code is licensed under the MIT license found in the
  4 | % LICENSE file in the root directory of this source tree.
  5 | %
  6 | % This example file goes through the most commonly used function of the
  7 | % Global Climate Statistical Analysis Library class. The examples show you
  8 | % can do analysis of the probability distributions of different atmospheric
  9 | % data based on location and time of day or time of year.
 10 | 
 11 | 
 12 | %% Step 1 is to download the source .h5 file and set up your paths correctly
 13 | 
 14 | clear;
 15 | close all;
 16 | clc;
 17 | 
 18 | if (~exist('g','var') || ~isa(g,'GCSAL.GCSAL'))
 19 |     % The gcsal.h5 and gcsal.h5.info.mat files are available for download from the
 20 |     % website and should be placed in the h5_data directory.
 21 | 
 22 |     %%%%%%%%%%%%%% CHANGE THESE %%%%%%%%%%%%%%%%%%
 23 |     % Set this to wherever you put the gcsal.h5 file and gcsal.h5.info.mat
 24 |     % files downloaded from dropbox
 25 |     h5_dir = './h5_data/';
 26 | 
 27 |     % Directory to code. The folder +GCSAL which contains this file should be
 28 |     % in this directory
 29 |     codebase_dir = './';
 30 |     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 31 | 
 32 |     % Full path to .mat file with h5 info
 33 |     h5_file = fullfile(h5_dir, 'gcsal.h5');
 34 |     h5_mat_file = [h5_file '.info.mat'];
 35 | 
 36 |     % Set up Matlab path
 37 |     addpath(genpath(codebase_dir))
 38 | 
 39 |     %% Load GCSAL object from .mat file
 40 | 
 41 |     % This requires about 6 gb of RAM just to hold all of the header
 42 |     % information in memory
 43 | 
 44 |     % Normally you should load the GCSAL object from the .mat file but if it
 45 |     % doesn't exist on your path you can use the .h5 file. After using the .h5
 46 |     % file a .mat file will be created automatically for subsequent use
 47 |     if ~exist(h5_mat_file, 'file')
 48 |         g = GCSAL.GCSAL(h5_file);
 49 |     else
 50 |         g = GCSAL.GCSAL(h5_mat_file);
 51 |         g.h5_fname = h5_file;
 52 |     end
 53 | end
 54 | 
% Introduction: Print out the list of variables that are in the header data
% and the entries data.
% header data is for a single balloon launch - things like time, date, and location
% entries data is the measurements of the balloon - wind speed, pressure, etc.
% (The statements below have no trailing semicolon on purpose so that the
% structs are displayed in the command window.)
g.defs.header.params
g.defs.entries.params

% For more details look at a single parameter
g.defs.entries.params.wspd
g.defs.entries.params.gph
 65 | 
 66 | 
%% Stations_search

% Find stations within 25 degrees of the equator
% (the vector appears to be [lat_min lat_max long_min long_max] - confirm
% against GCSAL.station_search)
stations1 = g.station_search('LatLong', [-25 25 -180 180]);

% Find stations from a bunch of countries
country_names = {'Mexico', 'Brazil','Algeria', 'Burkina Faso', 'Ghana', 'Niger',...
           'Nigeria',  'Egypt', 'Sudan', 'Ethiopia', 'Uganda',...
           'Kenya', 'Tanzania', 'Madagascar', 'India', 'Sri Lanka',...
           'Nepal', 'Bangladesh','Myanmar', 'Thailand', 'Vietnam', 'Cambodia',...
           'Ukraine', 'Uzbekistan', 'Turkey',...
           'Indonesia', 'Philippines'};

stations2 = g.station_search('Countries', country_names);

% Find stations in countries AND within 25 degrees of equator
stations3 = g.station_search('Countries', country_names, ...
    'LatLong', [-25 25 -180 180]);

% Find stations in countries OR within 25 degrees of equator
stations4 = GCSAL.GCSAL.stations_union(stations1, stations2);

% Plot stations4 on world map
figure; hold all; g.plot_world_map();
GCSAL.GCSAL.plot_stations(stations4, 'r+');

% Find stations with IDs beginning with the letter A.
% Note that in regex ^ means beginning of the line
stations5 = g.station_search('IDRegex', '^A');

% Find stations in Brazil or India AND within 25 degrees of
% the equator AND with station IDs ending in 5
% Note that in regex $ means end of the line
stations6 = g.station_search('Countries', {'Brazil', 'India'}, ...
    'LatLong', [-25 25 -180 180], ...
    'IDRegex', '5$');

% Find stations within +/-2.5 deg latitude about Guatemala
stations7 =  g.station_search('Lat', [14.583323, -90.527309], 'Range', 2.5);
106 | 
%% Data query

% Get stations located in Botswana
stations = g.station_search('Countries', {'Botswana'});

% Get all geopotential height and windspeed data along with hour, month,
% and year data
entries1 = g.query(stations, {'gph', 'wspd', 'hour', 'month', 'year'});

% Plot distribution of hours and years for the data in entries1
figure; histogram(vertcat(entries1.hour)); xlabel('Hour'); ylabel('# of occurences');
figure; histogram(vertcat(entries1.year)); xlabel('Year'); ylabel('# of occurences');

% Get gph and wspd data measured between 6 am and 4 pm
entries2 = g.query(stations, {'gph', 'wspd'}, 'hour', [6 16]);

% Plot distribution of hours for the data in entries2
% NOTE(review): 'hour' is only passed as a filter here, not as a requested
% parameter - presumably query also returns the filter fields; confirm
figure; histogram(vertcat(entries2.hour)); xlabel('Hour'); ylabel('# of occurences');

% Get data corresponding only to measurements taken in August between
% 4am and Noon and in the years 1990 to 1999
entries3 = g.query(stations, {'gph', 'wspd'}, ...
                  {'month', 'hour', 'year'}, ...
                  {8, [4 12], [1990 1999]});

% Plot distribution of years for the data in entries3
figure; histogram(vertcat(entries3.year)); xlabel('Year'); ylabel('# of occurences');

% To save RAM clear out the entries cache. The entries cache holds any
% data that has been loaded so far, which makes it faster to access that
% data subsequently but also uses up RAM.
g.clear_entries_cache();
139 | 
%% One dimensional histograms

% Get stations in Brazil
stations = g.station_search('Countries', 'Brazil');

% Histogram for all windspeeds
[N, entries] = g.counts(stations, 'wspd');

% Define custom bin_edges
bin_edges = 0:1:80;

% Do counts for windspeed filtered on geopotential
% altitude between 20 and 30 km and with custom bin edges
[N, entries] = g.counts(stations, 'wspd', 'FilterFields', {'gph'}, ...
     'FilterRanges', {[20 30]}, 'Edges', bin_edges);

% Now additionally filter on measurements taken in August
% between 4 and 10 am
% (each entry of FilterRanges lines up with the entry of FilterFields at
% the same position)
[N, entries] = g.counts(stations, 'wspd', 'FilterFields', {'gph', 'month', 'hour'}, ...
    'FilterRanges', {[20 30], 8, [4 10]}, 'Edges', bin_edges);
160 | 
%% Two dimensional histograms

% Get some stations
stations = g.station_search('Countries', 'Brazil');

% Do counts between gph and various other parameters
% (the last call repeats the first gph / wspd pairing)
[N, entries] = g.counts2(stations, 'gph', 'wspd');
[N, entries] = g.counts2(stations, 'gph', 'press');
[N, entries] = g.counts2(stations, 'gph', 'temp');
[N, entries] = g.counts2(stations, 'gph', 'rh');
[N, entries] = g.counts2(stations, 'gph', 'dpdp');
[N, entries] = g.counts2(stations, 'gph', 'wdir');
[N, entries] = g.counts2(stations, 'gph', 'wspd');

% Do counts between gph and wind speed with custom bin edges
[N, entries] = g.counts2(stations, 'gph', 'wspd', ...
   'XEdges', 0:0.5:40, 'YEdges', 0:1:80);

% Do counts for data measured between 6am and 4pm and in August
[N, entries] = g.counts2(stations, 'gph', 'wspd', ...
'FilterFields', {'hour', 'month'}, 'FilterRanges', {[6 16], 8});
182 | 
183 | 
%% N dimensional counts

% Get some stations
stations = g.station_search('Countries', 'Brazil');

% Make 5-dimensional count matrix with default bin edges and no filtering
% (resolutions is a 1x5 struct array, one element per field name)
resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'wspd'});
N = g.countsN(stations, resolutions);

% Add custom bin edges to gph field (third element of resolutions) and
% limit data to only data between 6 and 10 am
% NOTE(review): FilterRanges is passed as a plain vector here while the
% other examples in this file pass a cell array - confirm countsN accepts both
resolutions(3).edges = 0:1:80;
N = g.countsN(stations, resolutions, 'FilterFields', {'hour'}, 'FilterRanges', [6 10]);

% Clear the cache to save RAM
g.clear_entries_cache();
200 | 
201 | %% N dimensional counts on full library
202 | % This section has been commented out because it takes ~35 minutes and 30
203 | % gb to complete
204 | 
205 | % % Get all stations
206 | % stations = g.stations;
207 | %
208 | % % Cache data for lat, lon, gph, and month
209 | % % This may take a few minutes and requires another 18 gb of RAM for a total
210 | % % of 24 gb of RAM including the header data that is loaded when the GCSAL
211 | % % object is initialized.
212 | % g.query(stations, {'lat', 'lon', 'gph', 'month'});
213 | %
214 | % % Turn off cache so we don't use any more RAM going forward
215 | % g.do_cache_entries = false;
216 | %
217 | % % Make 5-dimensional count matrix with default bin edges and no filtering
218 | % % This may take about 5 minutes for each countsN call.
219 | % % Also temporarily uses another 6 gb of RAM for each countsN call for a
220 | % % grand total of ~30gb of RAM. As long as do_cache_entries is false this
221 | % % last 6 gb RAM is cleared between each call to countsN
222 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'wdir'});
223 | % N1 = g.countsN(stations, resolutions); % This may take about 5 minutes
224 | %
225 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'wspd'});
226 | % N2 = g.countsN(stations, resolutions); % This may take about 5 minutes
227 | %
228 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'temp'});
229 | % N3 = g.countsN(stations, resolutions); % This may take about 5 minutes
230 | %
231 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'press'});
232 | % N4 = g.countsN(stations, resolutions); % This may take about 5 minutes
233 | %
234 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'dpdp'});
235 | % N5 = g.countsN(stations, resolutions); % This may take about 5 minutes
236 | %
237 | % resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'rh'});
238 | % N6 = g.countsN(stations, resolutions); % This may take about 5 minutes
239 | 


--------------------------------------------------------------------------------
/+GCSAL/+IGRA/Param.m:
--------------------------------------------------------------------------------
  1 | classdef Param
  2 |     % Copyright (c) Facebook, Inc. and its affiliates.
  3 |     %
  4 |     % This source code is licensed under the MIT license found in the
  5 |     % LICENSE file in the root directory of this source tree.
  6 |     %
  7 |     % Param(param_def, text_mat)
  8 |     %    Reads the IGRA formatted text in text_mat according to the
  9 |     %    format definition in param_def. IGRA text data uses fixed column
 10 |     %    widths so a single Param reads only data from the fixed
 11 |     %    columns specified in param_def corresponding with a single data
 12 |     %    parameter.
 13 |     %
 14 |     %    The param_def must supply the following struct fields:
 15 |     %       varname - name used to store the data in an H5 file. Must
 16 |     %                 resolve to a valid Matlab struct field.
 17 |     %          type - 'uint8', 'uint16', 'uint32', 'uint64', 'int8',
 18 |     %                 'int16', 'int32', 'int64', 'single', 'double', or
 19 |     %                 'char'
 20 |     %       col_idx - index vector corresponding to the columns of the
 21 |     %                 source file relevant to the current entry
 22 |     %      bad_vals - Cell array of strings listing the code used to
 23 |     %                 indicate missing or erroneous data in the IGRA file
 24 |     %
 25 |     % INPUTS
 26 |     %  param_def - a struct containing varname, type, col_idx,
 27 |     %              bad_vals
 28 |     %   text_mat - a matrix of characters converted to uint8. Each
 29 |     %              row corresponding to a row of text from an IGRA
 30 |     %              formatted text file
 31 |     %
 32 |     % PROPERTIES
 33 |     %   data - stored data representation of text.
 34 |     %   idx  - indexing vector for data
 35 |     %   len  - length of uncompressed data
 36 |     %   def  - format definition struct
 37 | 
 38 | 
 39 |     properties
 40 |         data % stored data representation of text
 41 |         idx  % indexing vector for data
 42 |         len  % length of uncompressed data
 43 |         def  % format definition struct
 44 |         i_unique % indexing vector if data was compressed based on unique parameters
 45 |     end
 46 | 
 47 |     methods
 48 |         function obj = Param(param_def, text_mat)
 49 |             % Param Constructor
 50 |             %
 51 |             % INPUTS
 52 |             %  param_def - a struct containing varname, type, col_idx,
 53 |             %              bad_vals
 54 |             %   text_mat - a matrix of characters converted to uint8. Each
 55 |             %              row corresponding to a row of text from an IGRA
 56 |             %              formatted text file
 57 | 
 58 |             % Error check fields in format definition struct
 59 |             found_flds = fieldnames(param_def);
 60 |             expected_flds = {'varname', 'type', 'col_idx', 'bad_vals'};
 61 |             if ~all(ismember(expected_flds, found_flds))
 62 |                 error('def struct must contain varname, type, col_idx, and bad_vals fields')
 63 |             end
 64 | 
 65 |             % Store format definition for the data entry
 66 |             obj.def = param_def;
 67 | 
 68 |             % Error check the text_mat
 69 |             if ~isa(text_mat, 'uint8')
 70 |                 error('text_mat must be a matrix of characters converted to uint8')
 71 |             end
 72 | 
 73 |             % Read the string matrix text_mat
 74 |             obj = obj.read_columns(text_mat);
 75 | 
 76 |         end
 77 | 
 78 |         function obj = read_columns(obj, text_mat)
 79 | 
 80 |             % Extract the relative columns of text
 81 |             txt = text_mat(:, obj.def.col_idx);
 82 | 
 83 |             % Compress txt by removing rows that match bad_vals and
 84 |             % creating an idx vector for mapping remaining rows (unless the
 85 |             % compression actually increases the data size in which case
 86 |             % leave it as is)
 87 |             [txt, obj.idx, obj.len] = GCSAL.IGRA.Param.compress_txt(txt, obj.def.bad_vals);
 88 | 
 89 |             % Convert txt to data based on the type in the format
 90 |             % definition
 91 |             [obj.data, obj.i_unique] = GCSAL.IGRA.Param.txt2data(txt, obj.def.type);
 92 | 
 93 |         end
 94 | 
 95 |         function h5write(obj, filename, dataset_prefix)
 96 |             % Write the data to h5 file filename with h5 path
 97 |             % dataset_prefix
 98 | 
 99 |             % Make h5 path for this varname
100 |             h5_path = GCSAL.H5.fullpath(dataset_prefix, obj.def.varname);
101 | 
102 |             % Write data, idx, and len
103 |             obj.h5_write_param(filename, h5_path, 'data')
104 |             obj.h5_write_param(filename, h5_path, 'idx')
105 |             obj.h5_write_param(filename, h5_path, 'i_unique')
106 | 
107 |             % Don't need len if data is already full length
108 |             if length(obj.data) ~= obj.len
109 |                 obj.h5_write_param(filename, h5_path, 'len')
110 |             end
111 | 
112 |         end
113 | 
114 |         function h5_write_param(obj, filename, dataset_prefix, var_name)
115 |             % Helper for calling H5.create_and_write with proper h5 path
116 |             h5_path = GCSAL.H5.fullpath(dataset_prefix, var_name);
117 |             GCSAL.H5.create_and_write(filename, h5_path, obj.(var_name))
118 |         end
119 | 
120 |     end
121 | 
122 |     methods (Static)
123 | 
124 | 
125 |         function str = pad_left(str, desired_length)
126 |             % Prepends str with enough spaces to make a string of
127 |             % desired_length
128 | 
129 |             str_length = length(str);
130 |             if str_length > desired_length
131 |                 error('str: %s is already longer than str_length: %f', str, desired_length)
132 |             end
133 | 
134 |             % Pad beginning of str with blanks
135 |             prefix = blanks(desired_length - str_length);
136 |             str = [prefix str];
137 | 
138 |         end
139 | 
140 |         function val = convert_to_min_int(val)
141 |             % Create an idx variable with values 1:max_value and of the
142 |             % smallest type that can store max_value
143 | 
144 |             max_val = max(val(:));
145 |             if isempty(max_val)
146 |                 max_val = 1;
147 |             end
148 |             if max_val < 1 || rem(max_val, 1) ~= 0
149 |                 error('max_value: %f must be a whole number greater than 0')
150 |             end
151 | 
152 |             if max_val < intmax('uint8')
153 |                 val = uint8(val);
154 |             elseif max_val < intmax('uint16')
155 |                 val = uint16(val);
156 |             elseif max_val < intmax('uint32')
157 |                 val = uint32(val);
158 |             elseif max_val < intmax('uint64')
159 |                 val = uint64(val);
160 |             else
161 |                 error(['max_val: %f exceeds the maximum value than ' ...
162 |                     'can be stored as an integer'], max_val)
163 |             end
164 |         end
165 | 
166 |         function [data, i_unique] = txt2data(txt, type)
167 |             % Convert txt to type
168 | 
169 |             % Return on empty txt
170 |             if isempty(txt)
171 |                 data = []; i_unique = [];
172 |                 return
173 |             end
174 | 
175 |             % Since txt may be very repetitive it is more efficient to find
176 |             % unique rows before trying to convert
177 |             [unique_txt, ~, i_unique] = unique(txt, 'rows');
178 |             i_unique = GCSAL.IGRA.Param.convert_to_min_int(i_unique);
179 | 
180 |             % Switch on whether data is int, real, or char
181 |             switch type
182 | 
183 |                 % Integer types
184 |                 case {'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32'}
185 |                     unique_val = GCSAL.IGRA.Param.str2int(unique_txt, type);
186 | 
187 | 
188 |                     % Float types
189 |                 case {'single', 'double'}
190 |                     unique_val = GCSAL.IGRA.Param.str2float(unique_txt, type);
191 | 
192 |                     % String types
193 |                 case 'char'
194 |                     unique_val = unique_txt;
195 | 
196 |                     % Anything else
197 |                 otherwise
198 |                     error('Unrecognized type: %s', type)
199 |             end
200 | 
201 | 
202 |             if size(unique_val, 1) == 1
203 |                 % If there is only one unique value, return that value with
204 |                 % i_unique empty
205 |                 i_unique = [];
206 |                 data = unique_val;
207 |             else
208 |                 % Otherwise determine if we can save space by representing
209 |                 % data in unique form
210 | 
211 |                 if GCSAL.IGRA.Param.compare_bytes_unique(i_unique, unique_val)
212 |                     % keep data in unique form
213 |                     data = unique_val;
214 |                 else
215 |                     % Inverse the unique call and make data the full
216 |                     % uncompressed data
217 |                     data = unique_val(i_unique,:);
218 | 
219 |                     % Revert i_unique to empty vector
220 |                     i_unique = [];
221 |                 end
222 |             end
223 |         end
224 | 
225 |         function do_compression = compare_bytes_unique(i_unique, unique_val)
226 |             % do_compression = compare_bytes_unique(i_unique, unique_val)
227 |             %    Decide whether storing the pair (i_unique, unique_val) is
228 |             %    cheaper than storing the expanded data
229 |             %    unique_val(i_unique,:). Returns true when the pair needs
230 |             %    strictly fewer bytes.
231 |             %
232 |             %    The expanded array is never built; its size is estimated
233 |             %    by scaling the bytes of unique_val by the ratio of
234 |             %    expanded rows to unique rows.
235 | 
236 |             % Memory footprint of each piece of the compressed form, as
237 |             % reported by whos
238 |             info_index = whos('i_unique');
239 |             info_values = whos('unique_val');
240 |             cost_index = info_index.bytes;
241 |             cost_values = info_values.bytes;
242 | 
243 |             % Row counts of the expanded and of the unique data
244 |             rows_expanded = length(i_unique);
245 |             rows_unique = size(unique_val, 1);
246 | 
247 |             % Estimated footprint of the expanded data: bytes of
248 |             % unique_val scaled up by the row ratio
249 |             cost_expanded = cost_values*rows_expanded/rows_unique;
250 | 
251 |             % Compress only when the index plus the unique values take
252 |             % less room than the expanded data
253 |             do_compression = (cost_index + cost_values) < cost_expanded;
254 | 
255 |         end
256 | 
257 |         function int = str2int(str_mat, type)
258 |             % Custom vectorized str2int function. Processes the character
259 |             % matrix str_mat by columns, converting the characters in each
260 |             % column to a number and then adding up the numbers from each
261 |             % column. This is faster than str2double or str2num because it
262 |             % relies on the characters in str_mat being well behaved.
263 |             %
264 |             % Additionally, the character matrix str_mat in this case is
265 |             % represented as uint8 for efficiency
266 |             %
267 |             % Assumptions:
268 |             %   - The only characters in str_mat are ' -0123456789'
269 |             %   - Every row of str_mat is a single number
270 |             %   - Every row of str_mat is equal width with left padding
271 |             %   - Every row is ordered blanks, negative sign, numerals from
272 |             %     left to right
273 |             %   - type names an integer class; saturated results raise an error
274 | 
275 |             % Convert char to uint8 if necessary
276 |             if isa(str_mat, 'char')
277 |                 str_mat = uint8(str_mat);
278 |             end
279 | 
280 |             % Get the size of str_mat for preallocation
281 |             [rows, cols] = size(str_mat);
282 | 
283 |             % Pre-allocate int with zeros of the correct type
284 |             int = zeros(rows,1, type);
285 | 
286 |             % Initialize place_factor with ones. This represents the value
287 |             % of the current column (like 1, 10, 100, 1000, etc.)
288 |             place_factor = ones(rows,1, type);
289 | 
290 |             % Convert each character of str_mat to its integer numeral
291 |             % (blanks become 0 and negative sign becomes -1)
292 |             numerals = GCSAL.IGRA.Param.char2numerals(str_mat);
293 | 
294 |             % Convert numerals to proper type
295 |             numerals = cast(numerals, type);
296 | 
297 |             % Go column by column to sum each row vector of numerals into a
298 |             % single value per row in a vectorized manner. Start with right
299 |             % most column for the ones place
300 |             for i = cols:-1:1
301 |                 % Extract the current column, going right to left
302 |                 curr_col = numerals(:,i);
303 | 
304 |                 % Rows whose negative sign sits in the current column
305 |                 is_neg = curr_col == -1;
306 | 
307 |                 % negate where is_neg
308 |                 int(is_neg) = -1*int(is_neg);
309 | 
310 |                 % Apply the place_factor multiplier to the curr_col where
311 |                 % not is_neg
312 |                 int(~is_neg) = int(~is_neg) + place_factor(~is_neg) .* curr_col(~is_neg);
313 | 
314 |                 % Increment place_factor by x10 since we are in base 10
315 |                 place_factor = place_factor*10;
316 |             end
317 | 
318 |             % Error check on data type. Integer arithmetic saturates, and the
319 |             % sign is applied afterwards, so test the magnitude of each value
320 |             if any(abs(int(:)) >= intmax(type))
321 |                 error('Data overflow for type: %s', type)
322 |             end
323 |         end
324 | 
325 |         function int = char2numerals(char_mat)
326 |             % Translate a matrix of characters into a matrix of the
327 |             % numerals they stand for. Digits map to their value, blanks
328 |             % map to 0 and the negative sign maps to -1. The input may be
329 |             % char or the equivalent uint8 character codes.
330 |             % Only the characters ' -0123456789' are accepted.
331 | 
332 |             % Work on uint8 character codes
333 |             if ischar(char_mat)
334 |                 char_mat = uint8(char_mat);
335 |             end
336 | 
337 |             % Lookup table: the i-th accepted character code translates
338 |             % to the i-th numeral
339 |             accepted = uint8(' -0123456789');
340 |             numeral_of = [0 -1 0:9];
341 | 
342 |             % Locate every character in the table; position 0 means no match
343 |             [~, slot] = ismember(char_mat, accepted);
344 |             if any(slot(:) == 0)
345 |                 error('Encountered an unrecognized character. The only recognized characters are -0123456789 and blank')
346 |             end
347 | 
348 |             % Replace each table position by the numeral stored at that
349 |             % position
350 |             int = numeral_of(slot);
351 | 
352 |             % Indexing the row vector numeral_of with a column vector
353 |             % yields a row vector, so transpose in that case to make the
354 |             % size of int match the size of char_mat
355 |             if size(char_mat,2) == 1
356 |                 int = int.';
357 |             end
358 | 
359 |             % Error check on size
360 |             if any(size(int) ~= size(char_mat))
361 |                 error('size error')
362 |             end
363 | 
364 |         end
365 | 
366 |         function float = str2float(str_mat, type)
367 |             % Parse the character matrix str_mat as numbers with str2num,
368 |             % then cast the result to the class named by type.
369 |             % NOTE(review): str2num evaluates its input - confirm str_mat is trusted
370 | 
371 |             float = cast(str2num(char(str_mat)), type); %#ok<ST2NM>
372 |         end
373 | 
374 |         function int_array = bits2ints(bits)
375 |             % Pack a logical array into uint32 words, 32 flags per word,
376 |             % so that each flag costs one bit instead of the full byte a
377 |             % logical element occupies.
378 |             %
379 |             % Layout of the returned column vector:
380 |             %   int_array(1)     - number of flags that were packed
381 |             %   int_array(2:end) - packed words, most significant bit first;
382 |             %                      the first word is left-padded with zeros
383 |             %                      when the flag count is not a multiple of 32
384 | 
385 |             % Flags per packed word
386 |             word_len = 32;
387 | 
388 |             % Flatten to a column of 0/1 values
389 |             flags = double(logical(bits(:)));
390 | 
391 |             % Record the flag count; the decoder needs it to tell padding
392 |             % apart from genuine leading zeros
393 |             n_flags = uint32(numel(flags));
394 | 
395 |             % Zeros required in front so the total is a multiple of
396 |             % word_len (none when it already is)
397 |             n_pad = mod(-numel(flags), word_len);
398 |             flags = [zeros(n_pad, 1); flags];
399 | 
400 |             % One word per row, columns ordered from most to least
401 |             % significant bit
402 |             words = reshape(flags, word_len, []).';
403 | 
404 |             % Weight of each column: 2^31 down to 2^0
405 |             weights = 2.^(word_len-1:-1:0).';
406 | 
407 |             % Weighted row sums give the packed values; every sum is an
408 |             % integer below 2^32 so the conversion to uint32 is exact
409 |             packed = uint32(words * weights);
410 | 
411 |             % Flag count first, then the packed words
412 |             int_array = [n_flags; packed];
413 | 
414 |         end
415 | 
416 |         function bits = ints2bits(int_array)
417 |             % Unpack an integer array (uint32 words, as assumed by this
418 |             % implementation) into a column of 0/1 values, one per bit.
419 |             %
420 |             % int_array(1) holds the number of bits stored; the remaining
421 |             % elements are 32-bit words, most significant bit first.
422 | 
423 |             % Split the bit count from the packed words
424 |             N_bits = int_array(1);
425 |             words = double(int_array(2:end));
426 | 
427 |             % Bits per word
428 |             L = 32;
429 | 
430 |             % Dividing a word by 2^k and flooring shifts it right by k
431 |             % bits; the remainder modulo 2 is then bit k. Doubles are used
432 |             % so the division is not rounded the way integer division is
433 |             shifts = 2.^(L-1:-1:0).';
434 |             shifted = floor(bsxfun(@rdivide, words(:).', shifts));
435 |             bits = mod(shifted, 2);
436 | 
437 |             % Stack the columns (one per word) into a single column vector
438 |             bits = bits(:);
439 | 
440 |             % Padding bits were prefixed when N_bits was not an exact
441 |             % multiple of 32. They sit at the front, so drop them to
442 |             % leave only the bits that were originally stored
443 |             n_extra = length(bits) - N_bits;
444 |             bits(1:n_extra) = [];
445 | 
446 |         end
447 | 
448 |         function idx = compress_idx(idx_good, original_length)
449 |             % Pick the smallest of three equivalent encodings of a row
450 |             % selection: the kept indices, the dropped indices, or a
451 |             % bit-packed mask. The result is prefixed with a code naming
452 |             % the encoding; it is empty when nothing is kept or nothing
453 |             % was dropped.
454 |             %
455 |             % Encodings and their codes:
456 |             %   1 - indices of the rows that were kept (idx_good)
457 |             %   2 - indices of the rows that were dropped
458 |             %   3 - mask of kept rows packed into integers by bits2ints
459 | 
460 |             % Nothing kept, or every row kept: there is nothing to
461 |             % encode, so idx is returned empty
462 |             if isempty(idx_good) || length(idx_good) == original_length
463 |                 idx = [];
464 |                 return
465 |             end
466 | 
467 |             % Mark the kept rows in a mask covering the original data
468 |             kept_mask = zeros(original_length,1, 'uint8');
469 |             kept_mask(idx_good) = 1;
470 | 
471 |             % Encoding 2: indices of the dropped rows, converted to the
472 |             % min int type to save space
473 |             idx_bad = find(~kept_mask);
474 |             idx_bad = GCSAL.IGRA.Param.convert_to_min_int(idx_bad); %#ok<FNDSB>
475 | 
476 |             % Encoding 3: the mask itself, packed from bits into integers
477 |             idx_logical = GCSAL.IGRA.Param.bits2ints(kept_mask);
478 | 
479 |             % Bytes needed by each encoding, listed in code order, as
480 |             % reported by whos
481 |             info_good = whos('idx_good');
482 |             info_bad = whos('idx_bad');
483 |             info_bits = whos('idx_logical');
484 |             costs = [info_good.bytes, info_bad.bytes, info_bits.bytes];
485 | 
486 |             % The cheapest encoding wins; min returns the first minimum,
487 |             % so ties go to the lowest code
488 |             [~, idx_type] = min(costs);
489 | 
490 |             % Fetch the winning encoding by its code
491 |             encodings = {idx_good, idx_bad, idx_logical};
492 |             idx = encodings{idx_type};
493 | 
494 |             % Prepend the code so that a decoder can tell which encoding
495 |             % follows
496 |             idx = [idx_type; idx];
497 | 
498 |         end
499 | 
500 |         function idx_out = uncompress_idx(idx_in, original_length)
501 |             % Decode an index array whose first element is an encoding
502 |             % code (1, 2 or 3) back into the list of kept row indices.
503 |             % The rest of idx_in is the payload; original_length is the
504 |             % row count of the uncompressed data.
505 | 
506 |             % Separate the encoding code from the payload
507 |             code = idx_in(1);
508 |             payload = idx_in(2:end);
509 | 
510 |             if code == 1
511 |                 % Payload already lists the kept rows
512 |                 idx_out = payload;
513 | 
514 |             elseif code == 2
515 |                 % Payload lists the dropped rows: start with every row
516 |                 % kept, clear the dropped ones and list what remains
517 |                 keep = ones(original_length, 1);
518 |                 keep(payload) = 0;
519 |                 idx_out = find(keep);
520 | 
521 |             elseif code == 3
522 |                 % Payload is a mask of kept rows packed into integers
523 |                 idx_out = find(GCSAL.IGRA.Param.ints2bits(payload));
524 | 
525 |             else
526 |                 error('Unrecognized idx type')
527 |             end
528 |         end
529 | 
530 |     end
531 | 
532 |     methods (Static, Access = 'private')
533 |         function data = unique_inverse(unique_val, i_unique)
534 |             % Undo a call to unique: expand unique_val back to the
535 |             % original ordering by indexing its rows with i_unique. When
536 |             % unique_val has a single row, that row alone is returned
537 |             % instead of the expanded data.
538 | 
539 |             if size(unique_val, 1) ~= 1
540 |                 data = unique_val(i_unique,:);
541 |             else
542 |                 data = unique_val;
543 |             end
544 |         end
545 | 
546 |         function [txt, idx, original_length] = compress_txt(orig_txt, bad_vals)
547 |             % Drop the rows of orig_txt that match any entry of bad_vals.
548 |             % Returns the remaining rows (txt), a compressed record of
549 |             % which rows they were (idx) and the row count of orig_txt
550 |             % after convert_to_min_int (original_length).
551 | 
552 |             % Row count of the input, passed through convert_to_min_int
553 |             original_length = GCSAL.IGRA.Param.convert_to_min_int(size(orig_txt, 1));
554 | 
555 |             % Strip the bad rows, keeping the indices of the survivors
556 |             [txt, kept_rows] = GCSAL.IGRA.Param.remove_bad_vals(orig_txt, bad_vals);
557 | 
558 |             % Encode the surviving indices as efficiently as possible
559 |             idx = GCSAL.IGRA.Param.compress_idx(kept_rows, original_length);
560 |         end
561 | 
562 |         function [txt, idx] = remove_bad_vals(txt, bad_vals)
563 |             % Delete every row of txt that equals one of the strings in
564 |             % bad_vals (each left-padded to the width of txt) and return
565 |             % the remaining rows together with their original row indices.
566 |             %
567 |             %   txt      - uint8 matrix of character codes, one entry per row
568 |             %   bad_vals - cell array of strings to remove
569 |             %   idx      - indices of the surviving rows
570 | 
571 |             % Get size
572 |             [N_rows, N_cols] = size(txt);
573 | 
574 |             % Index of every row, passed through convert_to_min_int
575 |             idx = GCSAL.IGRA.Param.convert_to_min_int((1:N_rows)');
576 | 
577 |             % Flag the rows that match any of the bad values
578 |             is_bad = false(N_rows, 1);
579 |             for i = 1:length(bad_vals)
580 | 
581 |                 % Left-pad to the width of txt and convert char to uint8
582 |                 % to match the format of the txt matrix
583 |                 pattern = uint8(GCSAL.IGRA.Param.pad_left(bad_vals{i}, N_cols));
584 | 
585 |                 % A row matches when all of its characters equal the pattern
586 |                 is_bad = is_bad | all(bsxfun(@eq, pattern, txt), 2);
587 |             end
588 | 
589 |             % Keep only the rows, and their indices, that were never
590 |             % flagged
591 |             txt = txt(~is_bad, :);
592 |             idx = idx(~is_bad);
593 | 
594 |         end
595 |     end
596 | end
597 | 


--------------------------------------------------------------------------------
/+GCSAL/GCSAL.m:
--------------------------------------------------------------------------------
   1 | classdef GCSAL < handle
   2 |     % Copyright (c) Facebook, Inc. and its affiliates.
   3 |     %
   4 |     % This source code is licensed under the MIT license found in the
   5 |     % LICENSE file in the root directory of this source tree.
   6 |     %
   7 |     % GCSAL - Global Climate Statistical Analysis Library
   8 |     %   Inherits handle class making objects of this class pointers
   9 | 
  10 | 
  11 |     properties
  12 |         h5_fname % path to .h5 source file for loading data
  13 |         h5_info  % struct returned by h5info()
  14 |         headers  % struct array, each element containing header data from an IGRA station
  15 |         stations % struct array, each element containing id, latitude, and longitude information for an IGRA station
  16 |         defs     % parameter format definitions
  17 |         countries  % struct array, each element containing information for a country including its name, lat/long of its borders, and which stations are contained by that country
  18 |         entries % struct of entries data that has been cached
  19 |         do_cache_entries  % boolean whether to cache entries or not. Normally true but if you are running out of RAM you can turn this off
  20 |         quiet_mode  % true to suppress status text messages and waitbars
  21 |         plot_mode  % true to make plots, false to suppress them
  22 |     end
  23 | 
  24 |     methods
  25 |         function obj = GCSAL(in_file)
  26 |             % obj = GCSAL(in_file)
  27 |             %   Build a GCSAL object from in_file, the path to either a .h5
  28 |             %   file or a .mat file. For a .h5 file the h5 info, headers,
  29 |             %   stations and country map are gathered and saved to
  30 |             %   [in_file '.info.mat']. For a .mat file those variables are
  31 |             %   loaded directly, which is why such a file would have been
  32 |             %   created by an earlier call to this constructor.
  33 | 
  34 |             % Default to printing status messages
  35 |             obj.quiet_mode = false;
  36 | 
  37 |             % Default to making all plots
  38 |             obj.plot_mode = true;
  39 | 
  40 |             % Default to caching entries
  41 |             obj.do_cache_entries = true;
  42 | 
  43 |             % Load format definitions
  44 |             obj.defs = GCSAL.IGRA.format_definitions();
  45 | 
  46 |             % Properties that are saved to / restored from the .mat file
  47 |             saved_props = {'h5_info', 'h5_fname', 'headers', 'countries', 'stations'};
  48 | 
  49 |             % Start from an empty entries cache
  50 |             obj.clear_entries_cache();
  51 | 
  52 |             % The extension of in_file decides how it is read
  53 |             [~,~, ext] = fileparts(in_file);
  54 | 
  55 |             if strcmp(ext, '.h5')
  56 |                 % .h5 file: gather basic info from it and save that to a
  57 |                 % .mat file
  58 | 
  59 |                 % in_file is itself the data source
  60 |                 obj.h5_fname = in_file;
  61 | 
  62 |                 % Get h5info
  63 |                 tic; fprintf('Parsing info from h5 file...');
  64 |                 obj.h5_info = h5info(obj.h5_fname);
  65 |                 fprintf(' Complete in %.1f seconds\n', toc);
  66 | 
  67 |                 % Keep all headers in RAM for quicker data access
  68 |                 tic; fprintf('Loading headers...');
  69 |                 obj.headers = obj.load_all_headers();
  70 |                 fprintf(' Complete in %.1f seconds\n', toc);
  71 | 
  72 |                 % Build the stations struct array from the headers, then
  73 |                 % the countries from the stations. These structures give
  74 |                 % quick access to stations by lat/long location or by the
  75 |                 % country they are in
  76 |                 tic; fprintf('Initiazling stations struct and country map ...');
  77 |                 obj.stations = GCSAL.GCSAL.initialize_stations(obj.headers);
  78 |                 obj.countries = GCSAL.Map.map_stations_by_country(obj.stations);
  79 |                 fprintf(' Complete in %.1f seconds\n', toc);
  80 | 
  81 |                 % Collect the saved properties into a struct and write its
  82 |                 % fields to a .mat file for quicker loading in the future
  83 |                 tic; fprintf('Saving h5 info to .mat file for faster loading...');
  84 |                 snapshot = struct();
  85 |                 for k = 1:numel(saved_props)
  86 |                     snapshot.(saved_props{k}) = obj.(saved_props{k}); %#ok<STRNU>
  87 |                 end
  88 |                 % The file name is in_file with '.info.mat' appended
  89 |                 save([in_file '.info.mat'], '-struct', 'snapshot');
  90 |                 fprintf(' Complete in %.1f seconds\n', toc);
  91 | 
  92 |             elseif strcmp(ext, '.mat')
  93 |                 % .mat file: load it and check the expected variables are
  94 |                 % present
  95 | 
  96 |                 % Load .mat file
  97 |                 tic; fprintf('Loading h5 info from .mat file...\n');
  98 |                 loaded = load(in_file);
  99 |                 fprintf(' Complete in %.1f seconds\n', toc);
 100 | 
 101 |                 % Every saved property must be among the loaded variables
 102 |                 loaded_names = fieldnames(loaded);
 103 |                 is_missing = ~ismember(saved_props, loaded_names);
 104 |                 if any(is_missing)
 105 |                     msg = sprintf('  %s\n', saved_props{is_missing});
 106 |                     error('mat file is missing the following fields: %s', msg)
 107 |                 end
 108 | 
 109 |                 % Copy each loaded variable onto the property of the same name
 110 |                 for k = 1:numel(loaded_names)
 111 |                     obj.(loaded_names{k}) = loaded.(loaded_names{k});
 112 |                 end
 113 | 
 114 |             else
 115 |                 % Any other extension is rejected
 116 |                 error('Unexpected file extension')
 117 |             end
 118 |         end
 119 | 
 120 |         function [N, entries] = counts(obj, stations, fld, varargin)
 121 |             % [N, entries] = counts(obj, stations, fld, varargin)
 122 |             %    Returns counts from  histcounts and makes
 123 |             %    plots showing histogram, probability and cumulative
 124 |             %    density functions for data in fld at stations
 125 |             %
 126 |             %    Additional optional parameters can be given as Name/Value
 127 |             %    pairs. See examples below.
 128 |             %
 129 |             %    'Edges' defines the edges of the bins.
 130 |             %    'FilterFields' and 'FilterRanges' together can be used to
 131 |             %    filter the data to a subset where the parameter in
 132 |             %    FilterFields matches the range in FilterRanges.
 133 |             %    'Plot' and 'Verbose' (true/false) override the object's plot_mode
 134 |             %    and quiet_mode settings for the duration of this call only.
 135 |             % Example:
 136 |             %    % Create GCSAL object
 137 |             %    g = GCSAL.GCSAL('gcsal.h5.info.mat');
 138 |             %
 139 |             %    % Choose some stations. In this case stations within 2
 140 |             %    % degrees of the equator
 141 |             %    stations = g.station_search('LatLong', [-2 2 -180 180]);
 142 |             %
 143 |             %    % Histogram for all windspeeds
 144 |             %    [N, entries] = g.counts(stations, 'wspd');
 145 |             %
 146 |             %    % Define custom bin_edges
 147 |             %    bin_edges = 0:1:80;
 148 |             %
 149 |             %    % Do counts for windspeed filtered on geopotential
 150 |             %    % altitude between 20 and 30 km and with custom bin edges
 151 |             %    [N, entries] = g.counts(stations, 'wspd', 'FilterFields', {'gph'}, ...
 152 |             %    'FilterRanges', {[20 30]}, 'Edges', bin_edges);
 153 |             %
 154 |             %    % Now additionally filter on measurements taken in August
 155 |             %    % between 4 and 10 am
 156 |             %    [N, entries] = g.counts(stations, 'wspd', 'FilterFields', {'gph', 'month', 'hour'}, ...
 157 |             %     'FilterRanges', {[0 8], 8, [4 10]}, 'Edges', bin_edges);
 158 | 
 159 |             % Return on empty stations
 160 |             if isempty(stations)
 161 |                 warning('Stations is empty, cannot count')
 162 |                 N = []; entries = [];
 163 |                 return
 164 |             end
 165 | 
 166 |             % Parse Name/Value pairs from input
 167 |             p = inputParser;
 168 |             addOptional(p, 'Edges', GCSAL.GCSAL.default_bin_edges(fld));
 169 |             addOptional(p, 'FilterFields', {});
 170 |             addOptional(p, 'FilterRanges', {});
 171 |             addOptional(p, 'Plot', []);
 172 |             addOptional(p, 'Verbose', []);
 173 |             parse(p, varargin{:});
 174 | 
 175 |             % Rename for convenience
 176 |             edges = p.Results.Edges;
 177 |             fltr_flds = p.Results.FilterFields;
 178 |             fltr_ranges = p.Results.FilterRanges;
 179 | 
 180 |             % Remember the caller's settings so they can be restored on exit
 181 |             prev_plot_mode = obj.plot_mode;
 182 |             prev_quiet_mode = obj.quiet_mode;
 183 | 
 184 |             % 'Plot' and 'Verbose' override the object settings only when given
 185 |             if ~ismember('Plot', p.UsingDefaults)
 186 |                 obj.plot_mode = isequal(p.Results.Plot, true);
 187 |             end
 188 |             if ~ismember('Verbose', p.UsingDefaults)
 189 |                 obj.quiet_mode = isequal(p.Results.Verbose, false);
 190 |             end
 191 |             try
 192 |                 % Load data from stations and all fields
 193 |                 entries = obj.query(stations, fld, fltr_flds, fltr_ranges);
 194 | 
 195 |                 % Get counts
 196 |                 [N] = GCSAL.GCSAL.histcounts(entries, fld, edges, obj.quiet_mode);
 197 |                 [pdf, cdf] = GCSAL.GCSAL.counts2pdf(N, 2);
 198 | 
 199 |                 % Bin centers, axis label and filter description for the plots
 200 |                 centers = GCSAL.GCSAL.get_bin_centers(edges);
 201 |                 label = GCSAL.GCSAL.get_label(obj.find_def(fld));
 202 |                 title_str = GCSAL.GCSAL.description_from_filters(fltr_flds, fltr_ranges);
 203 | 
 204 |                 if (obj.plot_mode)
 205 |                     figure;
 206 |                     % Plot histogram
 207 |                     subplot(3,1,1)
 208 |                     histogram(vertcat(entries.(fld)), edges)
 209 |                     title(sprintf('Histogram\n%s', title_str))
 210 |                     xlabel(label)
 211 |                     ylabel('# of occurences')
 212 | 
 213 |                     % Plot probability density function
 214 |                     subplot(3,1,2)
 215 |                     plot(centers, pdf, '-x')
 216 |                     title(sprintf('Probability Density Function\n%s', title_str))
 217 |                     xlabel(label)
 218 |                     ylabel('Probability of occuring')
 219 | 
 220 |                     % Plot cumulative density function
 221 |                     subplot(3,1,3)
 222 |                     plot(centers, cdf, '-x')
 223 |                     title(sprintf('Cumulative Density Function\n%s', title_str))
 224 |                     xlabel(label)
 225 |                     ylabel('Probability of exceeding')
 226 |                 end
 227 |             catch err
 228 |                 % Restore the caller's settings before propagating the error
 229 |                 obj.plot_mode = prev_plot_mode;
 230 |                 obj.quiet_mode = prev_quiet_mode;
 231 |                 rethrow(err)
 232 |             end
 233 |             obj.plot_mode = prev_plot_mode;   % normal exit: restore settings
 234 |             obj.quiet_mode = prev_quiet_mode;
 235 |         end
 236 | 
 237 |         function [N, entries, stats] = counts2(obj, stations, x_fld, y_fld, varargin)
 238 |             % [N, entries, stats] = counts2(obj, stations, x_fld, y_fld, varargin)
 239 |             %    Returns two dimensional counts from  histcounts2 and makes
 240 |             %    plots showing two dimensional probability and cumulative
 241 |             %    density functions comparing data in x_fld and y_fld of
 242 |             %    stations.
 243 |             %    stats holds the bin centers (x, y) and the plotted pdf and cdf.
 244 |             %    Additional optional parameters can be given as Name/Value
 245 |             %    pairs. See examples below.
 246 |             %
 247 |             %    'XEdges', 'YEdges' defines the edges of the bins.
 248 |             %    'FilterFields' and 'FilterRanges' together can be used to
 249 |             %    filter the data to a subset where the parameter in
 250 |             %    FilterFields matches the range in FilterRanges.
 251 |             %    'Plot'/'Verbose' (true/false) override plot_mode/quiet_mode for this call.
 252 |             % Example:
 253 |             %    % Create GCSAL object
 254 |             %    g = GCSAL.GCSAL('gcsal.h5.info.mat');
 255 |             %
 256 |             %    % Choose some stations. In this case stations within 2
 257 |             %    % degrees of the equator
 258 |             %    stations = g.station_search('LatLong', [-2 2 -180 180]);
 259 |             %
 260 |             %    % Do counts between gph and wspd
 261 |             %    [N, entries] = g.counts2(stations, 'gph', 'wspd');
 262 |             %
 263 |             %    % Do counts between gph and pressure with custom bin
 264 |             %    % edges
 265 |             %    [N, entries] = g.counts2(stations, 'gph', 'press', ...
 266 |             %          'XEdges', 0:0.5:40, 'YEdges', 0:2000:100000);
 267 |             %
 268 |             %    % Do counts for data measured between 6 and 10 am in August
 269 |             %    [N, entries] = g.counts2(stations, 'gph', 'wspd', ...
 270 |             %           'FilterFields', {'hour', 'month'}, ...
 271 |             %           'FilterRanges', {[6 10], [8 8]});
 272 | 
 273 |             % Return on empty stations (every output must be assigned)
 274 |             if isempty(stations)
 275 |                 warning('Stations is empty, cannot count')
 276 |                 N = []; entries = []; stats = [];
 277 |                 return
 278 |             end
 279 | 
 280 |             % Parse Name/Value pairs from input
 281 |             p = inputParser;
 282 |             addOptional(p, 'XEdges', GCSAL.GCSAL.default_bin_edges(x_fld));
 283 |             addOptional(p, 'YEdges', GCSAL.GCSAL.default_bin_edges(y_fld));
 284 |             addOptional(p, 'FilterFields', {});
 285 |             addOptional(p, 'FilterRanges', {});
 286 |             addOptional(p, 'Plot', []);
 287 |             addOptional(p, 'Verbose', []);
 288 |             parse(p, varargin{:});
 289 | 
 290 |             % Rename for convenience
 291 |             x_edges = p.Results.XEdges;
 292 |             y_edges = p.Results.YEdges;
 293 |             fltr_flds = p.Results.FilterFields;
 294 |             fltr_ranges = p.Results.FilterRanges;
 295 |             % Apply the per-call 'Plot'/'Verbose' overrides, remembering the
 296 |             % caller's settings so they can be restored on exit
 297 |             prev_plot_mode = obj.plot_mode;
 298 |             prev_quiet_mode = obj.quiet_mode;
 299 |             if ~ismember('Plot', p.UsingDefaults)
 300 |                 obj.plot_mode = isequal(p.Results.Plot, true);
 301 |             end
 302 |             if ~ismember('Verbose', p.UsingDefaults)
 303 |                 obj.quiet_mode = isequal(p.Results.Verbose, false);
 304 |             end
 305 |             try
 306 |                 % Load data from stations and all fields
 307 |                 entries = obj.query(stations, {x_fld, y_fld}, fltr_flds, fltr_ranges);
 308 | 
 309 |                 % Get counts
 310 |                 [N] = GCSAL.GCSAL.histcounts2(entries, x_fld, y_fld, ...
 311 |                     x_edges, y_edges, obj.quiet_mode);
 312 |                 [pdf, cdf] = GCSAL.GCSAL.counts2pdf(N, 2);
 313 |                 pdf = pdf';
 314 |                 cdf = cdf';
 315 | 
 316 |                 % Bin centers from the edges and axis labels from the definitions
 317 |                 x_centers = GCSAL.GCSAL.get_bin_centers(x_edges);
 318 |                 y_centers = GCSAL.GCSAL.get_bin_centers(y_edges);
 319 |                 x_label = GCSAL.GCSAL.get_label(obj.find_def(x_fld));
 320 |                 y_label = GCSAL.GCSAL.get_label(obj.find_def(y_fld));
 321 | 
 322 |                 % Package the bin centers and distributions for the caller
 323 |                 stats = struct('x', x_centers, 'y', y_centers, ...
 324 |                     'cdf', cdf, 'pdf', pdf);
 325 | 
 326 |                 if (obj.plot_mode)
 327 |                     figure;
 328 |                     % subplot for contour of cumulative density function
 329 |                     subplot(3,1,1)
 330 |                     [C, h] = contourf(x_centers, y_centers, cdf, ...
 331 |                         [0.05:0.05:0.95 0.99]);
 332 |                     clabel(C,h, 0.1:0.2:0.9, 'LabelSpacing', 600, 'FontSize', 18);
 333 |                     xlabel(x_label)
 334 |                     ylabel(y_label)
 335 |                     title('Percentile by Altitude')
 336 | 
 337 |                     % subplot for surf of probability density function
 338 |                     subplot(3,1,2)
 339 |                     surf(x_centers, y_centers, pdf, 'LineStyle', 'None');
 340 |                     view([0 0 1])
 341 |                     xlabel(x_label)
 342 |                     ylabel(y_label)
 343 |                     title('Probability Density Function by Altitude')
 344 |                     h_colorbar = colorbar;
 345 |                     ylabel(h_colorbar, 'probability')
 346 | 
 347 |                     % subplot for # of samples
 348 |                     subplot(3,1,3)
 349 |                     plot(x_centers, sum(N,2)/1000, '-x');
 350 |                     xlabel(x_label)
 351 |                     ylabel('Thousands of Counts')
 352 |                     title('Sample size')
 353 |                 end
 354 |             catch err
 355 |                 obj.plot_mode = prev_plot_mode;   % restore before propagating
 356 |                 obj.quiet_mode = prev_quiet_mode;
 357 |                 rethrow(err)
 358 |             end
 359 |             obj.plot_mode = prev_plot_mode;   % normal exit: restore settings
 360 |             obj.quiet_mode = prev_quiet_mode;
 361 |         end
 362 | 
 363 |         function [N, entries] = countsN(obj, stations, resolutions, varargin)
 364 |             % [N, entries] = countsN(obj, stations, resolutions, varargin)
 365 |             %    Returns N dimensional counts similar to histcounts but in
 366 |             %    N dimensions. Counts are performed based on the fields in
 367 |             %    the struct array resolutions. Each struct in resolutions
 368 |             %    must contain a variable name in the field 'fld' and
 369 |             %    optionally may have bin edges specified in the field
 370 |             %    'edges' (defaults come from GCSAL.GCSAL.default_bin_edges).
 371 |             %    entries is the queried data the counts were computed from.
 372 |             %
 373 |             %    Optional Name/Value pairs:
 374 |             %      'FilterFields', 'FilterRanges' - used together to filter
 375 |             %          the data to the subset where each parameter in
 376 |             %          FilterFields matches its range in FilterRanges
 377 |             %      'Verbose' - pass false to suppress progress output
 378 |             % Examples:
 379 |             %   % Get some stations
 380 |             %   stations = g.station_search('Countries', 'Brazil');
 381 |             %
 382 |             %   % Make 5-dimensional count matrix with default bin edges and no filtering
 383 |             %   resolutions = struct('fld', {'lat', 'lon', 'gph', 'month', 'wspd'});
 384 |             %   N = g.countsN(stations, resolutions);
 385 |             %
 386 |             %   % Add custom bin edges to gph field and limit data to only data between 6
 387 |             %   % and 10 am
 388 |             %   resolutions(3).edges = 0:1:80;
 389 |             %   N = g.countsN(stations, resolutions, 'FilterFields', {'hour'}, ...
 390 |             %                 'FilterRanges', [6 10]);
 391 | 
 392 |             % Return on empty stations
 393 |             if isempty(stations)
 394 |                 warning('Stations is empty, cannot count')
 395 |                 N = []; entries = [];
 396 |                 return
 397 |             end
 398 | 
 399 |             % Parse Name/Value pairs from input
 400 |             p = inputParser;
 401 |             addOptional(p, 'FilterFields', {});
 402 |             addOptional(p, 'FilterRanges', {});
 403 |             addOptional(p, 'Plot', []);
 404 |             addOptional(p, 'Verbose', []);
 405 |             parse(p, varargin{:});
 406 | 
 407 |             % Fill in default bin edges wherever none were supplied
 408 |             for i = 1:length(resolutions)
 409 |                 if ~isfield(resolutions(i), 'edges') || isempty(resolutions(i).edges)
 410 |                     resolutions(i).edges = GCSAL.GCSAL.default_bin_edges(resolutions(i).fld);
 411 |                 end
 412 |             end
 413 | 
 414 |             % Plot is on unless explicitly false; quiet only on explicit Verbose=false
 415 |             obj.plot_mode = ismember('Plot', p.UsingDefaults) || ...
 416 |                             (p.Results.Plot == true);
 417 |             obj.quiet_mode = (~ismember('Verbose', p.UsingDefaults) && ...
 418 |                              (p.Results.Verbose == false));
 419 | 
 420 |             % Restore the default modes even when loading or counting fails,
 421 |             % so a failed call cannot leave the object stuck in quiet mode
 422 |             try
 423 |                 % Load data from stations and all fields
 424 |                 entries = obj.query(stations, {resolutions.fld}, ...
 425 |                     p.Results.FilterFields, p.Results.FilterRanges);
 426 |                 % Get counts
 427 |                 N = GCSAL.GCSAL.histcountsN(entries, resolutions, obj.quiet_mode);
 428 |             catch err
 429 |                 obj.plot_mode = true;
 430 |                 obj.quiet_mode = false;
 431 |                 rethrow(err)
 432 |             end
 433 | 
 434 |             obj.plot_mode = true;
 435 |             obj.quiet_mode = false;
 436 |         end
 437 | 
 438 |         function entries = query(obj, stations, params, fltr_flds, fltr_rngs)
 439 |             % entries = query(obj, stations, params, fltr_flds, fltr_rngs)
 440 |             %   Returns a struct array with each element containing the
 441 |             %   data for params for a station in stations.
 442 |             %
 443 |             %   Finds the data either by reading from the H5 file located
 444 |             %   at obj.h5_fname or by finding the data cached in
 445 |             %   obj.entries or obj.headers.
 446 |             %
 447 |             % INPUTS
 448 |             %     stations - A string, cell array of strings, or struct array
 449 |             %                containing the station ids from which to load
 450 |             %                data
 451 |             %       params - params can be a string or cell array of strings.
 452 |             %                params can be either from the entry data or
 453 |             %                header data but at least one string in params
 454 |             %                must be from entry data
 455 |             %    fltr_flds - (optional) string or cell array of filtering
 456 |             %                parameter names
 457 |             %    fltr_rngs - (optional) filtering range, or cell array of
 458 |             %                filtering ranges, one per name in fltr_flds
 459 |             %
 460 |             %   For a list of available params see g.defs.header.params
 461 |             %   and g.defs.entries.params
 462 |             %
 463 |             %   If params or stations is empty, returns an empty struct
 464 |             %
 465 |             %   Examples:
 466 |             %      % Create GCSAL object
 467 |             %      g = GCSAL.GCSAL('gcsal.h5.info.mat');
 468 |             %
 469 |             %      % Get stations located in Botswana
 470 |             %      stations = g.station_search('Countries', {'Botswana'});
 471 |             %
 472 |             %      % Get all geopotential height and windspeed data as well
 473 |             %      % as hour, month, and year data
 474 |             %      entries1 = g.query(stations, {'gph', 'wspd', 'hour', 'month', 'year'});
 475 |             %
 476 |             %      % Plot distribution of hours and years for the data in entries1
 477 |             %      figure; histogram(vertcat(entries1.hour))
 478 |             %      figure; histogram(vertcat(entries1.year))
 479 |             %
 480 |             %      % Get gph and wspd data measured between 6 am and 4 pm
 481 |             %      entries2 = g.query(stations, {'gph', 'wspd'}, 'hour', [6 16]);
 482 |             %
 483 |             %      % Plot distribution of hours for the data in entries2
 484 |             %      figure; histogram(vertcat(entries2.hour))
 485 |             %
 486 |             %      % Get data corresponding only to measurements taken
 487 |             %      % in August between 4am and Noon and in the years 1990
 488 |             %      % to 1999
 489 |             %      entries3 = g.query(stations, {'gph', 'wspd'}, ...
 490 |             %                         {'month', 'hour', 'year'}, ...
 491 |             %                         {8, [4 12], [1990 1999]});
 492 |             %
 493 |             %      % Plot distribution of years for the data in entries3
 494 |             %      figure; histogram(vertcat(entries3.year))
 495 | 
 496 |             % Return empty struct array if no stations ids or params given
 497 |             if isempty(stations) || isempty(params)
 498 |                 entries = struct([]);
 499 |                 return
 500 |             end
 501 | 
 502 |             % Set filter parameters to empty cell arrays if not given
 503 |             if nargin < 4, fltr_flds = {}; end
 504 |             if nargin < 5, fltr_rngs = {}; end
 505 | 
 506 |             % Normalize params and filter names to row cell arrays of char
 507 |             % vectors. Without this, params given as a single string or as a
 508 |             % column cell array breaks the concatenation and validation below
 509 |             params = reshape(cellstr(params), 1, []);
 510 |             fltr_flds = reshape(cellstr(fltr_flds), 1, []);
 511 |             if ~iscell(fltr_rngs)
 512 |                 fltr_rngs = {fltr_rngs};
 513 |             end
 514 | 
 515 |             % Every filter field needs exactly one matching range
 516 |             if length(fltr_flds) ~= length(fltr_rngs)
 517 |                 error('Expected length of filter_flds and filter_ranges to match')
 518 |             end
 519 | 
 520 |             % Add all restriction range fields to params since they need to
 521 |             % be loaded as well
 522 |             params = unique([params fltr_flds]);
 523 | 
 524 |             % Get all header and entries parameters
 525 |             all_header_params = fieldnames(obj.defs.header.params);
 526 |             all_entries_params = fieldnames(obj.defs.entries.params);
 527 | 
 528 |             % Verify all params can be found in either header or entries
 529 |             found = ismember(params, [all_header_params; all_entries_params]);
 530 |             if any(~found)
 531 |                 msg = sprintf('  %s\n', params{~found});
 532 |                 error('The following params were invalid: %s\n', msg)
 533 |             end
 534 | 
 535 |             % Determine whether each param is part of the header data or
 536 |             % entry data
 537 |             curr_header_params = intersect(params, all_header_params);
 538 |             curr_entries_params = intersect(params, all_entries_params);
 539 | 
 540 |             % Make sure at least one param is in entries group
 541 |             if isempty(curr_entries_params)
 542 |                 error('At least one param must be an entry')
 543 |             end
 544 | 
 545 |             % Ensure stations is a character array
 546 |             station_ids = GCSAL.GCSAL.station_id_str(stations);
 547 | 
 548 |             % Load data from entries
 549 |             entries = obj.load_from_stations('entries', station_ids, curr_entries_params);
 550 | 
 551 |             % Add data from headers
 552 |             entries = obj.add_header_params_to_entries(entries, curr_header_params);
 553 | 
 554 |             % Filter data according to range limits. A tic handle is used so
 555 |             % the reported time is not disturbed by tic calls made elsewhere
 556 |             if (~obj.quiet_mode)
 557 |                 t_filter = tic;
 558 |                 fprintf('Applying filters... ');
 559 |             end
 560 |             for i = 1:length(fltr_rngs)
 561 |                 entries = GCSAL.GCSAL.filter_data_by_range(entries, fltr_flds{i}, fltr_rngs{i});
 562 |             end
 563 |             if (~obj.quiet_mode)
 564 |                 fprintf('Complete in %.1f seconds\n', toc(t_filter));
 565 |             end
 566 | 
 567 |             % Clear entries if nargout is 0 so we don't use any RAM in the
 568 |             % base workspace for "ans"
 569 |             if nargout == 0
 570 |                 entries = [];
 571 |             end
 572 |         end
 573 | 
 574 |         function stations_match = station_search(obj, varargin)
 575 |             % stations_match = station_search(obj, varargin)
 576 |             %   Search for stations by Latitude and Longitude, by Country, by
 577 |             %   station ID, or by proximity to a point. Search criteria are
 578 |             %   given as Name/Value pairs. If multiple criteria are given they
 579 |             %   are combined with AND.
 580 |             %
 581 |             %   Name/Value pairs:
 582 |             %     'LatLong'   - [lat1 lat2 long1 long2] search box in degrees
 583 |             %     'Lat'       - [lat long]; searches the latitude band within
 584 |             %                   'Range' degrees of lat (Range is at least 1.2)
 585 |             %     'Countries' - country name or cell array of country names
 586 |             %     'IDRegex'   - regular expression matched against station IDs
 587 |             %     'Nearest'   - [lat long]; finds the 'Number' stations nearest
 588 |             %                   to that position (Number defaults to 1) and
 589 |             %                   stores each distance in the field arclen
 590 |             %     'Plot'      - pass false to skip drawing the map figure
 591 |             %     'Verbose'   - pass false to suppress printed station counts
 592 |             %
 593 |             % Examples:
 594 |             %    % Create GCSAL object
 595 |             %    g = GCSAL.GCSAL('gcsal.h5.info.mat');
 596 |             %
 597 |             %   % Find stations within 25 degrees of the equator
 598 |             %   stations1 = g.station_search('LatLong', [-25 25 -180 180]);
 599 |             %
 600 |             %   % Find stations in Brazil or India and within 25 degrees of
 601 |             %   % the equator
 602 |             %   stations2 = g.station_search('Countries', {'Brazil', 'India'}, ...
 603 |             %                               'LatLong', [-25 25 -180 180]);
 604 |             %
 605 |             %   % Find stations with IDs beginning with the letter A.
 606 |             %   % Note that in regex ^ means beginning of the line
 607 |             %   stations3 = g.station_search('IDRegex', '^/A');
 608 |             %
 609 |             %   % Find stations in Brazil or India AND within 25 degrees of
 610 |             %   % the equator AND with station IDs ending in 5
 611 |             %   % Note that in regex $ means end of the line
 612 |             %   stations4 = g.station_search('Countries', {'Brazil', 'India'}, ...
 613 |             %                               'LatLong', [-25 25 -180 180], ...
 614 |             %                               'IDRegex', '5$');
 615 | 
 616 |             % Parse varargin
 617 |             p = inputParser;
 618 |             addOptional(p, 'Countries', []);
 619 |             addOptional(p, 'IDRegex', []);
 620 |             addOptional(p, 'Lat', []);
 621 |             addOptional(p, 'LatLong', []);
 622 |             addOptional(p, 'Nearest', []);
 623 |             addOptional(p, 'Number', []);
 624 |             addOptional(p, 'Range', []);
 625 |             addOptional(p, 'Plot', []);
 626 |             addOptional(p, 'Verbose', []);
 627 |             parse(p, varargin{:});
 628 | 
 629 |             obj.plot_mode = ismember('Plot', p.UsingDefaults) || ...
 630 |                             (p.Results.Plot == true);
 631 |             obj.quiet_mode = (~ismember('Verbose', p.UsingDefaults) && ...
 632 |                              (p.Results.Verbose == false));
 633 | 
 634 |             % Plot the map of world with all stations marked
 635 |             if obj.plot_mode
 636 |                 figure; hold all;
 637 |                 obj.plot_world_map();
 638 |             end
 639 | 
 640 |             % Initialize station_ids_match to all station ids
 641 |             ids_match = GCSAL.GCSAL.station_id_str(obj.stations);
 642 |             Lmax = length(ids_match);
 643 | 
 644 |             % If Nearest specified:
 645 |             use_nearest = ~ismember('Nearest', p.UsingDefaults);
 646 |             if use_nearest
 647 |                 num = uint16(p.Results.Number);
 648 |                 if (isempty(num) || (num < 1))
 649 |                     num = 1;
 650 |                 elseif (num > Lmax)
 651 |                     num = Lmax;
 652 |                 end
 653 |                 % Find the nearest stations and report how many were found
 654 |                 [stations_nearest, arclen] = ...
 655 |                     obj.stations_near_latlong(p.Results.Nearest, num);
 656 |                 if (~obj.quiet_mode)
 657 |                     fprintf('%d stations found near lat/long\n', length(stations_nearest))
 658 |                 end
 659 | 
 660 |                 % Keep only stations that also satisfy this criterion
 661 |                 nearest_ids = GCSAL.GCSAL.station_id_str(stations_nearest);
 662 |                 ids_match = intersect(ids_match, nearest_ids, 'rows');
 663 |             end
 664 | 
 665 |             % If LatLong or Lat specified:
 666 |             if (~ismember('LatLong', p.UsingDefaults) || ...
 667 |                     ~ismember('Lat', p.UsingDefaults))
 668 |                 if ~ismember('Lat', p.UsingDefaults)
 669 |                     range = single(p.Results.Range);
 670 |                     if (isempty(range) || (range < 1.2))
 671 |                         range = 1.2;
 672 |                     end
 673 |                     lat = p.Results.Lat(1);
 674 |                     lon = p.Results.Lat(2);
 675 |                     box = [(lat - range) (lat + range) lon lon];
 676 |                 else
 677 |                     box = p.Results.LatLong;
 678 |                 end
 679 | 
 680 |                 % Find stations in lat/long range and report how many were found
 681 |                 [stations_in_range, latbox, longbox] = ...
 682 |                     obj.stations_from_latlong(box);
 683 |                 if (~obj.quiet_mode)
 684 |                     fprintf('%d stations found in lat/long range\n', length(stations_in_range))
 685 |                 end
 686 | 
 687 |                 % Keep only stations that also satisfy this criterion
 688 |                 curr_ids = GCSAL.GCSAL.station_id_str(stations_in_range);
 689 |                 ids_match = intersect(ids_match, curr_ids, 'rows');
 690 | 
 691 |                 % Highlight lat/long search box
 692 |                 if obj.plot_mode
 693 |                     plot(longbox, latbox, 'b-', 'LineWidth', 2)
 694 |                 end
 695 |             end
 696 | 
 697 |             % If Countries specified:
 698 |             if ~ismember('Countries', p.UsingDefaults)
 699 |                 % Find stations in countries and report how many were found
 700 |                 [stations_in_countries, countries_match] = ...
 701 |                     obj.stations_from_countries(p.Results.Countries);
 702 |                 if (~obj.quiet_mode)
 703 |                     fprintf('%d stations found in countries\n', length(stations_in_countries))
 704 |                 end
 705 | 
 706 |                 % Keep only stations that also satisfy this criterion
 707 |                 curr_ids = GCSAL.GCSAL.station_id_str(stations_in_countries);
 708 |                 ids_match = intersect(ids_match, curr_ids, 'rows');
 709 | 
 710 |                 % Highlight border of countries searched
 711 |                 if obj.plot_mode
 712 |                     for i = 1:length(countries_match)
 713 |                         plot(countries_match(i).Lon, countries_match(i).Lat, ...
 714 |                             'b-', 'linewidth', 2)
 715 |                     end
 716 |                 end
 717 |             end
 718 | 
 719 |             % If IDRegex specified:
 720 |             if ~ismember('IDRegex', p.UsingDefaults)
 721 |                 % Find stations matching IDRegex and report how many were found
 722 |                 stations_from_regex = obj.stations_from_regex(p.Results.IDRegex);
 723 |                 if (~obj.quiet_mode)
 724 |                     fprintf('%d stations found matching search_str\n', length(stations_from_regex))
 725 |                 end
 726 | 
 727 |                 % Keep only stations that also satisfy this criterion
 728 |                 curr_ids = GCSAL.GCSAL.station_id_str(stations_from_regex);
 729 |                 ids_match = intersect(ids_match, curr_ids, 'rows');
 730 | 
 731 |                 % Highlight stations matching IDRegex
 732 |                 if obj.plot_mode
 733 |                     GCSAL.GCSAL.plot_stations(stations_from_regex, ...
 734 |                         'bo', 'MarkerSize', 6);
 735 |                 end
 736 |             end
 737 | 
 738 |             % Convert station ids to stations struct array and report the total
 739 |             stations_match = obj.find_stations(ids_match);
 740 |             if (~obj.quiet_mode)
 741 |                 fprintf('%d stations found combined\n', length(stations_match))
 742 |             end
 743 | 
 744 |             % Highlight stations found
 745 |             if obj.plot_mode
 746 |                 GCSAL.GCSAL.plot_stations(stations_match, 'r+');
 747 |             end
 748 | 
 749 |             % Attach distances by id, not by position: stations_match follows
 750 |             % obj.stations order. Assumes arclen(k) pairs with stations_nearest(k)
 751 |             if use_nearest && ~isempty(stations_match)
 752 |                 match_ids = GCSAL.GCSAL.station_id_str(stations_match);
 753 |                 [~, i_near] = ismember(match_ids, nearest_ids, 'rows');
 754 |                 for i = 1:length(stations_match)
 755 |                     stations_match(i).arclen = arclen(i_near(i));
 756 |                 end
 757 |             end
 758 | 
 759 |             obj.plot_mode = true;
 760 |             obj.quiet_mode = false;
 761 |         end
 762 | 
 763 |         function plot_world_map(obj, include_stations)
 764 |             % plot_world_map(obj, include_stations)
 765 |             %   Draws the world map from the country borders in obj.countries
 766 |             %   (via GCSAL.Map.world_map).
 767 |             %
 768 |             %   include_stations - (optional, default true) when true, every
 769 |             %       station in obj.stations is also marked using the 'k.' style
 770 | 
 771 |             % Stations are marked unless the caller opts out
 772 |             if nargin < 2
 773 |                 include_stations = true;
 774 |             end
 775 | 
 776 |             % Country borders first
 777 |             GCSAL.Map.world_map(obj.countries);
 778 | 
 779 |             % Station markers on top of the map
 780 |             if include_stations
 781 |                 obj.plot_stations(obj.stations, 'k.');
 782 |             end
 783 |         end
 784 | 
 785 |         function country_matches = find_countries(obj, country_names)
 786 |             % country_matches = find_countries(obj, country_names)
 787 |             %   Looks up countries by name, ignoring case. Returns the
 788 |             %   elements of obj.countries with a name matching any string
 789 |             %   in country_names.
 790 |             %
 791 |             %   country_names - string or cell array of strings
 792 | 
 793 |             % Lower-case both sides so the comparison ignores case
 794 |             wanted_names = lower(country_names);
 795 |             known_names = lower({obj.countries.name});
 796 | 
 797 |             % Locate the requested names among the known ones
 798 |             hit = GCSAL.GCSAL.find_keys(known_names, wanted_names);
 799 | 
 800 |             % Pull out the matching country structs
 801 |             country_matches = obj.countries(hit);
 802 | 
 803 |         end
 804 | 
 805 |         function station_matches = find_stations(obj, station_ids)
 806 |             % station_matches = find_stations(obj, station_ids)
 807 |             %   Selects the stations whose id is listed in station_ids.
 808 |             %   station_ids can be a string or cell array of strings. The
 809 |             %   result is the matching subset of the struct array
 810 |             %   obj.stations.
 811 | 
 812 |             % Ids of every station held in obj.stations
 813 |             known_ids = GCSAL.GCSAL.station_id_str(obj.stations);
 814 | 
 815 |             % Locate the requested ids among the known ones
 816 |             hit = GCSAL.GCSAL.find_keys(known_ids, station_ids);
 817 | 
 818 |             % Pull out the matching station structs
 819 |             station_matches = obj.stations(hit);
 820 | 
 821 |         end
 822 | 
 823 |         function [header_matches, i_header_match] = find_headers(obj, stations)
 824 |             % [header_matches, i_header_match] = find_headers(obj, stations)
 825 |             %   Selects the elements of obj.headers whose id matches the
 826 |             %   station ids in stations (anything accepted by
 827 |             %   GCSAL.GCSAL.station_id_str). Also returns i_header_match,
 828 |             %   the find_keys result used to index obj.headers.
 829 | 
 830 |             % Ids to look for, normalized with station_id_str
 831 |             wanted_ids = GCSAL.GCSAL.station_id_str(stations);
 832 | 
 833 |             % Ids of all headers held in obj.headers
 834 |             header_ids = GCSAL.GCSAL.station_id_str(obj.headers);
 835 | 
 836 |             % Find matches and index into headers
 837 |             i_header_match = GCSAL.GCSAL.find_keys(header_ids, wanted_ids);
 838 |             header_matches = obj.headers(i_header_match);
 839 |         end
 840 | 
 841 |         function def = find_def(obj, varname)
 842 |             % def = find_def(obj, varname)
 843 |             %   Looks up the definition struct of the parameter named
 844 |             %   varname. The groups in obj.defs are visited in fieldnames
 845 |             %   order and the first group whose params contain varname
 846 |             %   supplies the result. Raises an error when no group has a
 847 |             %   parameter with that name.
 848 | 
 849 |             % Names of the parameter groups held in obj.defs
 850 |             group_names = fieldnames(obj.defs);
 851 | 
 852 |             for k = 1:numel(group_names)
 853 |                 % Parameter definitions of the current group
 854 |                 group_params = obj.defs.(group_names{k}).params;
 855 | 
 856 |                 % Stop at the first group that defines varname
 857 |                 if any(strcmp(varname, fieldnames(group_params)))
 858 |                     def = group_params.(varname);
 859 |                     return
 860 |                 end
 861 |             end
 862 | 
 863 |             % Falling out of the loop means no group defined varname
 864 |             error('Could not find defition for %s', varname)
 865 |         end
 866 | 
 867 |         function clear_entries_cache(obj)
 868 |             % Resets the cached data in obj.entries to an empty struct. Do
 869 |             % this if you are running out of RAM
 870 | 
 871 |             obj.entries = struct();
 872 |         end
 873 | 
 874 |     end
 875 | 
 876 |     methods (Access = 'private')
 877 | 
 878 |         function entries = add_header_params_to_entries(obj, entries, params)
 879 |             % entries = add_header_params_to_entries(obj, entries, params)
 880 |             %   For each entry in entries and each param in params, adds
 881 |             %   the data in header.(param) to entry.(param). The header is
 882 |             %   found based on matching station id. The data in
 883 |             %   header.(param) is expanded to match the length of data in
 884 |             %   entry based on a correspondence index.
 885 |             %
 886 |             %   If params is not given, then all params in header will be
 887 |             %   added. If params is given but empty, entries is returned
 888 |             %   unchanged.
 889 |             %
 890 |             %   entries - struct array of entry data, one element per station
 891 |             %   params  - (optional) string or cell array of header field names
 892 |             %   Progress output and the waitbar are skipped in quiet mode.
 893 | 
 894 |             % Check whether params was an input BEFORE touching it: testing
 895 |             % isempty(params) first would raise an undefined-variable error
 896 |             % when params is omitted, making the add-all case unreachable
 897 |             add_all_params = nargin < 3;
 898 |             if ~add_all_params
 899 |                 % If params is empty, then there is nothing to add, just return
 900 |                 if isempty(params)
 901 |                     return
 902 |                 end
 903 | 
 904 |                 % Ensure params is a cell array
 905 |                 params = cellstr(params);
 906 |             end
 907 | 
 908 |             % Find indices of headers that match station id with entries
 909 |             [~, i_header_match] = obj.find_headers(GCSAL.GCSAL.station_id_str(entries));
 910 | 
 911 |             % Convert logical array to indices
 912 |             i_header_match = find(i_header_match);
 913 | 
 914 |             % Add entry_idx to all headers that match entries
 915 |             obj.add_entry_idx_to_headers(i_header_match);  %#ok<FNDSB>
 916 | 
 917 |             % Determine whether to do waitbar
 918 |             L = length(entries);
 919 |             do_waitbar = (L > 1) && ~obj.quiet_mode;
 920 |             if do_waitbar
 921 |                 h = waitbar(0, 'Adding header fields to entries');
 922 |             end
 923 | 
 924 |             % Use a tic handle so timers started elsewhere cannot skew the report
 925 |             if (~obj.quiet_mode)
 926 |                 t_start = tic;
 927 |                 fprintf('Adding header fields to entries... ');
 928 |             end
 929 | 
 930 |             % Loop through stations
 931 |             for i = 1:L
 932 | 
 933 |                 % Get header for current station
 934 |                 header = obj.find_headers(entries(i).id);
 935 | 
 936 |                 % If add_all_params set params to all fields in header
 937 |                 if add_all_params
 938 |                     params = fieldnames(header);
 939 |                 end
 940 | 
 941 |                 % Loop through params
 942 |                 for j = 1:length(params)
 943 | 
 944 |                     % Get the current field name from params
 945 |                     fld = params{j};
 946 | 
 947 |                     % Try reading from cached
 948 |                     val = obj.read_from_cached_entries(header.id, fld);
 949 | 
 950 |                     % If not found in cache, read from header
 951 |                     if isempty(val)
 952 | 
 953 |                         % Get the data in header at fld
 954 |                         val = header.(fld);
 955 | 
 956 |                         if size(val, 1) == 1
 957 |                             % If val is a single row we need to duplicate it to the
 958 |                             % size of the entry data
 959 |                             val = repmat(val, length(header.entry_idx), 1);
 960 |                         else
 961 |                             % Apply entry_idx to val to expand val data to
 962 |                             % match length of entry data with correct
 963 |                             % correspondence
 964 |                             val = val(header.entry_idx);
 965 |                         end
 966 | 
 967 |                         % Cache the expanded data for later queries
 968 |                         obj.cache_param(header.id, fld, val);
 969 | 
 970 |                     end
 971 | 
 972 |                     % add data to entries for return struct array
 973 |                     entries(i).(fld) = val;
 974 | 
 975 |                 end
 976 | 
 977 |                 % Update waitbar
 978 |                 if do_waitbar && mod(i, ceil(L/50)) == 0
 979 |                     msg = sprintf('%d/%d: Adding header fields to entries for %s', i, L, header.id);
 980 |                     waitbar(i/L, h, msg);
 981 |                 end
 982 |             end
 983 | 
 984 |             if (~obj.quiet_mode)
 985 |                 fprintf('Complete in %.1f seconds\n', toc(t_start));
 986 |             end
 987 | 
 988 |             % Close waitbar
 989 |             if do_waitbar
 990 |                 close(h);
 991 |             end
 992 | 
 993 |         end
 994 | 
 995 |         function [stations_nearest, arclen] = ...
 996 |                 stations_near_latlong(obj, latlonposn, n)
 997 |             % [stations_nearest, arclen] = ...
 998 |             %      stations_near_latlong(obj, latlonposn, n)
 999 |             %   Returns an array of n station structs for the stations that
1000 |             %   are nearest the queried position.
1001 |             %   Additionally returns an array of arclength distances in meters
1002 |             %
1003 |             %   latlonposn must be a 2 element vector and is
1004 |             %   in degrees. Example:
1005 |             %          latlonposn = [9.999924 -84.205753]
1006 |             %
1007 | 
1008 |             % Return on empty input. arclen has to be assigned as well, or a
1009 |             % caller asking for both outputs hits an unassigned-output error
1010 |             if isempty(latlonposn)
1011 |                 stations_nearest = struct();
1012 |                 arclen = [];
1013 |                 return
1014 |             end
1015 | 
1016 |             % error check lat/long position
1017 |             if length(latlonposn) ~= 2
1018 |                 error('Lat Lon Position must be a 2 element vector')
1019 |             end
1020 | 
1021 |             % Find the n stations nearest to the position and their distances
1022 |             [stations_nearest, arclen] = GCSAL.Map.find_nearest(...
1023 |                 obj.stations, latlonposn(1), latlonposn(2), n);
1024 |         end
1025 |         
1026 |         function [stations_in_range, latbox, longbox] = ...
1027 |                 stations_from_latlong(obj, latlongrange)
1028 |             % [stations_in_range, latbox, longbox] = ...
1029 |             %      stations_from_latlong(obj, latlongrange)
1030 |             %   Returns an array of station structs for stations that are
1031 |             %   located within the box defined by latlongrange.
1032 |             %
1033 |             %   Additionally returns longbox and latbox which can be used
1034 |             %   to plot the the searchbox that was used.
1035 |             %
1036 |             %   latlongrange must be a four element vector and is
1037 |             %   in degrees. Example:
1038 |             %          latlongrange = [-45 45 -180 180]
1039 |             %   would find all stations between -45 and 45 deg latitude
1040 |             %
1041 |             %   latlongrange does account for angle wrap around. Example:
1042 |             %          latlongrange = [45 -45 -180 180]
1043 |             %   would finda all stations with latitude above 45 deg or
1044 |             %   below -45.
1045 | 
1046 |             % Return on empty input
1047 |             if isempty(latlongrange)
1048 |                 stations_in_range = struct();
1049 |                 return
1050 |             end
1051 | 
1052 |             % error check lat/long range
1053 |             if length(latlongrange) ~= 4
1054 |                 error('latlongrange must be a 4 element vector')
1055 |             end
1056 | 
1057 |             % Find stations in range as well as getting lat/long vectors
1058 |             % for plotting the search box
1059 |             [stations_in_range, latbox, longbox] = GCSAL.Map.find_in_lat_long_range(...
1060 |                 obj.stations, latlongrange(1:2), latlongrange(3:4));
1061 | 
1062 |         end
1063 | 
1064 |         function [stations_in_countries, countries_match] = ...
1065 |                 stations_from_countries(obj, country_names)
1066 |             % [stations_in_countries, countries_match] = stations_from_countries(obj, country_names)
1067 |             %   Returns an array of station structs for stations that
1068 |             %   are located within the countries listed in country_names.
1069 |             %   Additionally returns a struct array for countries that
1070 |             %   match country_names
1071 | 
1072 |             % Get struct array of countries from countries matching
1073 |             % country_names
1074 |             countries_match = obj.find_countries(country_names);
1075 | 
1076 |             % Get list of all stations in matching countries
1077 |             station_ids = vertcat(countries_match.stations);
1078 | 
1079 |             % In case where no stations or countries were found ensure
1080 |             % station_ids is an empyt string
1081 |             if isempty(station_ids); station_ids = ''; end
1082 | 
1083 |             % Convert station ids to stations struct array
1084 |             stations_in_countries = obj.find_stations(station_ids);
1085 | 
1086 |         end
1087 | 
1088 |         function station_matches = stations_from_regex(obj, search_str)
1089 |             % station_matches = find_stations_regex(obj, search_str)
1090 |             %   Returns an array of station structs for stations whose ids
1091 |             %   match the regex pattern in search_str
1092 | 
1093 |             % Get all station ids in obj.stations
1094 |             all_station_ids = GCSAL.GCSAL.station_id_str(obj.stations);
1095 | 
1096 |             % Convert to cell array
1097 |             all_station_ids = cellstr(all_station_ids);
1098 | 
1099 |             % Call regexp
1100 |             regex_out = regexp(all_station_ids, search_str);
1101 | 
1102 |             %  Use cellfun to find which elements in all_station_ids had a
1103 |             %  match
1104 |             i_station_match = ~cellfun(@isempty, regex_out);
1105 | 
1106 |             % Index into stations
1107 |             station_matches = obj.stations(i_station_match);
1108 | 
1109 |         end
1110 | 
1111 |         function headers = load_all_headers(obj)
1112 |             % Find all headers in the h5_info struct and load the data from
1113 |             % the h5 file for those headers
1114 | 
1115 |             % Find all station names in h5 info struct
1116 |             all_station_ids = {obj.h5_info.Groups.Name};
1117 | 
1118 |             % Remove / from beginning of station ids
1119 |             all_station_ids(:,1) = [];
1120 | 
1121 |             % Use empty params to indicate we want to load all parameters
1122 |             params = {};
1123 |             headers = obj.load_from_stations('header', all_station_ids, params);
1124 |         end
1125 | 
1126 | 
1127 | 
1128 | 
1129 |         function out = load_from_stations(obj, group, station_ids, params)
1130 |             % Load the parameters listed in params from the data in group from
1131 |             % the H5 file for all stations in station_ids.
1132 |             %
1133 |             % If params is empty then all parameters will be loaded
1134 | 
1135 |             % Set default params to empty cell which will revert to loading
1136 |             % all parameters
1137 |             if ~exist('params', 'var')
1138 |                 params = {};
1139 |             end
1140 | 
1141 |             % Handle case where station_ids are empty
1142 |             if isempty(station_ids)
1143 |                 out = [];
1144 |                 return
1145 |             end
1146 | 
1147 |             % Ensure station_ids is a cell array
1148 |             station_ids = cellstr(station_ids);
1149 | 
1150 |             % Initialize counter
1151 |             count = 1;
1152 | 
1153 |             % Decide whether to do wait bar
1154 |             L = length(station_ids);
1155 |             do_waitbar = (L > 1) && ~obj.quiet_mode;
1156 | 
1157 |             % Open waitbar
1158 |             if do_waitbar
1159 |                 h = waitbar(0, 'Loading data from stations');
1160 |             end
1161 |             
1162 |             if (~obj.quiet_mode)
1163 |                 tic;
1164 |                 fprintf('Loading data from stations... ');
1165 |             end
1166 | 
1167 |             % Loop through all station ids
1168 |             for i = 1:L
1169 | 
1170 |                 % Attempt to read group data
1171 |                 tmp = obj.load_group(group, station_ids{i}, params);
1172 | 
1173 |                 % Assign data to out struct if tmp is not empty
1174 |                 if ~isempty(tmp)
1175 |                     % Assign data
1176 |                     out(count) = tmp; %#ok<AGROW>
1177 | 
1178 |                     % Increment counter
1179 |                     count = count + 1;
1180 |                 end
1181 | 
1182 |                 % Update waitbar
1183 |                 if do_waitbar && mod(i, ceil(L/50)) == 0
1184 |                     msg = sprintf('%d/%d: Loading data for %s/%s', i, L, station_ids{i}, group);
1185 |                     waitbar(i/L, h, msg);
1186 |                 end
1187 |             end
1188 |             if (~obj.quiet_mode)
1189 |                 fprintf('Complete in %.1f seconds\n', toc);
1190 |             end
1191 | 
1192 |             % Close wait bar
1193 |             if do_waitbar
1194 |                 close(h)
1195 |             end
1196 | 
1197 |             % If counter never incremented, return empty struct array
1198 |             if count == 1
1199 |                 out = struct([]);
1200 |             end
1201 |         end
1202 | 
1203 |         function out = load_group(obj, group, station_id, params)
1204 |             % Load the parameters listed in params from the data in group
1205 |             % and from the station in station_id from the H5 file
1206 |             %
1207 |             % If params is empty then all parameters are loaded
1208 | 
1209 | 
1210 |             % Extract parameter definitions for the current group
1211 |             param_defs = obj.defs.(group);
1212 | 
1213 |             % Set default params to all parameters
1214 |             if ~exist('params', 'var') || isempty(params)
1215 |                 params = fieldnames(param_defs.params);
1216 |             end
1217 | 
1218 |             % Initialize output
1219 |             out = [];
1220 | 
1221 |             % Find info for the current station_id in the top level h5_info
1222 |             station_info = GCSAL.GCSAL.h5info_find_children(obj.h5_info, station_id);
1223 | 
1224 |             % If station_info is empty return with warning that station_id
1225 |             % was not found
1226 |             if isempty(station_info)
1227 |                 fprintf('%s not found\n', station_id)
1228 |                 return
1229 |             end
1230 | 
1231 |             % Find group in station_info
1232 |             group_info = GCSAL.GCSAL.h5info_find_children(station_info, group);
1233 | 
1234 |             % Throw error if group not found
1235 |             if isempty(group_info)
1236 |                 error('Group not found: %s', group)
1237 |             end
1238 | 
1239 |             % For each parameter in params, load the param
1240 |             for i = 1:length(params)
1241 |                 curr = param_defs.params.(params{i});
1242 |                 out.(curr.varname) = obj.load_param(curr, group_info);
1243 |             end
1244 | 
1245 |             % Add id to struct so that the header data for this struct can
1246 |             % be easily found
1247 |             if ~isfield(out, 'id')
1248 |                 out.id = station_id;
1249 |             end
1250 |         end
1251 | 
1252 | 
1253 |         function data = load_param(obj, param_def, group_info)
1254 |             % Load the data corresponding to param_def and group_info.
1255 |             % Apply data conversion and function_handle as
1256 |             % specified in param_def
1257 |             %
1258 |             % INPUTS
1259 |             %    param_def - struct containing varname, type, and
1260 |             %                function_handle for the parameter to be read
1261 |             %   group_info - Child struct from h5info call on h5_fname that
1262 |             %                points to the data of interest
1263 | 
1264 | 
1265 |             % Get station id from group info
1266 |             info_for_fileparts = group_info.Name;
1267 |             info_for_fileparts = strrep(info_for_fileparts, '/', filesep);
1268 |             [id, group] = fileparts(info_for_fileparts);
1269 |             id(1) = [];
1270 | 
1271 |             % Try reading data from cached entries
1272 |             data = obj.read_from_cached_entries(id, param_def.varname);
1273 | 
1274 |             % If data is not empty, it param was found in cached entries so
1275 |             % we can return
1276 |             if ~isempty(data); return; end
1277 | 
1278 |             % Find info for the param in group_info based on its varname
1279 |             param_info = GCSAL.GCSAL.h5info_find_children(group_info, param_def.varname);
1280 | 
1281 |             % If param not found, return empty
1282 |             if isempty(param_info)
1283 |                 data = [];
1284 |                 return
1285 |             end
1286 | 
1287 |             % load the parameter from the H5 file using param_info
1288 |             data = GCSAL.H5.load(obj.h5_fname, param_info);
1289 | 
1290 |             % Parameter is a char, convert uint8 to char
1291 |             if strcmp(param_def.type, 'char')
1292 |                 data = char(data);
1293 |             end
1294 | 
1295 |             % If parameter was returned as a double from H5.load, but is
1296 |             % not defined as a double then convert to a single for
1297 |             % efficiency
1298 |             if isa(data, 'double') && ~strcmp(param_def.type, 'double')
1299 |                 data = single(data);
1300 |             end
1301 | 
1302 |             % Apply function from parameter definition
1303 |             if ~isempty(param_def.function_handle)
1304 |                 data = param_def.function_handle(data);
1305 |             end
1306 | 
1307 |             % Cache the data in entries
1308 |             % Since obj is a pointer (inherits handle class) we can cache
1309 |             % the data without returning it
1310 |             if ~strcmp(group, 'header')
1311 |                 obj.cache_param(id, param_def.varname, data);
1312 |             end
1313 | 
1314 |         end
1315 | 
1316 | 
1317 |         function out = read_from_cached_entries(obj, id, param)
1318 |             % Read data from cached entries if it exists otherwise return
1319 |             % empty vector
1320 | 
1321 |             out = [];
1322 |             if isfield(obj.entries, id)
1323 |                 if isfield(obj.entries.(id), param)
1324 |                     out = obj.entries.(id).(param);
1325 |                 end
1326 |             end
1327 | 
1328 |         end
1329 | 
1330 |         function cache_param(obj, id, param, value)
1331 |             % Keep data in memory in entries struct for fast loading
1332 |             % Since obj is a pointer (inherits handle class) we can set the
1333 |             % obj.entries without returning it
1334 | 
1335 |             if obj.do_cache_entries
1336 |                 obj.entries.(id).(param) = value;
1337 |             end
1338 |         end
1339 | 
1340 |         function add_entry_idx_to_headers(obj, i_headers)
1341 |             % add_entry_idx_to_headers(obj, i_headers)
1342 |             %   For the structs in obj.headers(i_headers), add the
1343 |             %   entry_idx field. This field is an indexing vector for
1344 |             %   the correspondence between header data and entry data.
1345 | 
1346 | 
1347 |             % Decide whether to do wait bar
1348 |             L = length(i_headers);
1349 |             do_waitbar = (L > 1) && ~obj.quiet_mode;
1350 | 
1351 |             % Open waitbar
1352 |             if do_waitbar
1353 |                 h = waitbar(0, 'Calculating header to entry idx');
1354 |             end
1355 | 
1356 |             % Loop through indices in i_headres
1357 |             for i = 1:length(i_headers)
1358 | 
1359 |                 % Extract the current header
1360 |                 header = obj.headers(i_headers(i));
1361 | 
1362 |                 % Check if entry_idx has already been added to this header
1363 |                 if ~isfield(obj.headers, 'entry_idx') || isempty(header.entry_idx)
1364 | 
1365 |                     % Get the entry_idx that corresponds header to entry
1366 |                     % data
1367 |                     entry_idx = GCSAL.GCSAL.header_to_entry_idx(header);
1368 | 
1369 |                     % Convert entry_idx to the smallest possible type
1370 |                     obj.headers(i_headers(i)).entry_idx = GCSAL.IGRA.Param.convert_to_min_int(entry_idx);
1371 | 
1372 |                     % Update waitbar
1373 |                     if do_waitbar && mod(i, ceil(L/50)) == 0
1374 |                         msg = sprintf('%d/%d: Calculating header to entry idx for %s', i, L, header.id);
1375 |                         waitbar(i/L, h, msg);
1376 |                     end
1377 |                 end
1378 |             end
1379 | 
1380 |             % Close wait bar
1381 |             if do_waitbar
1382 |                 close(h)
1383 |             end
1384 |         end
1385 |     end
1386 | 
1387 |     methods (Static)
1388 | 
1389 |         function entries = filter_data_by_range(entries, range_fld, range)
1390 |             % entries = filter_data_by_range(entries, range_fld, range)
1391 |             %   Filter the data in entries to keep only instances where
1392 |             %   entries.(range_fld) is in range.
1393 |             %
1394 |             %   range can be a two element vector in which case data can be
1395 |             %   anywhere between range(1) and range(2) inclusive. Or range
1396 |             %   can be a scalar in which case data must be exactly equal to
1397 |             %   range.
1398 | 
1399 | 
1400 |             % Error check on length of range
1401 |             if length(range) ~= 1 && length(range) ~=2
1402 |                 error('range should be length 1 or 2')
1403 |             end
1404 | 
1405 |             % Loop through stations
1406 |             L = length(entries);
1407 |             for i = 1:L
1408 | 
1409 |                 % Extract the data in range_fld
1410 |                 val = entries(i).(range_fld);
1411 | 
1412 |                 % Get index vector for values that are in range
1413 |                 if isscalar(range)
1414 |                     % If range is a scalar match exactly
1415 |                     idx = val == range;
1416 |                 else
1417 |                     % If range is two element vector  match between
1418 |                     % range(1) and range(2) inclusive
1419 |                     idx = val >= range(1) & val <= range(2);
1420 |                 end
1421 | 
1422 |                 % Loop through each parameter in the current entry and apply index
1423 |                 params = fieldnames(entries(i));
1424 |                 for j = 1:length(params)
1425 | 
1426 |                     % Apply index
1427 |                     if size(entries(i).(params{j}), 1) ~= 1
1428 |                         entries(i).(params{j}) = entries(i).(params{j})(idx);
1429 |                     end
1430 |                 end
1431 | 
1432 |             end
1433 | 
1434 |         end
1435 | 
1436 |         function [counts] = histcounts(entries, fld, edges, quiet_mode)
1437 |             % counts = histcounts(entries, edges, x_fld)
1438 |             %  Pulls the data located in fld for every element
1439 |             %  in entries and returns the counts in each bin constructed by
1440 |             %  bin edges
1441 | 
1442 |             if (quiet_mode)
1443 |                 tic;
1444 |                 fprintf('Counting %s...', fld );
1445 |             end
1446 |             
1447 |             % Force first and last bins to include -/+ inf
1448 |             edges(1) = -inf;
1449 |             edges(end) = inf;
1450 | 
1451 |             % Pre-allocate counts matrix with all zeros. There are one
1452 |             % fewer bins than edges on each side of bin grid
1453 |             Nrows = length(edges)-1;
1454 |             counts = zeros(1, Nrows);
1455 | 
1456 |             % Loop through each entry
1457 |             for i = 1:length(entries)
1458 | 
1459 |                 % Extract data for x and y from entries
1460 |                 x = entries(i).(fld);
1461 | 
1462 |                 % Count x with bins defined by edges and add the
1463 |                 % result to the existing counts
1464 |                 counts = counts + histcounts(x, edges);
1465 | 
1466 |             end
1467 |             
1468 |             if (quiet_mode)
1469 |                 fprintf('Complete in %.1f seconds\n', toc);
1470 |             end
1471 | 
1472 |             % The following is a simpler and more vectorized way to do the
1473 |             % same as above but surpisingly, testing proved that the above
1474 |             % is faster
1475 | %             counts = histcounts(vertcat(entries.(fld)), edges);
1476 | 
1477 |         end
1478 | 
1479 |         function [counts] = histcounts2(entries, x_fld, y_fld, x_edges, y_edges, quiet_mode)
1480 |             % counts = histcounts(entries, x_edges, y_edges, x_fld, y_fld)
1481 |             %  Pulls the data located in x_fld and y_fld for every element
1482 |             %  in entries and returns the counts in each bin constructed by
1483 |             %  bin edges defined in x_edges and y_edges
1484 | 
1485 |             if (~quiet_mode)
1486 |                 tic;
1487 |                 fprintf('Counting %s vs %$s...', x_fld, y_fld );
1488 |             end
1489 | 
1490 |             % Force first and last bins to include -/+ inf
1491 |             x_edges(1) = -inf;
1492 |             x_edges(end) = inf;
1493 |             y_edges(1) = -inf;
1494 |             y_edges(end) = inf;
1495 | 
1496 |             % Pre-allocate counts matrix with all zeros. There are one
1497 |             % fewer bins than edges on each side of bin grid
1498 |             Nrows = length(x_edges)-1;
1499 |             Ncols = length(y_edges)-1;
1500 |             counts = zeros(Nrows, Ncols);
1501 | 
1502 |             % Loop through each entry
1503 |             for i = 1:length(entries)
1504 | 
1505 |                 % Extract data for x and y from entries
1506 |                 x = entries(i).(x_fld);
1507 |                 y = entries(i).(y_fld);
1508 | 
1509 |                 % Count x and y in grid made of x/y edges and add the
1510 |                 % result to the existing counts
1511 |                 counts = counts + histcounts2(x, y, x_edges, y_edges);
1512 |             end
1513 | 
1514 |             if (~quiet_mode)
1515 |                 fprintf('Complete in %.1f seconds\n', toc);
1516 |             end
1517 | 
1518 |             % The following is a simpler and more vectorized way to do the
1519 |             % same as above but surpisingly, testing proved that the above
1520 |             % is faster
1521 | %             x = vertcat(entries.(x_fld));
1522 | %             y = vertcat(entries.(y_fld));
1523 | %             counts = histcounts2(x, y, edges);
1524 | 
1525 |         end
1526 | 
1527 |         function [counts] = histcountsN(entries, resolutions, quiet_mode)
1528 |             % counts = histcounts(entries, edges, x_fld)
1529 |             %  Pulls the data located in fld for every element
1530 |             %  in entries and returns the counts in each bin constructed by
1531 |             %  bin edges
1532 | 
1533 |             if (quiet_mode)
1534 |                 tic;
1535 |                 msg = sprintf('  %s\n', resolutions.fld);
1536 |                 fprintf('Counting... \n%s', msg)
1537 |             end
1538 |             
1539 |             % Pre-allocate counts matrix with all zeros. There are one
1540 |             % fewer bins than edges on each side of bin grid
1541 |             N = cell(size(resolutions));
1542 |             for j = 1:length(resolutions)
1543 |                 N{j} = length(resolutions(j).edges) - 1;
1544 |             end
1545 |             counts = zeros(N{:}, 'uint32');
1546 | 
1547 | 
1548 |             % Determine whether to do waitbar
1549 |             L = length(entries);
1550 |             do_waitbar = (L > 1) && ~quiet_mode;
1551 |             if do_waitbar
1552 |                 h = waitbar(0, 'Doing counts');
1553 |             end
1554 | 
1555 |             % Loop through each entry
1556 |             for i = 1:length(entries)
1557 | 
1558 |                 % Initialize bins to cell array of proper size
1559 |                 bins = cell(size(resolutions));
1560 | 
1561 |                 % Loop through parameter resolutions
1562 |                 for j = 1:length(resolutions)
1563 |                     fld = resolutions(j).fld;
1564 |                     edges = resolutions(j).edges;
1565 | 
1566 |                     % Force first and last bins to include -/+ inf
1567 |                     edges(1) = -inf;
1568 |                     edges(end) = inf;
1569 | 
1570 |                     % Extract data for x and y from entries
1571 |                     x = entries(i).(fld);
1572 | 
1573 |                     % Use discretize to determine bin index for every data
1574 |                     % point in x
1575 |                     bins{j} = discretize(x, edges);
1576 | 
1577 |                 end
1578 | 
1579 |                 % Add to counts for each idx made by bins
1580 |                 try
1581 |                     idx = sub2ind(size(counts), bins{:});
1582 |                     idx(isnan(idx)) = [];
1583 |                     for j = 1:length(idx)
1584 |                         counts(idx(j)) = counts(idx(j)) + 1;
1585 |                     end
1586 |                 catch e
1587 |                     fprintf(['Counts encountered an error with the following ' ...
1588 |                         'station so it was skipped: %s\n'], entries(i).id);
1589 |                     disp(e.identifier)
1590 |                     disp(e.message)
1591 |                     %                     keyboard
1592 |                 end
1593 | 
1594 |                 % Update waitbar
1595 |                 if do_waitbar && mod(i, ceil(L/50)) == 0
1596 |                     msg = sprintf('%d/%d: Doing counts for %s', i, L, entries(i).id);
1597 |                     waitbar(i/L, h, msg);
1598 |                 end
1599 | 
1600 |             end
1601 | 
1602 |             if (quiet_mode)
1603 |                 fprintf('complete in %.1f seconds\n', toc)
1604 |             end
1605 | 
1606 |             % Close wait bar
1607 |             if do_waitbar
1608 |                 close(h)
1609 |             end
1610 | 
1611 |             % Convert to smallest possible integer data type to save space
1612 |             counts = GCSAL.IGRA.Param.convert_to_min_int(counts);
1613 | 
1614 |         end
1615 | 
1616 |         function [pdf, cdf] = counts2pdf(counts, dim)
1617 |             % [pdf, cdf] = counts2percentile_pdf(counts, dim)
1618 |             %    For a matrix of counts, calculates the probability density
1619 |             %    function along dimension dim.
1620 |             %
1621 |             %    Conceptually if counts were a vector then
1622 |             %             pdf = counts / sum(counts)
1623 |             %
1624 |             %    This function does the calculation but in a vectorized way
1625 |             %    along dimension dim.
1626 |             %
1627 |             %    Additionally calculates the cumulative density function as
1628 |             %    cdf = cumsum(pdf, dim)
1629 | 
1630 |             % Get total counts in each row/column
1631 |             total_counts = sum(counts, dim);
1632 | 
1633 |             % Normalize counts to get probability density function
1634 |             pdf = bsxfun(@rdivide, counts, total_counts);
1635 | 
1636 |             % Accumulat pdf to get cdf
1637 |             cdf = cumsum(pdf, dim);
1638 | 
1639 |         end
1640 | 
1641 |         function bin_centers = get_bin_centers(bin_edges)
1642 |             % bin_centers = get_bin_centers(bin_edges)
1643 |             %   Returns bin_centers corresponding to midway value between
1644 |             %   each pair of edges in bin_edges
1645 | 
1646 |             left_edge = bin_edges(1:end-1);
1647 |             right_edge = bin_edges(2:end);
1648 | 
1649 |             bin_centers = (right_edge + left_edge) / 2;
1650 | 
1651 |         end
1652 | 
1653 |         function idx = find_keys(key_index, keys_to_find)
1654 |             % idx = find_keys(key_index, keys_to_find)
1655 |             %   Returns logical array idx indicating which elements in
1656 |             %   key_index match any string in keys_to_find.
1657 |             %
1658 |             %   key_index and keys_to_find can be strings, string matrices,
1659 |             %   or cell arrays of strings.
1660 |             %
1661 |             %   Warning is thrown if not all elements in keys_to_find are
1662 |             %   found in key_index
1663 | 
1664 |             % Handle case where keys_to_find is empty. This is
1665 |             % required because cellstr turns emptys strings into {''} which
1666 |             % is not an empty cell but rather has length 1
1667 |             if isempty(keys_to_find)
1668 |                 keys_to_find = {};
1669 |             end
1670 | 
1671 |             % Ensure keys and key_array are cell arrays
1672 |             keys_to_find = cellstr(keys_to_find);
1673 |             key_index = cellstr(key_index);
1674 | 
1675 |             % Use ismember to get logical array for existence of each
1676 |             % element of key_index present in keys
1677 |             idx = ismember(key_index, keys_to_find);
1678 | 
1679 |             % Warning check that all keys_to_find were found
1680 |             keys_not_found_idx = ~ismember(keys_to_find, key_index(idx));
1681 | 
1682 |             if any(keys_not_found_idx)
1683 |                 keys_not_found = keys_to_find(keys_not_found_idx);
1684 |                 msg = sprintf('  %s\n', keys_not_found{:});
1685 |                 warning('The following keys were not found: \n%s', msg)
1686 |             end
1687 |         end
1688 | 
1689 |         function info_matches = h5info_find_children(info, child_name)
1690 |             % info_matches = h5info_find_children(info, child_name)
1691 |             %   Returns a struct array corresponding to elements of
1692 |             %   info.Groups with a Name matching any string in
1693 |             %   child_name. child_name can be a string or cell array
1694 |             %   of strings. info should be part of the data structure
1695 |             %   returned by h5info.
1696 | 
1697 |             % Form search key by combining info.Name with subfolder
1698 |             search_key = GCSAL.H5.fullpath(info.Name, child_name);
1699 | 
1700 |             % Get all children names in info
1701 |             all_children_names = {info.Groups.Name};
1702 | 
1703 |             % Find matches
1704 |             i_children_match = GCSAL.GCSAL.find_keys( all_children_names, search_key);
1705 | 
1706 |             % Index in info.Groups
1707 |             info_matches = info.Groups(i_children_match);
1708 |         end
1709 | 
1710 |         function p = plot_stations(stations_to_plot, varargin)
1711 |             % p = plot_stations(stations_to_plot, varargin)
1712 |             %  Plots the lat/long coordinates of stations_to_plot. Any
1713 |             %  additional inputs to the plot function can be included in
1714 |             %  varargin. Returns a handle to the line object for the plot
1715 |             %  call.
1716 |             %
1717 |             %  stations_to_plot can either by a struct array with lat/long
1718 |             %  as fields or a list of station id strings
1719 | 
1720 |             p = plot([stations_to_plot.lon], [stations_to_plot.lat], varargin{:});
1721 | 
1722 |         end
1723 | 
1724 |         function edges = default_bin_edges(param_name)
1725 |             % edges = default_bin_edges(param_name)
1726 |             % Returns default values for bin edges given a parameter name
1727 | 
1728 |             switch param_name
1729 |                 case 'gph'
1730 |                     edges = 0:1:30;
1731 |                 case 'press'
1732 |                     edges = 0:1000:100000;
1733 |                 case 'temp'
1734 |                     edges = -100:1:40;
1735 |                 case 'rh'
1736 |                     edges = 0:1:100;
1737 |                 case 'dpdp'
1738 |                     edges = 0:1:100;
1739 |                 case 'wspd'
1740 |                     edges = 0:2:60;
1741 |                 case 'wdir'
1742 |                     edges = 0:15:360;
1743 |                 case 'month'
1744 |                     edges = 1:12;
1745 |                 case 'day'
1746 |                     edges = 1:31;
1747 |                 case 'hour'
1748 |                     edges = 0:24;
1749 |                 case 'lat'
1750 |                     edges = -90:2:90;
1751 |                 case 'lon'
1752 |                     edges = -180:4:180;
1753 |                 otherwise
1754 |                     error('unrecognized fld: %s', param_name)
1755 |             end
1756 | 
1757 |         end
1758 | 
1759 |         function label = get_label(def)
1760 |             % label = get_label(def)
1761 |             %   Builds an axis label from the definition struct def: the
1762 |             %   description, followed by the units in parentheses when
1763 |             %   def.units is not empty.
1764 |             label = def.description;
1765 |             if isempty(def.units), return; end
1766 |             % Units are present: append them as ' (units)'
1767 |             label = horzcat(label, ' (', def.units, ')');
1768 |         end
1769 | 
1770 |         function title_str = description_from_filters(fltr_flds, fltr_rngs)
1771 |             % title_str = description_from_filters(fltr_flds, fltr_rngs)
1772 |             %   Builds a comma separated summary of the filters in
1773 |             %   fltr_flds and fltr_rngs that can be used in a plot title.
1774 |             %   Either input may be passed bare instead of in a cell array.
1775 |             %   A range with one element is written "fld = v", one with two
1776 |             %   elements "fld = [lo to hi]"; any other length is an error.
1777 | 
1778 |             % Normalize both inputs to cell arrays
1779 |             fltr_flds = cellstr(fltr_flds);
1780 |             if ~iscell(fltr_rngs)
1781 |                 fltr_rngs = {fltr_rngs};
1782 |             end
1783 | 
1784 |             % Describe each filter separately, then glue the pieces together
1785 |             n_fltrs = length(fltr_flds);
1786 |             pieces = cell(1, n_fltrs);
1787 |             for k = 1:n_fltrs
1788 |                 rng_k = fltr_rngs{k};
1789 |                 switch length(rng_k)
1790 |                     case 1
1791 |                         pieces{k} = sprintf('%s = %g, ', fltr_flds{k}, rng_k);
1792 |                     case 2
1793 |                         pieces{k} = sprintf('%s = [%g to %g], ', ...
1794 |                             fltr_flds{k}, rng_k(1), rng_k(2));
1795 |                     otherwise
1796 |                         error('fltr_ranges length expected to be 1 or 2')
1797 |                 end
1798 |             end
1799 | 
1800 |             % Drop the ', ' left dangling after the last piece
1801 |             title_str = '';
1802 |             if n_fltrs > 0
1803 |                 joined = [pieces{:}];
1804 |                 title_str = joined(1:end-2);
1805 |             end
1806 |         end
1807 | 
1808 |         function stations_out = stations_intersect(stations1, stations2)
1809 |             % stations_out = stations_intersect(stations1, stations2)
1810 |             %   Returns the elements of stations1 whose id field also
1811 |             %   occurs in stations2. ids are compared as whole rows and the
1812 |             %   result follows the index vector returned by intersect.
1813 | 
1814 |             id_fld = 'id';
1815 |             [~, keep] = GCSAL.GCSAL.struct_set_operation(...
1816 |                 stations1, stations2, id_fld, @intersect);
1817 |             stations_out = stations1(keep);
1818 |         end
1819 | 
1820 |         function stations_out = stations_union(stations1, stations2)
1821 |             % stations_out = stations_union(stations1, stations2)
1822 |             %   Union by id field: the elements union selects from
1823 |             %   stations1 come first, then those it takes from stations2.
1824 |             [~, from1, from2] = GCSAL.GCSAL.struct_set_operation(...
1825 |                 stations1, stations2, 'id', @union);
1826 |             picked1 = stations1(from1);
1827 |             picked2 = stations2(from2);
1828 |             stations_out = [picked1 picked2];
1829 |         end
1830 | 
1831 |         function stations_out = stations_setxor(stations1, stations2)
1832 |             % stations_out = stations_setxor(stations1, stations2)
1833 |             %   Setxor by id field: the elements setxor selects from
1834 |             %   stations1 come first, then those it takes from stations2.
1835 |             [~, from1, from2] = GCSAL.GCSAL.struct_set_operation(...
1836 |                 stations1, stations2, 'id', @setxor);
1837 |             only_in_1 = stations1(from1);
1838 |             only_in_2 = stations2(from2);
1839 |             stations_out = [only_in_1 only_in_2];
1840 |         end
1841 | 
1842 |     end
1843 | 
1844 | 
1845 |     methods (Static, Access = 'private')
1846 | 
1847 |         function [c, ia, ib] = struct_set_operation(struct1, struct2, fld, operation)
1848 |             % [c, ia, ib] = struct_set_operation(struct1, struct2, fld, operation)
1849 |             %   Stacks the fld values of struct arrays struct1 and struct2
1850 |             %   into one row per element and applies operation (a function
1851 |             %   handle such as @intersect, @union or @setxor) with the
1852 |             %   'rows' option. The outputs of operation are passed back.
1853 |             rows1 = vertcat(struct1.(fld));
1854 |             rows2 = vertcat(struct2.(fld));
1855 |             [c, ia, ib] = operation(rows1, rows2, 'rows');
1856 |         end
1857 | 
1858 |         function stations = initialize_stations(headers)
1859 |             % stations = initialize_stations(headers)
1860 |             %   Builds a struct array with fields id, lat and lon that has
1861 |             %   one element per element of headers.
1862 |             %
1863 |             %   A scalar lat/lon pair in a header is copied as is. When a
1864 |             %   header holds several lat/lon values their mode is used,
1865 |             %   provided the diagonal of the box they span is below 0.2.
1866 |             %   Otherwise lat and lon stay NaN and a message is printed
1867 |             %   unless the station id begins with 'ZZ'.
1868 | 
1869 |             % Largest spread for which a single location is still reported
1870 |             max_spread = .2;
1871 | 
1872 |             % Every station starts with NaN lat/lon
1873 |             stations = struct('id', {headers.id}, 'lat', NaN, 'lon', NaN);
1874 | 
1875 |             for k = 1:length(headers)
1876 |                 hdr = headers(k);
1877 | 
1878 |                 % A single reported position is used directly
1879 |                 if isscalar(hdr.lat) && isscalar(hdr.lon)
1880 |                     stations(k).lat = hdr.lat;
1881 |                     stations(k).lon = hdr.lon;
1882 |                     continue
1883 |                 end
1884 | 
1885 |                 % Diagonal of the box spanned by all reported positions
1886 |                 dlat = abs(max(hdr.lat) - min(hdr.lat));
1887 |                 dlon = abs(max(hdr.lon) - min(hdr.lon));
1888 |                 d = sqrt(dlat^2 + dlon^2);
1889 | 
1890 |                 if d < max_spread
1891 |                     stations(k).lat = mode(hdr.lat);
1892 |                     stations(k).lon = mode(hdr.lon);
1893 |                 elseif ~strcmp(hdr.id([1 2]), 'ZZ')
1894 |                     % Only ZZ stations are expected to move this much
1895 |                     fprintf(['Location for station %s was not used ' ...
1896 |                         'because it moved around by ~%g deg\n'], hdr.id, d);
1897 |                 end
1898 |             end
1899 |         end
1900 | 
1901 |         function idx = get_entry_idx_in_range(header, range, range_fld)
1902 |             % idx = get_entry_idx_in_range(header, range, range_fld)
1903 |             %   Returns the indices into the entry data of all entries that
1904 |             %   belong to headers whose range_fld value lies within range.
1905 |             %
1906 |             %   header    - struct of header data; header.numlevs(k) is the
1907 |             %               number of entries owned by the k-th header
1908 |             %   range     - [min max] limits, both inclusive
1909 |             %   range_fld - name of the header field compared against range
1910 |             %
1911 |             %   idx is a column vector. Entries are laid out back to back in
1912 |             %   header order (the layout header_to_entry_idx uses), so the
1913 |             %   k-th header owns sum(numlevs(1:k-1)) + (1:numlevs(k)).
1914 | 
1915 |             % Get header values at range_fld
1916 |             val = header.(range_fld);
1917 | 
1918 |             % Find index in header where value is in range
1919 |             idx_header = val >= range(1) & val <= range(2);
1920 | 
1921 |             % Extract array of entry lengths for header values in range
1922 |             numlevs_in_range = header.numlevs(idx_header);
1923 | 
1924 |             % The offset of a header is the number of entries stored before
1925 |             % it: the cumulative sum of numlevs without the header's own
1926 |             % count. The first header must get offset 0, otherwise its
1927 |             % indices run from 2 to numlevs(1)+1 and miss its first entry.
1928 |             idx_offset_all = cumsum(header.numlevs) - header.numlevs;
1929 | 
1930 |             % Keep only the offsets of the in range headers
1931 |             idx_offset = idx_offset_all(idx_header);
1932 | 
1933 |             % Initialize idx to be size of total from numlevs
1934 |             idx = zeros(sum(numlevs_in_range), 1);
1935 | 
1936 |             % initialize start index to 1
1937 |             start = 1;
1938 |             for i = 1:length(numlevs_in_range)
1939 |                 % finish index is start + current # of entries - 1
1940 |                 finish = start + numlevs_in_range(i) - 1;
1941 |                 % Local indices 1..n of this header shifted by its offset
1942 |                 idx(start:finish) = (1:numlevs_in_range(i)) + idx_offset(i);
1943 |                 % For next loop start where we left off
1944 |                 start = finish+1;
1945 |             end
1946 |         end
1947 | 
1948 |         function entry_idx = header_to_entry_idx(header)
1949 |             % entry_idx = header_to_entry_idx(header)
1950 |             %   Maps every element of the entry data to the header element
1951 |             %   it belongs to.
1952 |             %
1953 |             %   header.numlevs(k) is the number of consecutive entries owned
1954 |             %   by the k-th header. The result is a column vector with
1955 |             %   sum(header.numlevs) elements in which k is repeated
1956 |             %   header.numlevs(k) times, so indexing a header field with
1957 |             %   entry_idx expands it to the length of the entry data.
1958 | 
1959 |             % Number of entries per header
1960 |             counts = header.numlevs;
1961 | 
1962 |             % One slot per entry, filled in below
1963 |             entry_idx = zeros(sum(counts), 1);
1964 | 
1965 |             % Number of slots of entry_idx written so far
1966 |             n_done = 0;
1967 | 
1968 |             for k = 1:length(counts)
1969 | 
1970 |                 % Header k owns the next counts(k) slots
1971 |                 n_next = n_done + counts(k);
1972 |                 entry_idx(n_done+1:n_next) = k;
1973 | 
1974 |                 % Continue after the slots just written
1975 |                 n_done = n_next;
1976 | 
1977 |             end
1978 |         end
1979 | 
1980 |         function station_ids = station_id_str(stations)
1981 |             % station_ids = station_id_str(stations)
1982 |             %   Converts stations into a character array with one station
1983 |             %   id per row. A struct array has its id fields stacked
1984 |             %   vertically, a character array is returned unchanged, and a
1985 |             %   missing or empty input gives ''. Other types raise an error.
1986 | 
1987 |             % Nothing to convert
1988 |             if nargin < 1 || isempty(stations)
1989 |                 station_ids = '';
1990 |                 return
1991 |             end
1992 | 
1993 |             if ischar(stations)
1994 |                 % Already in the requested form
1995 |                 station_ids = stations;
1996 |             elseif isstruct(stations)
1997 |                 % Stack the id of every element, one per row
1998 |                 station_ids = vertcat(stations.id);
1999 |             else
2000 |                 error('stations expeted to be struct or char')
2001 |             end
2002 |         end
2003 | 
2004 | 
2005 |     end
2006 | end
2007 | 


--------------------------------------------------------------------------------