├── .gitignore
├── LICENSE
├── README.md
└── glove2h5.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Matti Lyra

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GloVe2H5

A small utility for converting Stanford GloVe vectors to HDF5 / NumPy. The pretrained
Stanford vectors are distributed as zipped text files with one line per vector. This is
not the most convenient way of interacting with the vectors, so this utility converts the
zip files into NumPy arrays stored in HDF5 files (using `h5py`), with a separate SQLite
dictionary for the vocabulary.
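Concretely, converting `glove.6B.zip` produces a sibling directory named after the
source file (layout as created by the code in `glove2h5.py`):

```
glove.6B.glove2h5/
├── vectors.h5     # one HDF5 dataset per collection, e.g. 'glove.6B.50d'
└── vocab.sqlite   # sqlitedict mapping token -> row index in each dataset
```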
The GloVe code (in `C`) is available on GitHub at https://github.com/stanfordnlp/GloVe and you
can download the pretrained Stanford GloVe vectors from https://nlp.stanford.edu/projects/glove/.

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
https://nlp.stanford.edu/pubs/glove.pdf

# Install & Usage

### CLI

Extract the 50-dimensional word vectors using LZF compression for HDF5:

```
$ git clone https://github.com/mattilyra/glove2h5.git
$ cd glove2h5
$ python -m glove2h5 ~/Downloads/glove.6B.zip --collection glove.6B.50d.txt --compression lzf
```

---

Assuming you've downloaded the GloVe vectors from https://nlp.stanford.edu/projects/glove/
into `./GloVe`, the call below converts all GloVe vectors in `glove.6B.zip` to NumPy and
stores them in an HDF5 file. It creates a vocabulary stored in a `sqlitedict.SqliteDict`
and extracts the GloVe vectors into a `vectors.h5` file. The results are stored alongside
the source file under `$SOURCEFILE.glove2h5`, where `$SOURCEFILE` is the name of the
original source file, `glove.6B.zip`, without the file extension. The separate vocabulary
allows indexing the vectors in HDF5 by token.

```python
glove2h5 = GloVe2H5.create_from('./GloVe/glove.6B.zip', compression='lzf')

# get the 50 dimensional vector for 'sample'
glove2h5['glove.6B.50d/sample']

# get the 100 dimensional vector for 'sample'
glove2h5['glove.6B.100d/sample']
```

## Extract only certain dimensional vectors

The `glove.6B.zip` file contains vectors in 50, 100, 200 and 300 dimensions. Each
of these is stored in a separate file in the zip archive.

```
- glove.6B.zip
-- glove.6B.50d.txt   # 50 dimensional vectors
-- glove.6B.100d.txt  # 100 dimensional vectors
-- glove.6B.200d.txt  # 200 dimensional vectors
-- glove.6B.300d.txt  # 300 dimensional vectors
```

Extracting all of them into `HDF5` is unnecessary (and obviously slow) if you only need
some of them. You can pass the `collections` keyword argument to `create_from` to extract
only certain files contained in the zip archive, as shown below.
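If you're not sure which member names the archive contains, a quick sketch using the
standard library `zipfile` module lists them (the path is assumed):

```python
import zipfile

# list the collection files inside the Stanford archive
with zipfile.ZipFile('./GloVe/glove.6B.zip') as zf:
    print(zf.namelist())
# e.g. ['glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt']
```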
```python
# extract only the 100 dimensional vectors
glove2h5_100d = GloVe2H5.create_from('./GloVe/glove.6B.zip', collections=['glove.6B.100d.txt'], compression='lzf')

# the collection is set automatically to 'glove.6B.100d'
glove2h5_100d['sample']
```

# Load already extracted vectors

You can load a previously extracted set of vectors by just calling the constructor with
the path to the `.glove2h5` directory:

```python
glove2h5_100d = GloVe2H5('./GloVe/glove.6B.glove2h5', collection='glove.6B.100d')

# the collection was set to 'glove.6B.100d' so we don't need it for __getitem__ anymore
glove2h5_100d['sample']
```

# Requirements

- Python 3.6+
- `numpy`
- `h5py`
- `sqlitedict`

--------------------------------------------------------------------------------
/glove2h5.py:
--------------------------------------------------------------------------------
from pathlib import Path
import zipfile
import argparse

import numpy as np
import h5py
import sqlitedict


class GloVe2H5:
    def __init__(self, path, collection=''):
        # `path` is the .glove2h5 output directory created by `create_from`
        self.path = Path(path)
        self.collection = collection

    @staticmethod
    def _extract_vocab_from_stanford_zip(zipfh, zipinfo, vocab):
        # build a token -> row index mapping from the first whitespace
        # separated column of each line in the GloVe text file
        with zipfh.open(zipinfo) as inputfile:
            num_entries = 0
            for row in inputfile:
                token = row.decode('utf-8').strip().split()[0]
                vocab[token] = num_entries
                num_entries += 1
        vocab.commit()
        return num_entries

    @staticmethod
    def _extract_vectors_from_stanford_zip(zipfh, zipinfo, vocab, h5_dataset):
        # parse each line into a token and its vector, writing the vector to
        # the row reserved for that token in the vocabulary
        with zipfh.open(zipinfo) as inputfile:
            for row in inputfile:
                parts = row.decode('utf-8').strip().split()
                vec = np.asarray(parts[1:], dtype=np.float64)
                h5_dataset[vocab[parts[0]]] = vec

    @staticmethod
    def create_from(datafile, collections=None, compression='lzf'):
        """Initialise the HDF5 container and vocabulary from the original Stanford ZIP files.

        Parameters
        ----------
        datafile : str or pathlib.Path
            The original Stanford GloVe zip file to extract.

        collections : list, optional (default=None)
            A list of strings defining which collections in the zipfile
            should be extracted. By default all collections are extracted,
            which can be slow.

        compression : str
            Compression to be used for the HDF5 collections. See the h5py
            docs for valid values.
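            Common choices are 'gzip' (better compression, slower) and 'lzf'
            (fast, moderate compression); the value is passed straight
            through to the HDF5 dataset creation call.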
            http://docs.h5py.org/en/latest/high/dataset.html#lossless-compression-filters
        """
        output_file = Path(datafile).expanduser().with_suffix('.glove2h5')
        try:
            output_file.mkdir()
        except FileExistsError:
            pass

        h5_path = output_file / 'vectors.h5'
        vocab_path = output_file / 'vocab.sqlite'

        with zipfile.ZipFile(datafile, 'r') as zipfh, \
                h5py.File(str(h5_path), 'w') as h5fh:
            zipfiles = zipfh.infolist()
            if collections is not None:
                zipfiles = [zf for zf in zipfiles if zf.filename in collections]
                if not zipfiles:
                    raise RuntimeError(f'Collections {collections} not found in zipfile {datafile}.')

            # the collections in a Stanford zip share a single vocabulary, so
            # it is built once from the first file only
            vocab = sqlitedict.SqliteDict(str(vocab_path), autocommit=False, flag='w')
            try:
                num_entries = GloVe2H5._extract_vocab_from_stanford_zip(zipfh, zipfiles[0], vocab)
            finally:
                vocab.close()

            vocab = sqlitedict.SqliteDict(str(vocab_path), autocommit=False, flag='r')
            try:
                for zipinfo in zipfiles:
                    # peek at the first row to determine the vector dimensionality
                    with zipfh.open(zipinfo) as inputfile:
                        parts = inputfile.readline().decode('utf-8').strip().split()
                    D = len(parts) - 1

                    dataset_name = Path(zipinfo.filename).stem
                    h5_dataset = h5fh.create_dataset(dataset_name, (num_entries, D),
                                                     dtype=np.float64, compression=compression)
                    GloVe2H5._extract_vectors_from_stanford_zip(zipfh, zipinfo, vocab, h5_dataset)
            finally:
                vocab.close()

        return GloVe2H5(output_file, collection=Path(zipfiles[0].filename).stem)

    def __contains__(self, entry):
        vocab = sqlitedict.SqliteDict(str(self.path / 'vocab.sqlite'), autocommit=False, flag='r')
        try:
            # only the last path component is the token; a leading component,
            # if present, names the HDF5 dataset (collection)
            return Path(entry).name in vocab
        finally:
            vocab.close()

    def __getitem__(self, entry):
        entry_ = Path(entry)
        if entry_.name not in self:
            raise KeyError(f'Entry {entry} not found in vocabulary.')
        # resolve the dataset (collection) either from the key itself, e.g.
        # 'glove.6B.50d/sample', or from the default set in the constructor
        parent = self.collection if entry_.parent == Path('.') else str(entry_.parent)
        if not parent:
            raise RuntimeError("HDF5 dataset name not defined; either set a default "
                               "'collection=' in the constructor or use a key of the "
                               "form 'collection/entry'.")
        vocab = sqlitedict.SqliteDict(str(self.path / 'vocab.sqlite'), autocommit=False, flag='r')
        try:
            token_idx = vocab[entry_.name]
        finally:
            vocab.close()
        with h5py.File(str(self.path / 'vectors.h5'), mode='r') as h5:
            return h5[parent][token_idx]
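
# Illustrative usage (a sketch; assumes glove.6B.zip has been downloaded
# into ./GloVe):
#
#     g = GloVe2H5.create_from('./GloVe/glove.6B.zip', compression='lzf')
#     vec = g['glove.6B.50d/sample']   # look up a vector by 'collection/token'
#
#     # reopen a previous conversion directly
#     g = GloVe2H5('./GloVe/glove.6B.glove2h5', collection='glove.6B.50d')
#     vec = g['sample']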

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='GloVe2H5 - convert Stanford GloVe vectors from .zip to HDF5.')
    parser.add_argument('input_file', type=str, help='Path to source .zip file to convert / extract.')
    parser.add_argument('--collection', nargs='+', default=None,
                        help='(optional) Extract only specific collections from the .zip file.')
    parser.add_argument('--compression', type=str, choices=['gzip', 'lzf', 'szip'], default='lzf',
                        help='(optional) Compression to use for HDF5 datasets.')
    args = parser.parse_args()

    infile = Path(args.input_file)
    if infile.exists() and infile.is_file():
        GloVe2H5.create_from(infile, collections=args.collection, compression=args.compression)
    else:
        raise RuntimeError(f'Source file {args.input_file} not found.')

--------------------------------------------------------------------------------