├── docs ├── authors.rst ├── history.rst ├── readme.rst ├── contributing.rst ├── usage.rst ├── installation.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── tests ├── __init__.py └── test_langchangetrack.py ├── langchangetrack ├── langchangetrack.py ├── examples │ ├── pos │ │ └── pos_tag_dist_example.sh │ ├── freq │ │ └── freq_count_example.sh │ ├── data │ │ ├── temporal_corpus │ │ │ └── create_temporal_corpora.sh │ │ ├── test_pval.csv │ │ ├── test_sample.csv │ │ └── normalized_timeseries_sample.csv │ └── distributional │ │ └── findNearest.py ├── images │ └── gay_invisible.png ├── __init__.py ├── utils │ ├── __init__.py │ ├── scripts │ │ ├── calculate_freq_counts.sh │ │ ├── calculate_pos_dist.sh │ │ ├── train_models.sh │ │ ├── freq_count.py │ │ ├── common_vocab.py │ │ └── pos_tag.py │ ├── dummy_regressor.py │ ├── LocalLinearRegression.py │ └── entropy.py ├── corpusreaders │ ├── __init__.py │ └── plainngramscorpus.py ├── tsconstruction │ ├── __init__.py │ ├── distributional │ │ ├── __init__.py │ │ ├── corpustoembeddings.py │ │ └── scripts │ │ │ ├── train_embeddings_ngrams.py │ │ │ ├── embedding_displacements.py │ │ │ └── learn_map.py │ ├── freq │ │ └── scripts │ │ │ └── create_freq_timeseries.py │ ├── syntactic │ │ └── scripts │ │ │ └── pos_displacements.py │ ├── dump_timeseries.py │ └── displacements.py ├── scripts │ ├── detect_cp_freq.sh │ ├── detect_cp_pos.sh │ ├── detect_cp_distributional.sh │ ├── freq_pipeline.py │ ├── pos_pipeline.py │ └── ngrams_pipeline.py └── cpdetection │ ├── detect_changepoints_word_ts_r.py │ ├── demostrate_cp.py │ └── detect_changepoints_word_ts.py ├── setup.cfg ├── HISTORY.rst ├── AUTHORS.rst ├── tox.ini ├── MANIFEST.in ├── requirements.txt ├── .travis.yml ├── .gitignore ├── LICENSE ├── Makefile ├── setup.py ├── CONTRIBUTING.rst └── README.rst /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /langchangetrack/langchangetrack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | description-file = README.rst 4 | -------------------------------------------------------------------------------- /langchangetrack/examples/pos/pos_tag_dist_example.sh: -------------------------------------------------------------------------------- 1 | pos_tag.py -f ./gutenberg.txt -o gutenberg.posdist 2 | -------------------------------------------------------------------------------- /langchangetrack/examples/freq/freq_count_example.sh: -------------------------------------------------------------------------------- 1 | freq_count.py -f ../data/sample_corpora/gutenberg.txt > gutenberg.freq 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use langchangetrack in a project:: 6 | 7 | import langchangetrack 8 | -------------------------------------------------------------------------------- /langchangetrack/images/gay_invisible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/viveksck/langchangetrack/HEAD/langchangetrack/images/gay_invisible.png -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 0.1.0 (2015-02-20) 7 | --------------------- 8 | 9 | * First release on PyPI. 
10 | -------------------------------------------------------------------------------- /langchangetrack/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/corpusreaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Vivek Kulkarni 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 
14 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/temporal_corpus/create_temporal_corpora.sh: -------------------------------------------------------------------------------- 1 | SAMPLESIZE=100000 2 | ls /scratch2/vvkulkarni/new_semantic/ngrams_expanded/eng-fiction/19*[0,5].ngrams | parallel -j16 --progress shuf -n $SAMPLESIZE {} -o {/} 3 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/test_pval.csv: -------------------------------------------------------------------------------- 1 | ,word,min_pval,cp,tpval,tcp 2 | 3,gay,0.0,1975,0.0,1980 3 | 2,bitch,0.0,1950,0.0001,1955 4 | 1,sex,0.0,1955,0.0007,1965 5 | 4,recording,0.0284,1990,0.0284,1990 6 | 0,tree,0.7833,1910,1.0, 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/langchangetrack 7 | commands = python setup.py test 8 | deps = 9 | -r{toxinidir}/requirements.txt 10 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | At the command line:: 6 | 7 | $ easy_install langchangetrack 8 | 9 | Or, if you have virtualenvwrapper installed:: 10 | 11 | $ mkvirtualenv langchangetrack 12 | $ pip install langchangetrack 13 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/calculate_freq_counts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=$3 5 | WORKERS=$4 6 | mkdir -p $WORKING_DIR 7 | mkdir -p $WORKING_DIR/counts/ 8 | ls $CORPUS_DIR/*.$EXT | parallel -j${WORKERS} "freq_count.py -f {} > $WORKING_DIR/counts/{/.}.freq" 9 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/calculate_pos_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=$3 5 | WORKERS=$4 6 | mkdir -p $WORKING_DIR 7 | mkdir -p $WORKING_DIR/posdist/ 8 | ls $CORPUS_DIR/*.$EXT | parallel -j${WORKERS} "pos_tag.py -f {} -o $WORKING_DIR/posdist/{/.}.posdist" 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel==0.23.0 2 | argparse>=1.2.1 3 | numpy>=0.9.1 4 | scipy>=0.15.1 5 | more_itertools>=2.2 6 | joblib>=0.8.3-r1 7 | gensim==0.10.3 8 | six>=1.7.0 9 | statsmodels>=0.5.0 10 | changepoint>=0.1.0 11 | nltk>=3.0.0 12 | textblob>=0.9.0 13 | textblob-aptagger>=0.2.0 14 | psutil>=2.1.1 15 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "2.7" 7 | - "2.6" 8 | - "pypy" 9 | 10 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 11 | install: pip install -r requirements.txt 12 | 13 | # command to run tests, e.g. python setup.py test 14 | script: python setup.py test 15 | -------------------------------------------------------------------------------- /langchangetrack/corpusreaders/plainngramscorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import gensim 7 | 8 | 9 | class PlainNGRAMSCorpus(object): 10 | 11 | """Iterate over sentences(ngram) of plain ngram file""" 12 | 13 | def __init__(self, filename): 14 | self.filename = filename 15 | 16 | def __iter__(self): 17 | text = open(self.filename) 18 | for sentence in text: 19 | yield gensim.utils.simple_preprocess(sentence, deacc=True) 20 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/train_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=${3} 5 | WINDOW=${4} 6 | EPOCHS=${5} 7 | WORKERS=${6} 8 | EMBEDDINGS_TYPE=skipgram 9 | arr=("$CORPUS_DIR/*.$EXT") 10 | echo "Processing files", $arr 11 | echo "Training embeddings" 12 | mkdir -p $WORKING_DIR/models 13 | echo "Models will be stored in", $WORKING_DIR/models 14 | parallel -vv -j ${WORKERS} --progress train_embeddings_ngrams.py -f {} -o $WORKING_DIR/models -p {/.} -e $EMBEDDINGS_TYPE -workers ${WORKERS} --epochs ${EPOCHS} -w ${WINDOW} ::: $arr 15 | -------------------------------------------------------------------------------- /tests/test_langchangetrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_langchangetrack 6 | ---------------------------------- 7 | 8 | Tests for `langchangetrack` module. 
9 | """ 10 | 11 | import unittest 12 | 13 | from langchangetrack import langchangetrack 14 | 15 | 16 | class TestLangchangetrack(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build 44 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/test_sample.csv: -------------------------------------------------------------------------------- 1 | ,word,1905,1910,1915,1920,1925,1930,1935,1940,1945,1950,1955,1960,1965,1970,1975,1980,1985,1990,1995,2000 2 | 0,tree,8089,7833,9362,9431,9929,9890,9763,9700,9989,9994,10000,10000,9999,9997,9994,9998,9951,9951,9848,9966 3 | 1,sex,4518,1986,2271,2110,661,269,95,52,9,6,0,0,7,19,11,3,10,63,471,1382 4 | 2,bitch,1272,3324,2695,484,37,23,6,1,3,0,1,7,2,33,33,576,582,2166,3117,4494 5 | 3,gay,975,2312,968,1479,895,535,460,559,517,206,139,7,10,1,0,0,3,10,127,1835 6 | 4,recording,9998,9076,7201,7725,4498,3293,1808,2149,2661,2259,2433,478,670,618,586,549,559,284,376,1375 7 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. langchangetrack documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to langchangetrack's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | usage 17 | contributing 18 | authors 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_freq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | FILTER_VOCAB_FILE=$7 9 | BOOTSTRAP=${8} 10 | THRESHOLD=${9} 11 | WORKERS=${10} 12 | 13 | mkdir -p $WORKING_DIR 14 | mkdir -p $OUTPUT_DIR 15 | mkdir -p $WORKING_DIR/timeseries 16 | 17 | create_freq_timeseries.py -d $INPUT_DIR -s $STARTTIMEPOINT -e $ENDTIMEPOINT -p $STEP -f $WORKING_DIR/timeseries/freq_timeseries.csv --log10 18 | 19 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/freq_timeseries.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -d -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 20 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | FILTER_VOCAB_FILE=$7 9 | BOOTSTRAP=${8} 10 | THRESHOLD=${9} 11 | WORKERS=${10} 12 | 13 | mkdir -p $WORKING_DIR 14 | mkdir -p $OUTPUT_DIR 15 | 16 | mkdir -p $WORKING_DIR/displacements/ 17 | pos_displacements.py -f $FILTER_VOCAB_FILE -d $INPUT_DIR/ -p "" -os pos -es ".posdist" -ps "" -sy $STARTTIMEPOINT -ey $ENDTIMEPOINT -s $STEP -e "pos" -o $WORKING_DIR/displacements -workers ${WORKERS} 18 | 19 | mkdir -p $WORKING_DIR/timeseries/ 20 | dump_timeseries.py -f $WORKING_DIR/displacements/timeseries_s_t_pos.pkl -s $WORKING_DIR/timeseries/source.csv -e $WORKING_DIR/timeseries/dest.csv -m $STARTTIMEPOINT -n $ENDTIMEPOINT -st $STEP -me "polar" -metric "jsd" -workers ${WORKERS} 21 | 22 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/source.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Vivek Kulkarni 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | * Neither the name of langchangetrack nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /langchangetrack/examples/distributional/findNearest.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import sys 4 | import math 5 | import operator 6 | 7 | import numpy as np 8 | from numpy import linalg as LA 9 | import gensim 10 | 11 | __author__ = "Vivek Kulkarni" 12 | __email__ = "viveksck@gmail.com" 13 | 14 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 15 | 16 | def main(args): 17 | return process(args.filename) 18 | 19 | def process(filename): 20 | m = gensim.models.Word2Vec.load_word2vec_format(filename) 21 | print "query (ctrl-c to quit): ", 22 | line = sys.stdin.readline() 23 | while line: 24 | word = line.rstrip() 25 | print word 26 | tuples = m.most_similar(word, topn=10) 27 | for w, s in tuples: 28 | print w, s 29 | print "----------------------------------" 30 | print "query (ctrl-c to quit): ", 31 | line = sys.stdin.readline() 32 | 33 | if __name__ == "__main__": 34 | parser = ArgumentParser() 35 | parser.add_argument("--embeddings-file", dest="filename", help="embeddings file") 36 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 37 | default="INFO") 38 | args = parser.parse_args() 39 | if args.log == 'DEBUG': 40 | sys.excepthook = debug 41 | numeric_level = getattr(logging, args.log.upper(), None) 42 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 43 | main(args) 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "coverage - check code coverage quickly with the default Python" 12 | @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | 16 | clean: clean-build clean-pyc clean-test 17 | 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr *.egg-info 22 | 23 | clean-pyc: 24 | find . -name '*.pyc' -exec rm -f {} + 25 | find . -name '*.pyo' -exec rm -f {} + 26 | find . -name '*~' -exec rm -f {} + 27 | find . 
-name '__pycache__' -exec rm -fr {} + 28 | 29 | clean-test: 30 | rm -fr .tox/ 31 | rm -f .coverage 32 | rm -fr htmlcov/ 33 | 34 | lint: 35 | flake8 langchangetrack tests 36 | 37 | test: 38 | python setup.py test 39 | 40 | test-all: 41 | tox 42 | 43 | coverage: 44 | coverage run --source langchangetrack setup.py test 45 | coverage report -m 46 | coverage html 47 | open htmlcov/index.html 48 | 49 | docs: 50 | rm -f docs/langchangetrack.rst 51 | rm -f docs/modules.rst 52 | sphinx-apidoc -o docs/ langchangetrack 53 | $(MAKE) -C docs clean 54 | $(MAKE) -C docs html 55 | open docs/_build/html/index.html 56 | 57 | release: clean 58 | python setup.py sdist upload 59 | python setup.py bdist_wheel upload 60 | 61 | dist: clean 62 | python setup.py sdist 63 | python setup.py bdist_wheel 64 | ls -l dist 65 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_distributional.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | MODEL_FAMILY=$7 9 | KNN=$8 10 | FILTER_VOCAB_FILE=${9} 11 | BOOTSTRAP=${10} 12 | THRESHOLD=${11} 13 | WORKERS=${12} 14 | 15 | EMBEDDINGS_TYPE=skipgram 16 | echo "Output directory is", $OUTPUT_DIR 17 | 18 | mkdir -p $WORKING_DIR 19 | mkdir -p $OUTPUT_DIR 20 | 21 | echo "Mapping to joint space" 22 | mkdir -p $WORKING_DIR/predictors 23 | echo "Predictors will be stored in", $WORKING_DIR/predictors 24 | arr=("$INPUT_DIR/*.model") 25 | ((FINALTIMEPOINT=$ENDTIMEPOINT-$STEP)) 26 | parallel -j${WORKERS} learn_map.py -k ${KNN} -f $WORKING_DIR/predictors/{/.}.predictor -o {} -n {//}/${FINALTIMEPOINT}_*.model -m $MODEL_FAMILY ::: $arr 27 | 28 | WORDS_FILE=${FILTER_VOCAB_FILE} 29 | 30 | echo "Computing displacements" 31 | mkdir -p $WORKING_DIR/displacements/ 32 | export MKL_NUM_THREADS=1 33 | export NUMEXPR_NUM_THREADS=1 34 | export OMP_NUM_THREADS=1 35 | export MKL_DYNAMIC=FALSE 36 | embedding_displacements.py -f $WORDS_FILE -d $INPUT_DIR/ -p $WORKING_DIR/predictors/ -os words -es ".model" -ps ".predictor" -sy $STARTTIMEPOINT -ey $ENDTIMEPOINT -s $STEP -e $EMBEDDINGS_TYPE -o $WORKING_DIR/displacements/ -workers ${WORKERS} 37 | 38 | echo "Creating time series" 39 | mkdir -p $WORKING_DIR/timeseries/ 40 | dump_timeseries.py -f $WORKING_DIR/displacements/timeseries_s_t_words.pkl -s $WORKING_DIR/timeseries/source.csv -e $WORKING_DIR/timeseries/dest.csv -m $STARTTIMEPOINT -n $ENDTIMEPOINT -st $STEP -me "polar" -metric "cosine" -workers ${WORKERS} 41 | 42 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/source.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 43 | -------------------------------------------------------------------------------- /langchangetrack/utils/dummy_regressor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """dummy_regressor.py: Regressor that is the identity""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import pickle 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | class 
DummyRegressor(object): 22 | 23 | def predict(self, X): 24 | return X 25 | 26 | 27 | def main(args): 28 | d = DummyRegressor() 29 | pickle.dump(d, open('dummy_regressor.pkl', 'wb')) 30 | 31 | 32 | def debug(type_, value, tb): 33 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 34 | # we are in interactive mode or we don't have a tty-like 35 | # device, so we call the default hook 36 | sys.__excepthook__(type_, value, tb) 37 | else: 38 | import traceback 39 | import pdb 40 | # we are NOT in interactive mode, print the exception... 41 | traceback.print_exception(type_, value, tb) 42 | print("\n") 43 | # ...then start the debugger in post-mortem mode. 44 | pdb.pm() 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 49 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 50 | default="INFO") 51 | args = parser.parse_args() 52 | if args.log == 'DEBUG': 53 | sys.excepthook = debug 54 | numeric_level = getattr(logging, args.log.upper(), None) 55 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 56 | main(args) 57 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/freq_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """freq_count.py: Dumps the frequency distribution of a corpus in desc order""" 5 | 6 | 7 | from argparse import ArgumentParser 8 | import logging 9 | import sys 10 | from io import open 11 | from os import path 12 | from time import time 13 | from glob import glob 14 | import nltk 15 | 16 | __author__ = "Vivek Kulkarni" 17 | __email__ = "viveksck@gmail.com" 18 | 19 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 20 | 21 | 22 | def main(args): 23 | encoding = sys.stdout.encoding or 'utf-8' 24 | f = open(args.filename) 25 | fd = nltk.FreqDist() 26 | for line in f: 27 | for sent in nltk.sent_tokenize(line): 28 | for word in nltk.word_tokenize(sent): 29 | fd[word] += 1 30 | 31 | for w, count in fd.most_common(): 32 | tup = u"{} {}".format(w, count) 33 | print tup.encode(encoding) 34 | 35 | 36 | def debug(type_, value, tb): 37 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 38 | # we are in interactive mode or we don't have a tty-like 39 | # device, so we call the default hook 40 | sys.__excepthook__(type_, value, tb) 41 | else: 42 | import traceback 43 | import pdb 44 | # we are NOT in interactive mode, print the exception... 45 | traceback.print_exception(type_, value, tb) 46 | print("\n") 47 | # ...then start the debugger in post-mortem mode. 
48 | pdb.pm() 49 | 50 | if __name__ == "__main__": 51 | parser = ArgumentParser() 52 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 53 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 54 | default="INFO") 55 | args = parser.parse_args() 56 | if args.log == 'DEBUG': 57 | sys.excepthook = debug 58 | numeric_level = getattr(logging, args.log.upper(), None) 59 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 60 | main(args) 61 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/normalized_timeseries_sample.csv: -------------------------------------------------------------------------------- 1 | Unnamed: 0,word,1900,1905,1910,1915,1920,1925,1930,1935,1940,1945,1950,1955,1960,1965,1970,1975,1980,1985,1990,1995,2000,2005 2 | 830,tree,0.15327714247103288,-0.19885247327831865,-0.3683410408490255,0.09558896968329192,-0.3490389607298094,0.2565163581969198,-0.527302927985289,-0.7428741306743359,-0.574162051879182,0.34184472338303645,-0.3714711433188301,-0.1406226862557359,-0.5567142692737548,-0.8473020213018153,-0.7812379046311363,-0.7237730159955219,-0.6133051538066998,-1.0328756603907936,-0.7434065484530583,-0.9341274372494442,-0.8209303655079251,-1.5439039179382728 3 | 1232,sex,0.15327714247103288,0.8422974209765391,0.4670831714915557,1.2004514277740952,1.089117575545353,-0.042869473512371865,0.4905971357936949,0.4420351237685775,0.6754477335596171,0.3616949047487767,0.7148357324968245,0.4483215569154581,0.6638101589107545,2.8213000472467233,1.6867586375636152,1.2839351704861663,1.026826051998202,1.9734392345129588,2.3191684380330164,2.5913956386522274,2.266890620494351,2.5228906797208204 4 | 1280,bitch,0.15327714247103288,1.142182904418651,2.629857299421962,1.5050173023356772,-0.19664099967373186,-0.1816506923943963,1.3944691232705158,1.3129137681032181,1.698648850193928,2.3741872415908647,1.5339619507724442,2.4752087951820902,2.6833522170360724,1.9358190746841688,3.133418509779865,2.4097395640333708,4.423179526228856,2.375519385842317,3.8408415271690113,2.6895338913639515,2.6321814385912887,2.407180122647251 5 | 2008,gay,0.15327714247103288,0.4420047305142134,1.6090753807905729,0.6967969687421449,1.6586549543356486,0.9196378440898209,0.9731171389763618,1.28614418926982,1.5990551088842633,1.4001333223409744,0.7467762811647296,1.1979101086635646,0.3335700353267224,1.5371247773359915,0.7701374463207338,0.7797479007969157,2.4813433922784727,1.946577671429413,2.7217396855527105,3.0896267594069604,3.5749294003565684,2.5136765443816858 6 | 3281,recording,0.15327714247103288,4.177000925947883,1.9405949335453256,1.5249572014401584,2.698653478348632,0.9964745894159096,1.7323864289598985,1.4656472143368606,2.4480439305566315,2.6008059328492257,2.040880904439721,2.3918680858825256,0.5625011152828968,2.6666885636321247,2.3098860150844365,2.3988788624164106,2.389346840226949,2.4943894017849337,2.2083815531437043,2.73865670378146,3.610134796433214,3.0174046962457997 7 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/common_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """common_vocab.py: Dumps the common vocabulary between a set of text files.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob 
import glob 13 | import nltk 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | encoding = sys.stdout.encoding or 'utf-8' 23 | common_vocab = None 24 | list_of_files = glob(args.filepattern) 25 | for fname in list_of_files: 26 | file_vocab = set() 27 | f = open(fname) 28 | for line in f: 29 | for sent in nltk.sent_tokenize(line): 30 | for word in nltk.word_tokenize(sent): 31 | file_vocab.add(word) 32 | if common_vocab == None: 33 | common_vocab = file_vocab 34 | else: 35 | common_vocab = common_vocab & file_vocab 36 | f.close() 37 | 38 | for w in common_vocab: 39 | print w.encode(encoding) 40 | 41 | 42 | def debug(type_, value, tb): 43 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 44 | # we are in interactive mode or we don't have a tty-like 45 | # device, so we call the default hook 46 | sys.__excepthook__(type_, value, tb) 47 | else: 48 | import traceback 49 | import pdb 50 | # we are NOT in interactive mode, print the exception... 51 | traceback.print_exception(type_, value, tb) 52 | print("\n") 53 | # ...then start the debugger in post-mortem mode. 54 | pdb.pm() 55 | 56 | if __name__ == "__main__": 57 | parser = ArgumentParser() 58 | parser.add_argument("-f", "--filepattern", 59 | dest="filepattern", help="Input file pattern") 60 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 61 | default="INFO") 62 | args = parser.parse_args() 63 | if args.log == 'DEBUG': 64 | sys.excepthook = debug 65 | numeric_level = getattr(logging, args.log.upper(), None) 66 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 67 | main(args) 68 | -------------------------------------------------------------------------------- /langchangetrack/scripts/freq_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """template.py: Description of what the module does.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "calculate_freq_counts.sh {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_freq.sh {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'counts') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, 28 | args.end, args.step, args.vocab_file, args.bootstrap, args.threshold, args.workers) 29 | subprocess.check_call(cmd, shell=True) 30 | 31 | if __name__ == "__main__": 32 | parser = ArgumentParser() 33 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 34 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 35 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 36 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 37 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 38 | parser.add_argument("--end-time-point", dest="end", help="End time 
point") 39 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 40 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 41 | parser.add_argument("--threshold", dest="threshold", default=0.0, type=float, help="Threshold for mean shift model for change point detection") 42 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 43 | parser.add_argument("--workers", dest="workers", default=1, type=int, help="Maximum number of workers") 44 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 45 | default="INFO") 46 | args = parser.parse_args() 47 | if args.log == 'DEBUG': 48 | sys.excepthook = debug 49 | numeric_level = getattr(logging, args.log.upper(), None) 50 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 51 | main(args) 52 | -------------------------------------------------------------------------------- /langchangetrack/scripts/pos_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Pipeline to detect language change using part of speech.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "calculate_pos_dist.sh {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_pos.sh {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'posdist') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, 28 | args.end, args.step, args.vocab_file, args.bootstrap, args.threshold, args.workers) 29 | subprocess.check_call(cmd, shell=True) 30 | 31 | if __name__ == "__main__": 32 | parser = ArgumentParser() 33 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 34 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 35 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 36 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 37 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 38 | parser.add_argument("--end-time-point", dest="end", help="End time point") 39 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 40 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 41 | parser.add_argument("--threshold", dest="threshold", default=1.75, type=float, help="Threshold for mean shift model for change point detection") 42 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 43 | parser.add_argument("--workers", dest="workers", default=1, type=int, help="Maximum number of workers") 44 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 45 | default="INFO") 46 | args = parser.parse_args() 47 | if args.log == 'DEBUG': 48 | sys.excepthook = debug 49 | numeric_level = getattr(logging, args.log.upper(), 
None) 50 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 51 | main(args) 52 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/pos_tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import sys 7 | from io import open 8 | from os import path 9 | from time import time 10 | from glob import glob 11 | 12 | from textblob import Blobber 13 | from textblob_aptagger import PerceptronTagger 14 | 15 | from collections import Counter, defaultdict 16 | import numpy as np 17 | import pandas as pd 18 | 19 | __author__ = "Vivek Kulkarni" 20 | __email__ = "viveksck@gmail.com" 21 | 22 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 23 | 24 | 25 | def main(args): 26 | f = open(args.filename) 27 | D = {} 28 | tag_set = set([]) 29 | tb = Blobber(pos_tagger=PerceptronTagger()) 30 | for i, line in enumerate(f): 31 | b1 = tb(line) 32 | for w, t in b1.tags: 33 | tag_set.add(t) 34 | if w not in D: 35 | D[w] = Counter() 36 | D[w][t] = float(D[w][t] + 1) 37 | 38 | sorted_pos_tags = sorted(list(tag_set)) 39 | rows = [] 40 | for w in D.keys(): 41 | row = [w] 42 | pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags]) 43 | pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word)) 44 | assert(np.isclose(np.sum(pos_dist_word), 1.0)) 45 | row = row + list(pos_dist_word) 46 | rows.append(row) 47 | 48 | header = ['word'] + sorted_pos_tags 49 | print "Set of POS tags in sorted order", header 50 | df = pd.DataFrame().from_records(rows, columns=header) 51 | print "Dumping the POS distribution." 52 | df.to_csv(args.outputfile, index=None, encoding='utf-8') 53 | 54 | 55 | def debug(type_, value, tb): 56 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 57 | # we are in interactive mode or we don't have a tty-like 58 | # device, so we call the default hook 59 | sys.__excepthook__(type_, value, tb) 60 | else: 61 | import traceback 62 | import pdb 63 | # we are NOT in interactive mode, print the exception... 64 | traceback.print_exception(type_, value, tb) 65 | print("\n") 66 | # ...then start the debugger in post-mortem mode. 
67 | pdb.pm() 68 | 69 | if __name__ == "__main__": 70 | parser = ArgumentParser() 71 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 72 | parser.add_argument("-o", "--outputfile", dest="outputfile", help="Output file") 73 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 74 | default="INFO") 75 | args = parser.parse_args() 76 | if args.log == 'DEBUG': 77 | sys.excepthook = debug 78 | numeric_level = getattr(logging, args.log.upper(), None) 79 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 80 | main(args) 81 | -------------------------------------------------------------------------------- /langchangetrack/scripts/ngrams_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Pipeline to train a corpus of ngrams""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "train_models.sh {} {} {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.window, args.epochs, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_distributional.sh {} {} {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'models') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, args.end, args.step, args.model_family, args.knn, args.vocab_file, args.bootstrap, args.threshold, args.workers) 28 | subprocess.check_call(cmd, shell=True) 29 | 30 | if __name__ == "__main__": 31 | parser = ArgumentParser() 32 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 33 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 34 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 35 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 36 | parser.add_argument("--context-size", dest="window", default=5, type=int, help="Context size to use for training embeddings") 37 | parser.add_argument("--epochs", dest="epochs", default=3, type=int, help="Number of epochs to training embeddings") 38 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 39 | parser.add_argument("--end-time-point", dest="end", help="End time point") 40 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 41 | parser.add_argument("--model-family", dest="model_family", default="locallinear", help="Model family default (locallinear)") 42 | parser.add_argument("--number-nearest-neighbors", dest="knn", default=1000, 43 | type=int, help="Number of nearest neighbors to use for mapping to joint space (default:1000)") 44 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 45 | parser.add_argument("--threshold", dest="threshold", default=1.75, type=float, help="Threshold for mean shift model for change point detection") 46 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 47 | parser.add_argument("--workers", dest="workers", default=1, type=int, 
help="Maximum number of workers") 48 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 49 | default="INFO") 50 | args = parser.parse_args() 51 | if args.log == 'DEBUG': 52 | sys.excepthook = debug 53 | numeric_level = getattr(logging, args.log.upper(), None) 54 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 55 | main(args) 56 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/freq/scripts/create_freq_timeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import sys 7 | from io import open 8 | from os import path 9 | from time import time 10 | from glob import glob 11 | import numpy as np 12 | import pandas as pd 13 | 14 | __author__ = "Vivek Kulkarni" 15 | __email__ = "viveksck@gmail.com" 16 | 17 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 18 | 19 | 20 | def main(args): 21 | # Read the input arguments. 22 | inputdir = args.inputdir 23 | start = args.start 24 | end = args.end 25 | step = args.step 26 | timepoints = np.arange(start, end, step) 27 | timepoints = [str(timepoint) for timepoint in timepoints] 28 | num = int(args.num) 29 | freq = args.freq 30 | 31 | # Normalize the frequencies. 32 | normdf = None 33 | dfs = (pd.read_table(path.join(inputdir, timepoint + '.freq'), sep=' ', 34 | quotechar=' ', names=['word', timepoint]) for timepoint in (timepoints)) 35 | for i, df in enumerate(dfs): 36 | df[str(timepoints[i])] = df[str(timepoints[i])] / df[str(timepoints[i])].sum() 37 | if normdf is None: 38 | normdf = df[:num] 39 | continue 40 | df = df[:num] 41 | normdf = pd.merge(normdf, df, on='word', how='outer') 42 | 43 | # Convert them to log scale becoz that is what matters ! 44 | if args.log10: 45 | for timepoint in timepoints: 46 | normdf[timepoint] = np.log10(normdf[timepoint]) 47 | 48 | normdf.to_csv(freq, encoding='utf-8') 49 | 50 | 51 | def debug(type_, value, tb): 52 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 53 | # we are in interactive mode or we don't have a tty-like 54 | # device, so we call the default hook 55 | sys.__excepthook__(type_, value, tb) 56 | else: 57 | import traceback 58 | import pdb 59 | # we are NOT in interactive mode, print the exception... 60 | traceback.print_exception(type_, value, tb) 61 | print("\n") 62 | # ...then start the debugger in post-mortem mode. 
63 | pdb.pm() 64 | 65 | if __name__ == "__main__": 66 | parser = ArgumentParser() 67 | parser.add_argument("-d", "--inputdir", dest="inputdir", help="Input file") 68 | parser.add_argument("-s", "--start", dest="start", help="start time", type=int) 69 | parser.add_argument("-e", "--end", dest="end", help="end time(not included)", type=int) 70 | parser.add_argument("-p", "--step", dest="step", help="step", type=int) 71 | parser.add_argument("-num", "--num", dest="num", help="Number of words topN", type=int, default=30000) 72 | parser.add_argument("-f", "--freq", dest="freq", help="Output freq dist file") 73 | parser.add_argument("-log", "--log10", dest="log10", action="store_true", default=False, help="freq") 74 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 75 | default="INFO") 76 | args = parser.parse_args() 77 | if args.log == 'DEBUG': 78 | sys.excepthook = debug 79 | numeric_level = getattr(logging, args.log.upper(), None) 80 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 81 | main(args) 82 | -------------------------------------------------------------------------------- /langchangetrack/utils/LocalLinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import statsmodels.api as sm 3 | from sklearn.neighbors import NearestNeighbors 4 | from sklearn.base import BaseEstimator 5 | from statsmodels.sandbox.regression.predstd import wls_prediction_std 6 | 7 | import pickle 8 | 9 | # Code to pickle a VW model 10 | import copy_reg 11 | from types import FunctionType, FileType, MethodType 12 | 13 | 14 | def stub_pickler(obj): 15 | return stub_unpickler, () 16 | 17 | 18 | def stub_unpickler(): 19 | return "STUB" 20 | 21 | copy_reg.pickle(MethodType, stub_pickler, stub_unpickler) 22 | copy_reg.pickle(FileType, stub_pickler, stub_unpickler) 23 | copy_reg.pickle(FunctionType, stub_pickler, stub_unpickler) 24 | 25 | 26 | ''' 27 | Given a list of numbers, produce a list of weights using the specified kernel 28 | ''' 29 | 30 | 31 | class KernelFunctions: 32 | 33 | @staticmethod 34 | def uniform(distances): 35 | return numpy.ones(len(distances)) 36 | 37 | @staticmethod 38 | def gauss(distances): 39 | dist_norm = distances / distances[len(distances) - 1] 40 | weights = [math.exp(-dist * dist) for dist in dist_norm] 41 | return weights 42 | 43 | @staticmethod 44 | def linear(distances): 45 | dist_norm = distances / distances[len(distances) - 1] 46 | weights = [1.0001 - dist for dist in dist_norm] 47 | return weights 48 | 49 | @staticmethod 50 | def epanechnikov(distances): 51 | dist_norm = distances / distances[len(distances) - 1] 52 | weights = [(3. / 4.) 
* (1.0001 - dist * dist) for dist in dist_norm] 53 | return weights 54 | 55 | @staticmethod 56 | def tricube(distances): 57 | dist_norm = distances / distances[len(distances) - 1] 58 | weights = [pow((1.0001 - pow(dist, 3)), 3) for dist in dist_norm] 59 | return weights 60 | 61 | 62 | class LocalLinearRegression(BaseEstimator): 63 | 64 | def __init__(self, k_nn, weight_func=KernelFunctions.uniform): 65 | self.k_nn = k_nn 66 | self.weight_func = weight_func 67 | print self.k_nn, self.weight_func 68 | 69 | ''' 70 | X: A list of points to transform 71 | Y: The corresponding target points 72 | ''' 73 | 74 | def fit(self, X, Y): 75 | if len(X) != len(Y): 76 | raise ValueError("len(X) != len(Y)") 77 | if len(X) < self.k_nn: 78 | raise ValueError("Not enough points for local linear regression for the specified number of neighbors (" + 79 | str(len(X)) + " < " + str(self.k_nn) + ")") 80 | self.X = numpy.array(X) 81 | self.Y = numpy.array(Y) 82 | self.nn = NearestNeighbors(n_neighbors=self.k_nn, algorithm='ball_tree', p=2) 83 | self.nn.fit(self.X) 84 | print "Fit the model" 85 | 86 | ''' 87 | X: The point to transform based on its neighbors 88 | ''' 89 | 90 | def predict(self, X): 91 | neighbors = self.nn.kneighbors(X) 92 | distances = neighbors[0][0] 93 | neighbor_indices = neighbors[1][0] 94 | local_X = self.X.take(neighbor_indices, axis=0) 95 | local_Y = self.Y.take(neighbor_indices, axis=0) 96 | wls = sm.WLS(local_Y, local_X, weights=self.weight_func(distances)).fit() 97 | return wls.predict(X) 98 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/corpustoembeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | 7 | import gensim 8 | 9 | import logging 10 | logger = logging.getLogger("langchangetrack") 11 | 12 | 13 | class CorpusToEmbeddings(object): 14 | 15 | """ Class that encapsulates functionality for obtaining embeddings from a corpus.""" 16 | 17 | def __init__(self, corpus_iter, embeddings_type, lang='en', 18 | model_config={}, save_model_file=None): 19 | """ Initialize the object with the corpus iterator and 20 | the type of embeddings. 21 | 22 | The corpus iterator should just support iterating over 23 | sentences. It can be a list or a generator which yields 24 | sentences. The embeddings type can be one of the supported 25 | embedding types: 'skipgram' 26 | 27 | The model_config is an optional named tuple containing specific 28 | configurations parameters to be passed when training the model. 29 | """ 30 | 31 | assert(corpus_iter) 32 | assert(embeddings_type in CorpusToEmbeddings.supported_embedding_types()) 33 | 34 | self.corpus_iter = corpus_iter 35 | self.lang = lang 36 | self.embeddings_type = embeddings_type 37 | self.model_config = model_config 38 | 39 | self.embeddings_builder_map = { 40 | 'skipgram': self.buildword2vec 41 | } 42 | self.model = None 43 | self.save_model_file = save_model_file 44 | return 45 | 46 | @staticmethod 47 | def supported_embedding_types(): 48 | """ Embedding types we support. """ 49 | return ['skipgram'] 50 | 51 | def buildword2vec(self): 52 | """ Trains a word2vec model on the corpus. 
""" 53 | 54 | cfg_size = self.model_config.get('size', 200) 55 | cfg_window = self.model_config.get('window', 5) 56 | cfg_min_count = self.model_config.get('min_count', 10) 57 | cfg_workers = self.model_config.get('workers', 16) 58 | cfg_alpha = self.model_config.get('alpha', 0.01) 59 | logger.info('window size:{}, alpha:{}, embedding size:{}, min_count:{}, workers:{}'.format(cfg_window, cfg_alpha, cfg_size, cfg_min_count, cfg_workers)) 60 | self.model = gensim.models.Word2Vec(self.corpus_iter, 61 | size=cfg_size, 62 | window=cfg_window, 63 | min_count=cfg_min_count, 64 | alpha=cfg_alpha, 65 | workers=cfg_workers, 66 | sample=1e-5, 67 | negative=0) 68 | 69 | if self.save_model_file: 70 | self.model.save_word2vec_format(self.save_model_file) 71 | 72 | def build(self): 73 | """ Trains a model on the corpus to obtain embeddings.""" 74 | sys.stdout.write("Building a model from the corpus.\n") 75 | sys.stdout.flush() 76 | self.embeddings_builder_map[self.embeddings_type]() 77 | sys.stdout.write("Model built.\n") 78 | sys.stdout.flush() 79 | 80 | def save_model(self, model_file): 81 | """ Saves the model file. """ 82 | self.model.save_word2vec_format(model_file) 83 | return 84 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/train_embeddings_ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | from argparse import ArgumentParser 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | import itertools 13 | 14 | from langchangetrack.corpusreaders.plainngramscorpus import PlainNGRAMSCorpus 15 | from langchangetrack.tsconstruction.distributional.corpustoembeddings import CorpusToEmbeddings 16 | 17 | import logging 18 | logger = logging.getLogger("langchangetrack") 19 | 20 | __author__ = "Vivek Kulkarni" 21 | __email__ = "viveksck@gmail.com" 22 | 23 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 24 | 25 | import psutil 26 | from multiprocessing import cpu_count 27 | 28 | p = psutil.Process(os.getpid()) 29 | p.set_cpu_affinity(list(range(cpu_count()))) 30 | 31 | 32 | class RepeatCorpusNTimes(object): 33 | 34 | def __init__(self, corpus, n): 35 | """ 36 | Repeat a `corpus` `n` times. 37 | >>> corpus = [[(1, 0.5)], []] 38 | >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times 39 | [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []] 40 | """ 41 | self.corpus = corpus 42 | self.n = n 43 | 44 | def __iter__(self): 45 | return itertools.chain.from_iterable(itertools.repeat(tuple(self.corpus), self.n)) 46 | 47 | 48 | def run(filename, output_dir, file_prefix, window_size, embedding_type, embedding_size, workers, num_epochs): 49 | corpus_reader = RepeatCorpusNTimes(PlainNGRAMSCorpus(args.filename), num_epochs) 50 | model_config = {} 51 | model_config['window'] = window_size 52 | model_config['size'] = embedding_size 53 | model_config['workers'] = workers 54 | model_file = path.join(output_dir, '_'. 
join([file_prefix, 'embeddings.model'])) 55 | c = CorpusToEmbeddings(corpus_reader, embedding_type, model_config=model_config, save_model_file=model_file) 56 | c.build() 57 | 58 | 59 | def main(args): 60 | filename = args.filename 61 | output_dir = args.output_dir 62 | file_prefix = args.file_prefix 63 | window_size = int(args.window_size) 64 | embedding_type = args.embedding_type 65 | embedding_size = args.embedding_size 66 | workers = args.workers 67 | num_epochs = args.epochs 68 | run(filename, output_dir, file_prefix, window_size, embedding_type, embedding_size, workers, num_epochs) 69 | 70 | if __name__ == "__main__": 71 | parser = ArgumentParser() 72 | parser.add_argument("-f", "--file", dest="filename", help="Input file for ngrams") 73 | parser.add_argument("-o", "--output_dir", dest="output_dir", help="Output directory") 74 | parser.add_argument("-p", "--file-prefix", dest="file_prefix", default='exp', help="File prefix") 75 | parser.add_argument("-w", "--window_size", dest="window_size", default=5, help="Window size for word2vec") 76 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default='skipgram', help="Embedding type") 77 | parser.add_argument("-s", "--embedding_size", dest="embedding_size", default=200, type=int, help="Window size for word2vec") 78 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 79 | parser.add_argument("-epochs", "--epochs", dest="epochs", default=1, help="Number of epochs", type=int) 80 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 81 | args = parser.parse_args() 82 | main(args) 83 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup 6 | except ImportError: 7 | from distutils.core import setup 8 | 9 | 10 | readme = open('README.rst').read() 11 | history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') 12 | 13 | requirements = [ 14 | 'wheel==0.23.0', 15 | 'argparse>=1.2.1', 16 | 'numpy>=0.9.1', 17 | 'scipy>=0.15.1', 18 | 'more_itertools>=2.2', 19 | 'joblib>=0.8.3-r1', 20 | 'gensim==0.10.3', 21 | 'statsmodels>=0.5.0', 22 | 'changepoint>=0.1.1', 23 | 'nltk>=3.0.0', 24 | 'textblob>=0.9.0', 25 | 'textblob-aptagger>=0.2.0', 26 | 'psutil>=2.1.1', 27 | ] 28 | 29 | test_requirements = [ 30 | # TODO: put package test requirements here 31 | ] 32 | 33 | setup( 34 | name='langchangetrack', 35 | version='0.1.0', 36 | description='Package for statistically significant language change.', 37 | long_description=readme + '\n\n' + history, 38 | author='Vivek Kulkarni', 39 | author_email='viveksck@gmail.com', 40 | url='https://github.com/viveksck/langchangetrack', 41 | packages=[ 42 | 'langchangetrack', 43 | 'langchangetrack.utils', 44 | 'langchangetrack.corpusreaders', 45 | 'langchangetrack.tsconstruction', 46 | 'langchangetrack.tsconstruction.distributional' 47 | ], 48 | package_dir={'langchangetrack': 49 | 'langchangetrack'}, 50 | include_package_data=True, 51 | install_requires=requirements, 52 | license="BSD", 53 | zip_safe=False, 54 | keywords='langchangetrack', 55 | classifiers=[ 56 | 'Development Status :: 2 - Pre-Alpha', 57 | 'Intended Audience :: Developers', 58 | 'License :: OSI Approved :: BSD License', 59 | 'Natural Language :: English', 60 | "Programming Language :: Python :: 2", 61 | 'Programming Language :: Python :: 2.6', 62 | 'Programming Language :: Python :: 2.7', 63 | 'Programming Language :: Python :: 3', 64 | 'Programming Language :: Python :: 3.3', 65 | 'Programming Language :: Python :: 3.4', 66 | ], 67 | scripts=[ 68 | 'langchangetrack/tsconstruction/freq/scripts/create_freq_timeseries.py', 69 | 'langchangetrack/tsconstruction/syntactic/scripts/pos_displacements.py', 70 | 'langchangetrack/tsconstruction/distributional/scripts/train_embeddings_ngrams.py', 71 | 'langchangetrack/tsconstruction/distributional/scripts/learn_map.py', 72 | 'langchangetrack/tsconstruction/distributional/scripts/embedding_displacements.py', 73 | 'langchangetrack/tsconstruction/dump_timeseries.py', 74 | 'langchangetrack/cpdetection/detect_changepoints_word_ts.py', 75 | 'langchangetrack/cpdetection/detect_changepoints_word_ts_r.py', 76 | 'langchangetrack/scripts/detect_cp_freq.sh', 77 | 'langchangetrack/scripts/detect_cp_pos.sh', 78 | 'langchangetrack/scripts/detect_cp_distributional.sh', 79 | 'langchangetrack/scripts/ngrams_pipeline.py', 80 | 'langchangetrack/scripts/pos_pipeline.py', 81 | 'langchangetrack/scripts/freq_pipeline.py', 82 | 'langchangetrack/utils/scripts/freq_count.py', 83 | 'langchangetrack/utils/scripts/common_vocab.py', 84 | 'langchangetrack/utils/scripts/pos_tag.py', 85 | 'langchangetrack/utils/scripts/calculate_pos_dist.sh', 86 | 'langchangetrack/utils/scripts/calculate_freq_counts.sh', 87 | 'langchangetrack/utils/scripts/train_models.sh', 88 | ], 89 | test_suite='tests', 90 | tests_require=test_requirements 91 | ) 92 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 
7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/viveksck/langchangetrack/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | langchangetrack could always use more documentation, whether as part of the 40 | official langchangetrack docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/viveksck/langchangetrack/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `langchangetrack` for local development. 59 | 60 | 1. Fork the `langchangetrack` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/langchangetrack.git 64 | 65 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 66 | 67 | $ mkvirtualenv langchangetrack 68 | $ cd langchangetrack/ 69 | $ python setup.py develop 70 | 71 | 4. Create a branch for local development:: 72 | 73 | $ git checkout -b name-of-your-bugfix-or-feature 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 78 | 79 | $ flake8 langchangetrack tests 80 | $ python setup.py test 81 | $ tox 82 | 83 | To get flake8 and tox, just pip install them into your virtualenv. 84 | 85 | 6. Commit your changes and push your branch to GitHub:: 86 | 87 | $ git add . 88 | $ git commit -m "Your detailed description of your changes." 89 | $ git push origin name-of-your-bugfix-or-feature 90 | 91 | 7. Submit a pull request through the GitHub website. 92 | 93 | Pull Request Guidelines 94 | ----------------------- 95 | 96 | Before you submit a pull request, check that it meets these guidelines: 97 | 98 | 1. The pull request should include tests. 99 | 2. If the pull request adds functionality, the docs should be updated. Put 100 | your new functionality into a function with a docstring, and add the 101 | feature to the list in README.rst. 102 | 3. The pull request should work for Python 2.6, 2.7, 3.3, and 3.4, and for PyPy. Check 103 | https://travis-ci.org/viveksck/langchangetrack/pull_requests 104 | and make sure that the tests pass for all supported Python versions. 
105 | 106 | Tips 107 | ---- 108 | 109 | To run a subset of tests:: 110 | 111 | $ python -m unittest tests.test_langchangetrack 112 | -------------------------------------------------------------------------------- /langchangetrack/utils/entropy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | :mod:`pynchon.bio.alg.entropy` 4 | =========================== 5 | 6 | Algorithms on entropies. 7 | """ 8 | from itertools import izip 9 | import numpy as np 10 | import scipy as sp 11 | import math 12 | 13 | 14 | def get_base(unit='bit'): 15 | if unit == 'bit': 16 | log = sp.log2 17 | elif unit == 'nat': 18 | log = sp.log 19 | elif unit in ('digit', 'dit'): 20 | log = sp.log10 21 | else: 22 | raise ValueError('The "unit" "%s" not understood' % unit) 23 | return log 24 | 25 | 26 | def shannon_entropy(freq, unit='bit'): 27 | """Calculates the Shannon Entropy (H) of a frequency. 28 | 29 | Arguments: 30 | 31 | - freq (``numpy.ndarray``) A ``Freq`` instance or ``numpy.ndarray`` with 32 | frequency vectors along the last axis. 33 | - unit (``str``) The unit of the returned entropy one of 'bit', 'digit' 34 | or 'nat'. 35 | """ 36 | log = get_base(unit) 37 | shape = freq.shape # keep shape to return in right shape 38 | Hs = np.ndarray(freq.size / shape[-1]) # place to keep entropies 39 | # this returns an array of vectors or just a vector of frequencies 40 | freq = freq.reshape((-1, shape[-1])) 41 | # this makes sure we have an array of vectors of frequencies 42 | freq = np.atleast_2d(freq) 43 | # get fancy indexing 44 | positives = freq != 0. 45 | for i, (freq, idx) in enumerate(izip(freq, positives)): 46 | freq = freq[idx] # keep only non-zero 47 | logs = [math.log(f, 2) for f in freq] # logarithms of non-zero frequencies 48 | Hs[i] = -np.sum(freq * logs) 49 | Hs.reshape(shape[:-1]) 50 | return Hs 51 | 52 | 53 | def relative_entropy(freq, background, unit='bit'): 54 | """ 55 | Calculates the Releative Entropy (D), which is the Kullback-Leibler 56 | divergence between two frequencies. The two arrays "freq" and "background" 57 | need to broadcast to a single shape. 58 | 59 | Arguments: 60 | 61 | - freq (``numpy.ndarray``) A ``Freq`` instance or ``numpy.ndarray`` with 62 | frequency vectors along the last axis. 63 | - background (``numpy.ndarray``) ``Freq`` instance or ``numpy.ndarray`` 64 | with frequency vectors along the last axis. This typically is a 65 | rank-1 array. 66 | 67 | Could be normalized?: Dkl = Dkl / log(len(background)) 68 | """ 69 | log = get_base(unit) 70 | shape = freq.shape 71 | freq = freq.reshape((-1, shape[-1])) 72 | freq = np.atleast_2d(freq) 73 | Dkls = np.ndarray(freq.size / shape[-1]) 74 | positives = (freq != 0.) & (background != 0.) 75 | for i, (freq, idx) in enumerate(izip(freq, positives)): 76 | freq = freq[idx] 77 | bg = background[idx] 78 | logs = log(freq / bg) 79 | Dkls[i] = np.sum(freq * logs) 80 | Dkls.reshape(shape[:-1]) 81 | return Dkls 82 | 83 | 84 | def mutual_information(jointfreq, rowfreq=None, colfreq=None, unit='bit'): 85 | """ 86 | Calculates the Mutual Information (I) of a joint frequency. The marginal 87 | frequencies can be given or are calculated from the joint frequency. 88 | 89 | Arguments: 90 | 91 | - jointfreq (``numpy.ndarray``) A normalized ``JointFreq`` instance or 92 | ``numpy.ndarray`` of rank-2, which is a joint probability distribution 93 | function of two random variables. 
94 | - rowfreq (``numpy.ndarray``) [default: ``None``] A normalized marginal 95 | probability distribution function for the variable along the axis =0. 96 | - colfreq (``numpy.ndarray``) [default: ``None``] A normalized marginal 97 | probability distribution function for the variable along the axis =1. 98 | - unit (``str``) [defualt: ``"bit"``] Unit of the returned information. 99 | """ 100 | log = get_base(unit) 101 | rowfreq = rowfreq or np.sum(jointfreq, axis=1) 102 | colfreq = colfreq or np.sum(jointfreq, axis=0) 103 | indfreq = np.dot(rowfreq[None].transpose(), colfreq[None]) 104 | non_zero = jointfreq != 0. 105 | jntf = jointfreq[non_zero] 106 | indf = indfreq[non_zero] 107 | return np.sum(jntf * log(jntf / indf)) 108 | 109 | 110 | def jensen_shannon_divergence(freq, weights=None, unit='bit'): 111 | """ 112 | Calculates the Jensen-Shannon Divergence (Djs) of two or more frequencies. 113 | The weights are for the relative contribution of each frequency vector. 114 | 115 | Arguments: 116 | 117 | - freq (``numpy.ndarray``) A ``Prof`` instance or a rank-2 array of 118 | frequencies along the last dimension. 119 | - weights (``numpy.ndarray``) An array with a weight for each 120 | frequency vector. Rank-1. 121 | - unit (``str``) see: the function ``shannon_entropy``. 122 | """ 123 | if weights is not None: 124 | if len(freq) != len(weights): 125 | raise ValueError('The number of frequencies and weights do not match.') 126 | if (freq.ndim != 2) or (len(freq) < 2): 127 | raise ValueError('At least two frequencies in a rank-2 array expected.') 128 | weighted_average = np.average(freq, axis=0, weights=weights) 129 | H_avg_freq = shannon_entropy(weighted_average, unit) 130 | H_freq = shannon_entropy(freq, unit) 131 | avg_H_freq = np.average(H_freq, weights=weights) 132 | JSD = H_avg_freq - avg_H_freq 133 | return JSD 134 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | langchangetrack 3 | =============================== 4 | 5 | .. image:: https://badge.fury.io/py/langchangetrack.png 6 | :target: http://badge.fury.io/py/langchangetrack 7 | 8 | .. image:: https://travis-ci.org/viveksck/langchangetrack.png?branch=master 9 | :target: https://travis-ci.org/viveksck/langchangetrack 10 | 11 | .. image:: https://pypip.in/d/langchangetrack/badge.png 12 | :target: https://pypi.python.org/pypi/langchangetrack 13 | 14 | .. image:: https://github.com/viveksck/langchangetrack/blob/master/langchangetrack/images/gay_invisible.png 15 | 16 | 17 | Package for Statistically Significant Language Change. 18 | 19 | * Free software: BSD license 20 | * Documentation: https://langchangetrack.readthedocs.org. 21 | 22 | Features 23 | -------- 24 | 25 | * This package provides tools to detect linguistic change in temporal corpora. 26 | 27 | * The meta algorithm works in 2 main steps 28 | 29 | #. **Time series construction**:Given a word, we construct a time series that tracks the displacement of a word through time. We track the displacement of a word using either Frequency, Part of Speech Distribution or Co-occurrences. 30 | 31 | #. **Change point detection**: We then use change point detection methods to detect if the time series contains a change point and if so what the change point is. 
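For intuition, the two steps above can be sketched on a synthetic series with plain numpy. The snippet below is only an illustration of the idea (a mean-shift statistic with a permutation p-value); it is not the package's own mean shift implementation, and all numbers are synthetic::

    import numpy as np

    rng = np.random.RandomState(0)

    # Step 1 (stand-in): a toy displacement time series for one word,
    # sampled every 5 years, with a behavioural shift around 1950.
    timepoints = np.arange(1900, 2000, 5)
    series = np.concatenate([rng.normal(0.1, 0.02, 10),   # before the change
                             rng.normal(0.4, 0.02, 10)])  # after the change

    # Normalize the series (zero mean, unit variance) before detection.
    z = (series - series.mean()) / series.std()

    # Step 2: mean-shift statistic at every candidate split point, plus a
    # permutation-based p-value for its significance.
    def mean_shift(ts):
        return np.array([ts[i:].mean() - ts[:i].mean() for i in range(1, len(ts))])

    observed = np.abs(mean_shift(z)).max()
    null = [np.abs(mean_shift(rng.permutation(z))).max() for _ in range(1000)]
    pval = np.mean([n >= observed for n in null])
    cp = timepoints[np.abs(mean_shift(z)).argmax() + 1]
    print("changepoint: %s, p-value: %.3f" % (cp, pval))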
32 | 33 | The details of the above steps are outlined in : http://arxiv.org/abs/1411.3315 34 | 35 | 36 | Visualization Demo 37 | ------------------- 38 | 39 | Please see this for a cool visualization of words moving through time: http://tinyurl.com/wordvis 40 | 41 | Usage 42 | ------ 43 | 44 | Input 45 | ------ 46 | 47 | We assume a temporal corpus of text files (appropriately tokenized) to be present in a directory. In addition we assume list of words in a single text file that one is interested in tracking. 48 | This could just be the set of words in the common vocabulary of the temporal corpus. 49 | 50 | Output 51 | ------ 52 | 53 | The output consists of the pvalues for each word indicating the significance of the changepoint detected. 54 | 55 | Sample Usage 56 | ------------ 57 | ``$ngrams_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --context-size 5 --epochs 3 --start-time-point 1900 --end-time-point 2000 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 58 | 59 | ``$pos_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --start-time-point 1900 --end-time-point 1930 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 60 | 61 | ``$freq_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --start-time-point 1900 --end-time-point 2000 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 62 | 63 | **You might need to tune the hyper parameters as per your specific need.** 64 | 65 | Detailed Usage 66 | --------------- 67 | **Usage: ngrams_pipeline.py** 68 | 69 | optional arguments: 70 | -h, --help show this help message and exit 71 | --corpus-dir CORPUS_DIR 72 | Corpus directory 73 | --file-extension EXT Corpus file extension 74 | --working-dir WORKING_DIR 75 | Working directory 76 | --output-dir OUTPUT_DIR 77 | Output directory 78 | --context-size WINDOW 79 | Context size to use for training embeddings 80 | --epochs EPOCHS Number of epochs to training embeddings 81 | --start-time-point START 82 | Start time point 83 | --end-time-point END End time point 84 | --step-size STEP Step size for timepoints 85 | --model-family MODEL_FAMILY 86 | Model family default (locallinear) 87 | --number-nearest-neighbors KNN 88 | Number of nearest neighbors to use for mapping to 89 | joint space (default:1000) 90 | --vocabulary-file VOCAB_FILE 91 | Common vocabulary file 92 | --threshold THRESHOLD 93 | Threshold for mean shift model for change point 94 | detection (default: 1.75) 95 | --bootstrap-samples BOOTSTRAP 96 | Number of bootstrap samples to draw (default: 1000) 97 | --workers WORKERS Maximum number of workers (default: 1) 98 | -l LOG, --log LOG log verbosity level 99 | 100 | 101 | **Usage: pos_pipeline.py** 102 | 103 | optional arguments: 104 | -h, --help show this help message and exit 105 | --corpus-dir CORPUS_DIR 106 | Corpus directory 107 | --file-extension EXT Corpus file extension 108 | --working-dir WORKING_DIR 109 | Working directory 110 | --output-dir OUTPUT_DIR 111 | Output directory 112 | --start-time-point START 113 | Start time point 114 | --end-time-point END End time point 115 | --step-size STEP Step size for timepoints 116 | --vocabulary-file VOCAB_FILE 117 | Common vocabulary file 118 | --threshold THRESHOLD 119 | Threshold for mean shift model for change point 120 | detection 
121 | --bootstrap-samples BOOTSTRAP 122 | Number of bootstrap samples to draw 123 | --workers WORKERS Maximum number of workers 124 | -l LOG, --log LOG log verbosity level 125 | 126 | 127 | **usage: freq_pipeline.py** 128 | 129 | optional arguments: 130 | -h, --help show this help message and exit 131 | --corpus-dir CORPUS_DIR 132 | Corpus directory 133 | --file-extension EXT Corpus file extension 134 | --working-dir WORKING_DIR 135 | Working directory 136 | --output-dir OUTPUT_DIR 137 | Output directory 138 | --start-time-point START 139 | Start time point 140 | --end-time-point END End time point 141 | --step-size STEP Step size for timepoints 142 | --vocabulary-file VOCAB_FILE 143 | Common vocabulary file 144 | --threshold THRESHOLD 145 | Threshold for mean shift model for change point 146 | detection 147 | --bootstrap-samples BOOTSTRAP 148 | Number of bootstrap samples to draw 149 | --workers WORKERS Maximum number of workers 150 | -l LOG, --log LOG log verbosity level 151 | 152 | 153 | 154 | Requirements 155 | ------------ 156 | * wheel==0.23.0 157 | * argparse>=1.2.1 158 | * numpy>=0.9.1 159 | * scipy>=0.15.1 160 | * more_itertools>=2.2 161 | * joblib>=0.8.3-r1 162 | * gensim==0.10.3 163 | * statsmodels>=0.5.0 164 | * changepoint>=0.1.0 165 | * nltk>=3.0.0 166 | * textblob>=0.9.0 167 | * textblob-aptagger>=0.2.0 168 | * psutil>=2.2.0 169 | * GNU Parallel 170 | * R (good to have) 171 | * rpy2 (good to have) 172 | 173 | 174 | 175 | Installation 176 | ------------ 177 | #. Install GNU Parallel from here: www.gnu.org/software/software.html 178 | #. cd langchangetrack 179 | #. pip install -r requirements.txt 180 | #. python setup.py install 181 | 182 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/langchangetrack.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/langchangetrack.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/langchangetrack" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/langchangetrack" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\langchangetrack.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\langchangetrack.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/syntactic/scripts/pos_displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | import gensim 18 | 19 | from langchangetrack.utils.dummy_regressor import DummyRegressor 20 | from langchangetrack.utils import LocalLinearRegression 21 | from langchangetrack.utils import entropy 22 | from langchangetrack.tsconstruction.displacements import Displacements 23 | 24 | import logging 25 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 26 | logger = logging.getLogger("langchangetrack") 27 | 28 | import psutil 29 | from multiprocessing import cpu_count 30 | 31 | p = psutil.Process(os.getpid()) 32 | p.set_cpu_affinity(list(range(cpu_count()))) 33 | 34 | def get_vectors_pos(model, norm_embedding=True): 35 | return model 36 | 37 | def load_model_pos(model_path): 38 | """ Load the POS model from a file.""" 39 | return pd.read_csv(model_path) 40 | 41 | def load_predictor_pos(predictor_path): 42 | """ Load the predictor model. """ 43 | return DummyRegressor() 44 | 45 | class POSDisplacements(Displacements): 46 | def __init__(self, 47 | data_dir, 48 | pred_dir, 49 | words_file, 50 | timepoints, 51 | num_words, 52 | get_vectors, 53 | load_model, 54 | load_predictor, 55 | method, 56 | win_size, 57 | fixed_point, 58 | embedding_suffix, 59 | predictor_suffix, 60 | workers): 61 | 62 | """ Constructor """ 63 | # Initialize the super class. 64 | super(POSDisplacements, self).__init__() 65 | self.get_vectors = get_vectors 66 | self.load_model = load_model 67 | self.has_predictors = True 68 | self.load_predictor = load_predictor 69 | self.norm_embedding = False 70 | self.words_file = words_file 71 | self.timepoints = timepoints 72 | self.data_dir = data_dir 73 | self.pred_dir = pred_dir 74 | self.num_words = num_words 75 | self.method = method 76 | self.win_size = win_size 77 | self.fixed_point = fixed_point 78 | self.embedding_suffix = embedding_suffix 79 | self.predictor_suffix = predictor_suffix 80 | self.workers = workers 81 | 82 | def number_distance_metrics(self): 83 | return 1 84 | 85 | def calculate_distance(self, vec1, vec2): 86 | """ Calculate distances between vector1 and vector2. """ 87 | if vec1 is None or vec2 is None: 88 | return [np.nan] 89 | d = entropy.jensen_shannon_divergence(np.vstack([vec1, vec2]), unit='digit') 90 | return [d[0]] 91 | 92 | def load_models_and_predictors(self): 93 | """ Load all the models and predictors. 
""" 94 | self.models = {} 95 | self.predictors = {} 96 | model_paths = [path.join(self.data_dir, timepoint + self.embedding_suffix) for timepoint in self.timepoints] 97 | predictor_handles = [timepoint for timepoint in self.timepoints] 98 | loaded_models = Parallel(n_jobs=self.workers)(delayed(self.load_model)(model_path) for model_path in model_paths) 99 | for i, timepoint in enumerate(self.timepoints): 100 | self.models[timepoint] = loaded_models[i] 101 | self.predictors[timepoint] = self.load_predictor(predictor_handles[i]) 102 | print "Done loading predictors" 103 | 104 | def is_present(self, timepoint, word): 105 | """ Check if the word is present in the vocabulary at this timepoint. """ 106 | model = self.get_model(timepoint) 107 | return word in model.word.values 108 | 109 | def get_vector(self, timepoint, word): 110 | """ Get the embedding for this word at the specified timepoint.""" 111 | model = self.get_model(timepoint) 112 | return model[model.word == word].values[0][1:] 113 | 114 | def main(args): 115 | syear = int(args.syear) 116 | eyear = int(args.eyear) 117 | stepsize = int(args.stepsize) 118 | timepoints = np.arange(syear, eyear, stepsize) 119 | timepoints = [str(t) for t in timepoints] 120 | workers = int(args.workers) 121 | # Create the main work horse. 122 | e = POSDisplacements(args.datadir, 123 | args.preddir, 124 | args.filename, 125 | timepoints, 126 | int(args.num_words), 127 | get_vectors_pos, 128 | load_model_pos, 129 | load_predictor_pos, 130 | args.method, 131 | args.win_size, 132 | str(args.fixed_point), 133 | args.embedding_suffix, 134 | args.predictor_suffix, 135 | workers) 136 | 137 | # Load the models and predictors 138 | e.load_models_and_predictors() 139 | 140 | # Calculate the word displacements and dump. 141 | L, H, dfo, dfn = e.calculate_words_displacement(column_names=['word', 's', 'otherword', 't', 'jsd'], n_jobs = workers) 142 | fname = 'timeseries_s_t' + '_' + args.outputsuffix + '.pkl' 143 | pickle.dump((L,H, dfo, dfn), open(path.join(args.outputdir, fname),'wb')) 144 | 145 | if __name__ == "__main__": 146 | parser = ArgumentParser() 147 | parser.add_argument("-f", "--file", dest="filename", help="Input file for words") 148 | parser.add_argument("-d", "--data_dir", dest="datadir", help="data directory") 149 | parser.add_argument("-p", "--pred_dir", dest="preddir", help="data directory") 150 | parser.add_argument("-o", "--output_dir", dest="outputdir", help="Output directory") 151 | parser.add_argument("-os", "--output_suffix", dest="outputsuffix", help="Output suffix") 152 | parser.add_argument("-es", "--emb_suffix", dest="embedding_suffix", help="embedding suffix") 153 | parser.add_argument("-ps", "--pred_suffix", dest="predictor_suffix",help="predictor suffix") 154 | parser.add_argument("-sy", "--start", dest="syear", default = '1800', help="start year") 155 | parser.add_argument("-ey", "--end", dest="eyear", default = '2010', help="end year(not included)") 156 | parser.add_argument("-s", "--window_size", dest="stepsize", default = 5, help="Window size for time series") 157 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default = 'pos', help="Embedding type") 158 | parser.add_argument("-m", "--method", dest="method", default="polar", help="Method to use") 159 | parser.add_argument("-w", "--win_size", dest="win_size", default="-1", help="Window size to use if not polar", type=int) 160 | parser.add_argument("-y", "--fixed_point", dest="fixed_point", default="-1", help="fixed point to use if method is fixed", type=int) 161 
| parser.add_argument("-n", "--num_words", dest="num_words", default = -1, help="Number of words", type=int) 162 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 163 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 164 | args = parser.parse_args() 165 | main(args) 166 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/detect_changepoints_word_ts_r.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import logging 4 | import pandas as pd 5 | import numpy as np 6 | import itertools 7 | import more_itertools 8 | import os 9 | 10 | from functools import partial 11 | 12 | from changepoint.utils.ts_stats import parallelize_func 13 | from changepoint.rchangepoint import estimate_cp_pval, estimate_cp 14 | 15 | import psutil 16 | from multiprocessing import cpu_count 17 | 18 | p = psutil.Process(os.getpid()) 19 | p.set_cpu_affinity(list(range(cpu_count()))) 20 | 21 | __author__ = "Vivek Kulkarni" 22 | __email__ = "viveksck@gmail.com" 23 | 24 | # Global variable specifying which column index the time series 25 | # begins in a dataframe 26 | TS_OFFSET = 2 27 | 28 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 29 | 30 | def normalize_timeseries(df): 31 | """ Centre and scale each time series column. """ 32 | # Normalize a set of time series by subtracting the mean from each column 33 | # and dividing by the standard deviation. 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[TS_OFFSET:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | def get_filtered_df(df, vocab_file): 42 | """ Return a data frame with only the words present in the vocab file. """ 43 | if vocab_file: 44 | vocab = open(vocab_file).readlines() 45 | vocab = [v.strip() for v in vocab] 46 | # Get the set of words. 47 | words = pd.Series(df.word.values.ravel()).unique() 48 | set_words = set(words) 49 | # Find the words common to data frame and vocab 50 | common_set_words = set_words & set(vocab) 51 | # Filter the dataframe 52 | df_filtered = df[df.word.isin(common_set_words)] 53 | return df_filtered 54 | else: 55 | return df 56 | 57 | def get_actual_cp(df, cp_idx): 58 | """ 59 | Return the actual time point corresponding to the change point index. 60 | """ 61 | # If the cpt detection did not find any changepoint it 62 | # returns NAN in which case we return the same 63 | if np.isnan(cp_idx): 64 | return cp_idx 65 | 66 | # Add 1 as the first column is word. 67 | return df.columns[cp_idx + 1] 68 | 69 | def get_pval_word_chunk(chunk, df, threshold = None): 70 | """ 71 | Process each word in a chunk and return pvalue and changepoint. 72 | Here we set R changepoint class = FALSE which return pvalue. 73 | 74 | """ 75 | results = [] 76 | for w in chunk: 77 | # Get the time series. 78 | ts = np.array(df[df.word == w].values[0][TS_OFFSET:]) 79 | # Process that time series. 80 | results.append(estimate_cp_pval(ts)) 81 | return results 82 | 83 | 84 | def get_cp_word_chunk(chunk, df, threshold = None): 85 | """ 86 | Process each word in a chunk and return changepoints. Does not return 87 | pvalue. 88 | """ 89 | results = [] 90 | for w in chunk: 91 | ts = np.array(df[df.word == w].values[0][TS_OFFSET:]) 92 | cp_list = estimate_cp(ts) 93 | if len(cp_list): 94 | # Returns most recent change point if any. 
95 | results.append(cp_list[-1]) 96 | else: 97 | # No change points. 98 | results.append(np.nan) 99 | return results 100 | 101 | 102 | def main(args): 103 | # Read the arguments 104 | df_f = args.filename 105 | common_vocab_file = args.vocab_file 106 | pval_file = args.pval_file 107 | col_to_drop = args.col 108 | should_normalize = not(args.dont_normalize) 109 | n_jobs = int(args.workers) 110 | cp_pval = args.dump_pval 111 | if args.threshold != None: 112 | threshold = float(args.threshold) 113 | else: 114 | threshold = None 115 | 116 | print "CONFIG:" 117 | print "FILENAME:", df_f 118 | print "VOCAB FILE:", common_vocab_file 119 | print "PVAL_FILE:", pval_file 120 | print "COL TO DROP:", col_to_drop 121 | print "NORMALIZE:", should_normalize 122 | print "Threshold", threshold 123 | 124 | # Read the time series data 125 | df = pd.read_csv(df_f) 126 | # Restrict only to the common vocabulary. 127 | df = get_filtered_df(df, common_vocab_file) 128 | 129 | # Normalize the data frame 130 | if should_normalize: 131 | norm_df = normalize_timeseries(df) 132 | else: 133 | norm_df = df 134 | 135 | # Drop a column if needed. 136 | if col_to_drop in norm_df.columns: 137 | cols = df.columns.tolist() 138 | if col_to_drop == norm_df.columns[-1]: 139 | time_points = cols[2:] 140 | new_cols = cols[0:2] + time_points[::-1] 141 | norm_df = norm_df[new_cols] 142 | print norm_df.columns 143 | norm_df.drop(col_to_drop, axis = 1, inplace=True) 144 | 145 | print "Columns of the time series", norm_df.columns 146 | cwords = norm_df.word.values 147 | print "Number of words we are processing", len(cwords) 148 | 149 | chunksz = np.ceil(len(cwords)/float(n_jobs)) 150 | if cp_pval: 151 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=n_jobs, df = norm_df, threshold = threshold) 152 | cps, pvals = zip(*results) 153 | # R returns 1 for a very high stat significance. So we invert it as for 154 | # us low pvalues mean more significance. 
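        # For example, a shift that R reports with significance 0.99 is stored here as pval = 0.01.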
155 | pvals = [(1.0 - pval) for pval in pvals] 156 | actual_cps = [get_actual_cp(norm_df, cp) for cp in cps] 157 | results = zip(cwords, actual_cps, pvals) 158 | header = ['word', 'cp', 'pval'] 159 | pvalue_df = pd.DataFrame().from_records(results, columns=header) 160 | sdf = pvalue_df.sort(columns=['pval']) 161 | sdf.to_csv(pval_file, encoding='utf-8', index = None) 162 | else: 163 | results = parallelize_func(cwords[:], get_cp_word_chunk, chunksz=chunksz, n_jobs=n_jobs, df = norm_df) 164 | cps = results 165 | actual_cps = [get_actual_cp(norm_df, cp) for cp in cps] 166 | results = zip(cwords, actual_cps) 167 | header = ['word', 'cp'] 168 | pvalue_df = pd.DataFrame().from_records(results, columns=header) 169 | sdf = pvalue_df.sort(columns=['cp']) 170 | sdf.to_csv(pval_file, encoding='utf-8', index = None) 171 | 172 | if __name__ == "__main__": 173 | parser = ArgumentParser() 174 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 175 | parser.add_argument("-v", "--vfile", dest="vocab_file", help="Input file") 176 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Input file") 177 | parser.add_argument("-c", "--col", dest="col", help="Input file") 178 | parser.add_argument("-s", "--shuffle", dest="shuffle", action='store_true', default = False, help="Shuffle") 179 | parser.add_argument("-d", "--dont_normalize", dest="dont_normalize", action='store_true', default = False, help="Dont normalize") 180 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 181 | parser.add_argument("-dump_pval", "--dump_pval", dest="dump_pval",default=False, action='store_true', help="Dump pvalue or not") 182 | parser.add_argument("-t", "--threshold", dest="threshold", default=None, type=float, help="threshold") 183 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 184 | args = parser.parse_args() 185 | if args.log == 'DEBUG': 186 | sys.excepthook = debug 187 | numeric_level = getattr(logging, args.log.upper(), None) 188 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 189 | main(args) 190 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/dump_timeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """dump_timeseries.py: Dumps the displacements as a timeseries in a data frame""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | import os 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import pickle 14 | import pandas as pd 15 | import numpy as np 16 | import more_itertools 17 | 18 | from scipy.interpolate import interp1d 19 | from scipy.interpolate import UnivariateSpline 20 | 21 | from joblib import Parallel, delayed 22 | 23 | import psutil 24 | from multiprocessing import cpu_count 25 | 26 | p = psutil.Process(os.getpid()) 27 | p.set_cpu_affinity(list(range(cpu_count()))) 28 | 29 | __author__ = "Vivek Kulkarni" 30 | __email__ = "viveksck@gmail.com" 31 | 32 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 33 | 34 | 35 | def interpolate(x, xinter, values, finter): 36 | # Find all the points which we need to interpolate 37 | xmissing = [xm for xm in xinter if xm not in x] 38 | # Interpolate the function value at those points 39 | yintervalues = finter(xmissing) 40 | # Original points and values pairs 41 | 
orig_pairs = zip(x, values) 42 | # Interpolated points and values pairs 43 | interp_pairs = zip(xmissing, yintervalues) 44 | # Find the final values 45 | assert(len(orig_pairs) + len(interp_pairs) == len(xinter)) 46 | final_pairs = sorted(orig_pairs + interp_pairs) 47 | return final_pairs 48 | 49 | 50 | def create_word_time_series(old_df, new_df, w, sourcexinter, destxinter, metric_name="", interpolate=False): 51 | """ Create the time series for a word. """ 52 | 53 | sourcex = np.asarray(old_df[old_df.word == w].s.values, dtype=int) 54 | destx = np.asarray(new_df[new_df.word == w].s.values, dtype=int) 55 | 56 | old_values = old_df[old_df.word == w][metric_name].values 57 | new_values = new_df[new_df.word == w][metric_name].values 58 | 59 | try: 60 | fold = interp1d(sourcex, old_values, bounds_error=False) 61 | fnew = interp1d(destx, new_values, bounds_error=False) 62 | except: 63 | print "Failed to interpolate", w 64 | return None, None 65 | 66 | if interpolate: 67 | final_old_pairs = interpolate(sourcex, sourcexinter, old_values, fold) 68 | final_new_pairs = interpolate(destx, destxinter, new_values, fnew) 69 | xinterold, yinterold = zip(*final_old_pairs) 70 | xinternew, yinternew = zip(*final_new_pairs) 71 | else: 72 | yinterold = old_values 73 | yinternew = new_values 74 | 75 | OL = [w] 76 | NL = [w] 77 | OL.extend(yinterold) 78 | NL.extend(yinternew) 79 | return (OL, NL) 80 | 81 | 82 | def process_chunk(chunk, func, olddf, newdf, sourcexinter, destxinter, metric_name, interpolate): 83 | """ Process each chunk. """ 84 | results = [func(olddf, newdf, e, sourcexinter, destxinter, metric_name, interpolate) 85 | for e in chunk] 86 | return results 87 | 88 | 89 | def main(args): 90 | # get the arguments 91 | method = args.method 92 | win_size = args.win_size 93 | step = args.step 94 | metric_name = args.metric_name 95 | n_jobs = args.workers 96 | 97 | # Load the data. 
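    # The input pickle is the (L, H, olddf, newdf) tuple written by the displacement scripts
    # (e.g. embedding_displacements.py or pos_displacements.py, as timeseries_s_t_<suffix>.pkl).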
98 | L, H, olddf, newdf = pickle.load(open(args.filename)) 99 | words = pd.Series(olddf.word.values.ravel()).unique() 100 | oldrows = [] 101 | newrows = [] 102 | sourcexrange = np.arange(args.mint, args.maxt, step) 103 | destxrange = np.arange(args.mint, args.maxt, step) 104 | if method == 'win': 105 | sourcexrange = sourcexrange[win_size:] 106 | destxrange = destxrange[:-win_size] 107 | 108 | if args.interpolate: 109 | sourcexinter = np.arange(sourcexrange[0], sourcexrange[-1] + 1, 1) 110 | destxinter = np.arange(destxrange[0], destxrange[-1] + 1, 1) 111 | else: 112 | sourcexinter = sourcexrange 113 | destxinter = destxrange 114 | 115 | # Construct the series 116 | assert(len(sourcexinter) == len(destxinter)) 117 | chunk_sz = np.ceil(len(words)/float(n_jobs)) 118 | words_chunks = more_itertools.chunked(words, chunk_sz) 119 | timeseries_chunks = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, create_word_time_series, olddf, newdf, 120 | sourcexinter, destxinter, 121 | metric_name=metric_name, 122 | interpolate=args.interpolate) for chunk in words_chunks) 123 | 124 | timeseries = list(more_itertools.flatten(timeseries_chunks)) 125 | 126 | # Dump the data frame 127 | for orow, newrow in timeseries: 128 | if orow and newrow: 129 | oldrows.append(orow) 130 | newrows.append(newrow) 131 | 132 | oldtimeseries = pd.DataFrame() 133 | newtimeseries = pd.DataFrame() 134 | header = ['word'] 135 | header.extend(sourcexinter) 136 | newheader = ['word'] 137 | newheader.extend(destxinter) 138 | oldtimeseries = oldtimeseries.from_records(oldrows, columns=header) 139 | oldtimeseries = oldtimeseries.fillna(method='backfill', axis=1) 140 | newtimeseries = newtimeseries.from_records(newrows, columns=newheader) 141 | newtimeseries = newtimeseries.fillna(method='backfill', axis=1) 142 | oldtimeseries.to_csv(args.sourcetimef, encoding='utf-8') 143 | newtimeseries.to_csv(args.endtimef, encoding='utf-8') 144 | 145 | 146 | def debug(type_, value, tb): 147 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 148 | # we are in interactive mode or we don't have a tty-like device, so we 149 | # call the default hook 150 | sys.__excepthook__(type_, value, tb) 151 | else: 152 | import traceback 153 | import pdb 154 | # we are NOT in interactive mode, print the exception... 155 | traceback.print_exception(type_, value, tb) 156 | print("\n") 157 | # ...then start the debugger in post-mortem mode. 
158 | pdb.pm() 159 | 160 | if __name__ == "__main__": 161 | parser = ArgumentParser() 162 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 163 | parser.add_argument("-i", "--interpolate", dest="interpolate", help="interpolate", action='store_true', default=False) 164 | parser.add_argument("-s", "--sfile", dest="sourcetimef", help="Input file") 165 | parser.add_argument("-e", "--efile", dest="endtimef", help="Input file") 166 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 167 | parser.add_argument("-m", "--min", dest="mint", help="starting time point", default=1900, type=int) 168 | parser.add_argument("-n", "--max", dest="maxt", help="ending timepoint(not included)", default=2010, type=int) 169 | parser.add_argument("-st", "--step", dest="step", help="stepsize", default=5, type=int) 170 | parser.add_argument("-me", "--method", dest="method", default="polar", help="Method to use") 171 | parser.add_argument("-metric", "--metric_name", dest="metric_name", default="cosine", help="Metric name to use") 172 | parser.add_argument("-w", "--win_size", dest="win_size", default=-1, help="Window size to use if not polar", type=int) 173 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 174 | args = parser.parse_args() 175 | if args.log == 'DEBUG': 176 | sys.excepthook = debug 177 | numeric_level = getattr(logging, args.log.upper(), None) 178 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 179 | main(args) 180 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/embedding_displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | import langchangetrack 18 | from langchangetrack.utils.dummy_regressor import DummyRegressor 19 | import gensim 20 | 21 | from langchangetrack.tsconstruction.displacements import Displacements 22 | 23 | import logging 24 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 25 | logger = logging.getLogger("langchangetrack") 26 | 27 | import psutil 28 | from multiprocessing import cpu_count 29 | 30 | p = psutil.Process(os.getpid()) 31 | p.set_cpu_affinity(list(range(cpu_count()))) 32 | 33 | 34 | def uniform(distances): 35 | return np.ones(len(distances)) 36 | 37 | 38 | def get_vectors_sg(model, norm_embedding=True): 39 | """ Return the embeddings of a skipgram model. """ 40 | if norm_embedding: 41 | return model.syn0norm 42 | else: 43 | return model.syn0 44 | 45 | 46 | def load_model_skipgram(model_path): 47 | """ Load the skipgram model from a file in word2vec format. """ 48 | return gensim.models.Word2Vec.load_word2vec_format(model_path) 49 | 50 | 51 | def load_predictor_skipgram(predictor_path): 52 | """ Load the predictor model. 
""" 53 | return pickle.load(open(predictor_path)) 54 | 55 | 56 | class EmbeddingsDisplacements(Displacements): 57 | 58 | def __init__(self, 59 | data_dir, 60 | pred_dir, 61 | words_file, 62 | timepoints, 63 | num_words, 64 | get_vectors, 65 | load_model, 66 | load_predictor, 67 | method, 68 | win_size, 69 | fixed_point, 70 | embedding_suffix, 71 | predictor_suffix, 72 | workers): 73 | """ Constructor """ 74 | # Initialize the super class. 75 | super(EmbeddingsDisplacements, self).__init__() 76 | self.get_vectors = get_vectors 77 | self.load_model = load_model 78 | self.has_predictors = True 79 | self.load_predictor = load_predictor 80 | self.norm_embedding = True 81 | self.words_file = words_file 82 | self.timepoints = timepoints 83 | self.data_dir = data_dir 84 | self.pred_dir = pred_dir 85 | self.num_words = num_words 86 | self.method = method 87 | self.win_size = win_size 88 | self.fixed_point = fixed_point 89 | self.embedding_suffix = embedding_suffix 90 | self.predictor_suffix = predictor_suffix 91 | self.workers = workers 92 | 93 | def number_distance_metrics(self): 94 | return 2 95 | 96 | def calculate_distance(self, vec1, vec2): 97 | """ Calculate distances between vector1 and vector2. """ 98 | return [cosine(vec1, vec2), euclidean(vec1, vec2)] 99 | 100 | def load_models_and_predictors(self): 101 | """ Load all the models and predictors. """ 102 | self.models = {} 103 | self.predictors = {} 104 | model_paths = [path.join(self.data_dir, timepoint + '_embeddings' + self.embedding_suffix) for timepoint in self.timepoints] 105 | predictor_handles = [path.join(self.pred_dir, timepoint + '_embeddings' + self.predictor_suffix) for timepoint in self.timepoints] 106 | loaded_models = Parallel(n_jobs=self.workers)(delayed(self.load_model)(model_path) for model_path in model_paths) 107 | for i, timepoint in enumerate(self.timepoints): 108 | self.models[timepoint] = loaded_models[i] 109 | self.predictors[timepoint] = self.load_predictor(predictor_handles[i]) 110 | if hasattr(self.predictors[timepoint], 'weight_func'): 111 | self.predictors[timepoint].weight_func = uniform 112 | print "Loaded predictor for", timepoint 113 | print "Done loading predictors" 114 | 115 | def is_present(self, timepoint, word): 116 | """ Check if the word is present in the vocabulary at this timepoint. """ 117 | model = self.get_model(timepoint) 118 | return word in model.vocab 119 | 120 | def get_vector(self, timepoint, word): 121 | """ Get the embedding for this word at the specified timepoint.""" 122 | model = self.get_model(timepoint) 123 | return self.get_vectors(model, self.norm_embedding)[model.vocab[word].index] 124 | 125 | 126 | def main(args): 127 | syear = int(args.syear) 128 | eyear = int(args.eyear) 129 | stepsize = int(args.stepsize) 130 | timepoints = np.arange(syear, eyear, stepsize) 131 | timepoints = [str(t) for t in timepoints] 132 | workers = int(args.workers) 133 | # Create the main work horse. 134 | e = EmbeddingsDisplacements(args.datadir, 135 | args.preddir, 136 | args.filename, 137 | timepoints, 138 | int(args.num_words), 139 | get_vectors_sg, 140 | load_model_skipgram, 141 | load_predictor_skipgram, 142 | args.method, 143 | args.win_size, 144 | str(args.fixed_point), 145 | args.embedding_suffix, 146 | args.predictor_suffix, 147 | workers) 148 | 149 | # Load the models and predictors 150 | e.load_models_and_predictors() 151 | 152 | # Calculate the word displacements and dump. 
153 | L, H, dfo, dfn = e.calculate_words_displacement(column_names=['word', 's', 'otherword', 't', 'cosine', 'euclidean'], n_jobs=workers) 154 | fname = 'timeseries_s_t' + '_' + args.outputsuffix + '.pkl' 155 | pickle.dump((L, H, dfo, dfn), open(path.join(args.outputdir, fname), 'wb')) 156 | 157 | if __name__ == "__main__": 158 | parser = ArgumentParser() 159 | parser.add_argument("-f", "--file", dest="filename", help="Input file for words") 160 | parser.add_argument("-d", "--data_dir", dest="datadir", help="data directory") 161 | parser.add_argument("-p", "--pred_dir", dest="preddir", help="data directory") 162 | parser.add_argument("-o", "--output_dir", dest="outputdir", help="Output directory") 163 | parser.add_argument("-os", "--output_suffix", dest="outputsuffix", help="Output suffix") 164 | parser.add_argument("-es", "--emb_suffix", dest="embedding_suffix", help="embedding suffix") 165 | parser.add_argument("-ps", "--pred_suffix", dest="predictor_suffix", help="predictor suffix") 166 | parser.add_argument("-sy", "--start", dest="syear", default='1800', help="start year") 167 | parser.add_argument("-ey", "--end", dest="eyear", default='2010', help="end year(not included)") 168 | parser.add_argument("-s", "--window_size", dest="stepsize", default=5, help="Window size for time series") 169 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default='skipgram', help="Embedding type") 170 | parser.add_argument("-m", "--method", dest="method", default="polar", help="Method to use") 171 | parser.add_argument("-w", "--win_size", dest="win_size", default="-1", help="Window size to use if not polar", type=int) 172 | parser.add_argument("-y", "--fixed_point", dest="fixed_point", default="-1", help="fixed point to use if method is fixed", type=int) 173 | parser.add_argument("-n", "--num_words", dest="num_words", default=-1, help="Number of words", type=int) 174 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 175 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 176 | args = parser.parse_args() 177 | main(args) 178 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/demostrate_cp.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import pandas as pd 4 | import numpy as np 5 | import itertools 6 | import more_itertools 7 | import os 8 | 9 | from functools import partial 10 | from changepoint.mean_shift_model import MeanShiftModel 11 | from changepoint.utils.ts_stats import parallelize_func 12 | 13 | __author__ = "Vivek Kulkarni" 14 | __email__ = "viveksck@gmail.com" 15 | 16 | import psutil 17 | from multiprocessing import cpu_count 18 | 19 | p = psutil.Process(os.getpid()) 20 | p.set_cpu_affinity(list(range(cpu_count()))) 21 | 22 | # Global variable specifying which column index the time series 23 | # begins in a dataframe 24 | TS_OFFSET = 2 25 | 26 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 27 | 28 | 29 | def normalize_timeseries(df): 30 | """ 31 | Normalize each column of the data frame by its mean and standard 32 | deviation. 
33 | """ 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[2:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | 42 | def get_filtered_df(df, vocab_file): 43 | """ Return a data frame with only the words present in the vocab file. """ 44 | if vocab_file: 45 | vocab = open(vocab_file).readlines() 46 | vocab = [v.strip() for v in vocab] 47 | # Get the set of words. 48 | words = pd.Series(df.word.values.ravel()).unique() 49 | set_words = set(words) 50 | # Find the words common to data frame and vocab 51 | common_set_words = set_words & set(vocab) 52 | # Filter the dataframe 53 | df_filtered = df[df.word.isin(common_set_words)] 54 | return df_filtered 55 | else: 56 | return df 57 | 58 | 59 | def get_pval_word(df, word, B): 60 | """ 61 | Get the pvalue of a change point at each time point 't' corresponding to 62 | the word. Also return the number of tail successes during boot strap. 63 | Use a mean shift model for this. 64 | """ 65 | # Remove the first TS_OFFSET columns as it is 'index' and 'word' to get the 66 | # time series for that word. 67 | ts = df[df.word == word].values[0][TS_OFFSET:] 68 | # Create a mean shift model 69 | model = MeanShiftModel() 70 | # Detect the change points using a mean shift model 71 | stats_ts, pvals, nums = model.detect_mean_shift(ts, B=B) 72 | # Return the word and pvals associated with each time point. 73 | L = [word] 74 | L.extend(pvals) 75 | H = [word] 76 | H.extend(nums) 77 | return L, H 78 | 79 | 80 | def get_pval_word_chunk(chunk, df, B): 81 | """ Get the p-values for each time point for a chunk of words. """ 82 | results = [get_pval_word(df, w, B) for w in chunk] 83 | return results 84 | 85 | 86 | def get_minpval_cp(pvalue_df_row): 87 | """ 88 | Get the minimum p-value and the corresponding time point for each word. 89 | """ 90 | # first column is 'word', so ignore it 91 | index_series = pvalue_df_row.index[1:] 92 | row_series = pvalue_df_row.values[1:] 93 | assert(len(index_series) == len(row_series)) 94 | 95 | # Find the minimum pvalue 96 | min_pval = np.min(row_series) 97 | # Find the index where the minimum pvalue occurrs. 98 | min_idx = np.argmin(row_series) 99 | # Get the timepoint corresponding to that index 100 | min_cp = index_series[min_idx] 101 | 102 | return min_pval, min_cp 103 | 104 | 105 | def get_cp_pval(pvalue_df_row, zscore_df, threshold=0.0): 106 | """ 107 | Get the minimum p-value corresponding timepoint which also has 108 | a Z-SCORE > threshold. 109 | 110 | """ 111 | # First column is 'word', so ignore it 112 | row_series = pvalue_df_row.values[1:] 113 | # Corresponding Z-Score series for the exact same set of timepoints. 114 | zscore_series = np.array(zscore_df[zscore_df.word == pvalue_df_row.word][pvalue_df_row.index[1:]])[0] 115 | assert(len(zscore_series) == len(row_series)) 116 | 117 | # Get all the indices where zscore exceeds a threshold 118 | sel_idx = np.where(zscore_series > threshold)[0] 119 | # If there are no such indices return NAN 120 | if not len(sel_idx): 121 | return 1.0, np.nan 122 | 123 | # We have some indices. Select the pvalues for those indices. 124 | pvals_indices = np.take(row_series, sel_idx) 125 | # Find the minimum pvalue among those candidates. 
126 | min_pval = np.min(pvals_indices) 127 | # Find the minimum candidate index corresponding to that pvalue 128 | min_idx = np.argmin(pvals_indices) 129 | # Select the actual index that it corresponds to 130 | cp_idx = sel_idx[min_idx] 131 | # Translate that to the actual timepoint and return it. 132 | cp = pvalue_df_row.index[1:][cp_idx] 133 | return min_pval, cp 134 | 135 | 136 | def main(args): 137 | # Read the arguments 138 | df_f = args.filename 139 | pval_file = args.pval_file 140 | sample_file = args.sample_file 141 | col_to_drop = args.col 142 | threshold = float(args.threshold) 143 | workers = args.workers 144 | print "Config:" 145 | print "Input data frame file name:", df_f 146 | print "Output pvalue file", pval_file 147 | print "Output sample file", sample_file 148 | print "Columns to drop", col_to_drop 149 | print "Threshold", threshold 150 | 151 | # Read the time series data 152 | norm_df = pd.read_csv(df_f) 153 | 154 | # Drop the column if needed. We typically drop the 1st column as it always is 0 by 155 | # default. 156 | if col_to_drop in norm_df.columns: 157 | cols = norm_df.columns.tolist() 158 | if col_to_drop == norm_df.columns[-1]: 159 | time_points = cols[2:] 160 | new_cols = cols[0:2] + time_points[::-1] 161 | norm_df = norm_df[new_cols] 162 | norm_df.drop(col_to_drop, axis=1, inplace=True) 163 | print "Dropped column", col_to_drop 164 | 165 | print "Columns of the data frame are", norm_df.columns 166 | cwords = norm_df.word.values 167 | print "Number of words we are analyzing:", len(cwords) 168 | 169 | chunksz = np.ceil(len(cwords) / float(workers)) 170 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B) 171 | 172 | pvals, num_samples = zip(*results) 173 | 174 | header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1]) 175 | pvalue_df = pd.DataFrame().from_records(list(pvals), columns=header) 176 | 177 | # Append additonal columns to the final df 178 | pvalue_df_final = pvalue_df.copy(deep=True) 179 | 180 | pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(*pvalue_df.apply(get_minpval_cp, axis=1)) 181 | pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(*pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df, threshold=threshold)) 182 | 183 | pvalue_df_final.drop(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1], axis=1, inplace = True) 184 | 185 | # Write the pvalue output. 
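# (num_samples_df below holds the per-word bootstrap tail counts and is written to sample_file;
# the p-value frame, sorted by 'tpval', is what ends up in pval_file.)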
186 | num_samples_df = pd.DataFrame().from_records(list(num_samples), columns=header) 187 | num_samples_df.to_csv(sample_file, encoding='utf-8') 188 | 189 | # Write the sample output 190 | sdf = pvalue_df_final.sort(columns=['tpval']) 191 | sdf.to_csv(pval_file, encoding='utf-8') 192 | 193 | if __name__ == "__main__": 194 | parser = ArgumentParser() 195 | parser.add_argument("-f", "--file", dest="filename", help="Input time series file") 196 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Output pvalue file") 197 | parser.add_argument("-n", "--nfile", dest="sample_file", help="Output sample file") 198 | parser.add_argument("-c", "--col", dest="col", help="column to drop") 199 | parser.add_argument("-t", "--threshold", dest="threshold", default=1.75, type=float, help="Threshold to use for mean shift model.") 200 | parser.add_argument("-b", "--bootstrap", dest="B", default=1000, type=int, help="Number of bootstrapped samples to take(default:1000)") 201 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 202 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 203 | args = parser.parse_args() 204 | if args.log == 'DEBUG': 205 | sys.excepthook = debug 206 | numeric_level = getattr(logging, args.log.upper(), None) 207 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 208 | main(args) 209 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/detect_changepoints_word_ts.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import pandas as pd 4 | import numpy as np 5 | import itertools 6 | import more_itertools 7 | import os 8 | 9 | from functools import partial 10 | from changepoint.mean_shift_model import MeanShiftModel 11 | from changepoint.utils.ts_stats import parallelize_func 12 | 13 | __author__ = "Vivek Kulkarni" 14 | __email__ = "viveksck@gmail.com" 15 | 16 | import psutil 17 | from multiprocessing import cpu_count 18 | 19 | p = psutil.Process(os.getpid()) 20 | p.set_cpu_affinity(list(range(cpu_count()))) 21 | 22 | # Global variable specifying which column index the time series 23 | # begins in a dataframe 24 | TS_OFFSET = 2 25 | 26 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 27 | 28 | 29 | def normalize_timeseries(df): 30 | """ 31 | Normalize each column of the data frame by its mean and standard 32 | deviation. 33 | """ 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[2:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | 42 | def get_filtered_df(df, vocab_file): 43 | """ Return a data frame with only the words present in the vocab file. """ 44 | if vocab_file: 45 | vocab = open(vocab_file).readlines() 46 | vocab = [v.strip() for v in vocab] 47 | # Get the set of words. 48 | words = pd.Series(df.word.values.ravel()).unique() 49 | set_words = set(words) 50 | # Find the words common to data frame and vocab 51 | common_set_words = set_words & set(vocab) 52 | # Filter the dataframe 53 | df_filtered = df[df.word.isin(common_set_words)] 54 | return df_filtered 55 | else: 56 | return df 57 | 58 | 59 | def get_pval_word(df, word, B): 60 | """ 61 | Get the pvalue of a change point at each time point 't' corresponding to 62 | the word. Also return the number of tail successes during boot strap. 
63 | Use a mean shift model for this. 64 | """ 65 | # Remove the first TS_OFFSET columns as it is 'index' and 'word' to get the 66 | # time series for that word. 67 | ts = df[df.word == word].values[0][TS_OFFSET:] 68 | # Create a mean shift model 69 | model = MeanShiftModel() 70 | # Detect the change points using a mean shift model 71 | stats_ts, pvals, nums = model.detect_mean_shift(ts, B=B) 72 | # Return the word and pvals associated with each time point. 73 | L = [word] 74 | L.extend(pvals) 75 | H = [word] 76 | H.extend(nums) 77 | return L, H 78 | 79 | 80 | def get_pval_word_chunk(chunk, df, B): 81 | """ Get the p-values for each time point for a chunk of words. """ 82 | results = [get_pval_word(df, w, B) for w in chunk] 83 | return results 84 | 85 | 86 | def get_minpval_cp(pvalue_df_row): 87 | """ 88 | Get the minimum p-value and the corresponding time point for each word. 89 | """ 90 | # first column is 'word', so ignore it 91 | index_series = pvalue_df_row.index[1:] 92 | row_series = pvalue_df_row.values[1:] 93 | assert(len(index_series) == len(row_series)) 94 | 95 | # Find the minimum pvalue 96 | min_pval = np.min(row_series) 97 | # Find the index where the minimum pvalue occurrs. 98 | min_idx = np.argmin(row_series) 99 | # Get the timepoint corresponding to that index 100 | min_cp = index_series[min_idx] 101 | 102 | return min_pval, min_cp 103 | 104 | 105 | def get_cp_pval(pvalue_df_row, zscore_df, threshold=0.0): 106 | """ 107 | Get the minimum p-value corresponding timepoint which also has 108 | a Z-SCORE > threshold. 109 | 110 | """ 111 | # First column is 'word', so ignore it 112 | row_series = pvalue_df_row.values[1:] 113 | # Corresponding Z-Score series for the exact same set of timepoints. 114 | zscore_series = np.array(zscore_df[zscore_df.word == pvalue_df_row.word][pvalue_df_row.index[1:]])[0] 115 | assert(len(zscore_series) == len(row_series)) 116 | 117 | # Get all the indices where zscore exceeds a threshold 118 | sel_idx = np.where(zscore_series > threshold)[0] 119 | # If there are no such indices return NAN 120 | if not len(sel_idx): 121 | return 1.0, np.nan 122 | 123 | # We have some indices. Select the pvalues for those indices. 124 | pvals_indices = np.take(row_series, sel_idx) 125 | # Find the minimum pvalue among those candidates. 126 | min_pval = np.min(pvals_indices) 127 | # Find the minimum candidate index corresponding to that pvalue 128 | min_idx = np.argmin(pvals_indices) 129 | # Select the actual index that it corresponds to 130 | cp_idx = sel_idx[min_idx] 131 | # Translate that to the actual timepoint and return it. 132 | cp = pvalue_df_row.index[1:][cp_idx] 133 | return min_pval, cp 134 | 135 | 136 | def main(args): 137 | # Read the arguments 138 | df_f = args.filename 139 | common_vocab_file = args.vocab_file 140 | pval_file = args.pval_file 141 | sample_file = args.sample_file 142 | col_to_drop = args.col 143 | should_normalize = not(args.dont_normalize) 144 | threshold = float(args.threshold) 145 | 146 | workers = args.workers 147 | 148 | print "Config:" 149 | print "Input data frame file name:", df_f 150 | print "Vocab file", common_vocab_file 151 | print "Output pvalue file", pval_file 152 | print "Output sample file", sample_file 153 | print "Columns to drop", col_to_drop 154 | print "Normalize Time series:", should_normalize 155 | print "Threshold", threshold 156 | 157 | # Read the time series data 158 | df = pd.read_csv(df_f) 159 | # Consider only words in the common vocabulary. 
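# (get_filtered_df() returns the frame unchanged when no vocabulary file is given, so -v/--vfile is optional.)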
160 | df = get_filtered_df(df, common_vocab_file) 161 | 162 | # Normalize the data frame 163 | if should_normalize: 164 | norm_df = normalize_timeseries(df) 165 | else: 166 | norm_df = df 167 | 168 | # Drop the column if needed. We typically drop the 1st column as it always is 0 by 169 | # default. 170 | if col_to_drop in norm_df.columns: 171 | cols = df.columns.tolist() 172 | if col_to_drop == norm_df.columns[-1]: 173 | time_points = cols[2:] 174 | new_cols = cols[0:2] + time_points[::-1] 175 | norm_df = norm_df[new_cols] 176 | norm_df.drop(col_to_drop, axis=1, inplace=True) 177 | print "Dropped column", col_to_drop 178 | 179 | print "Columns of the data frame are", norm_df.columns 180 | cwords = norm_df.word.values 181 | print "Number of words we are analyzing:", len(cwords) 182 | 183 | chunksz = np.ceil(len(cwords) / float(workers)) 184 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B) 185 | 186 | pvals, num_samples = zip(*results) 187 | 188 | header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1]) 189 | pvalue_df = pd.DataFrame().from_records(list(pvals), columns=header) 190 | 191 | # Append additonal columns to the final df 192 | pvalue_df_final = pvalue_df.copy(deep=True) 193 | 194 | pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(*pvalue_df.apply(get_minpval_cp, axis=1)) 195 | pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(*pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df, threshold=threshold)) 196 | 197 | pvalue_df_final.drop(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1], axis=1, inplace = True) 198 | 199 | # Write the pvalue output. 200 | num_samples_df = pd.DataFrame().from_records(list(num_samples), columns=header) 201 | num_samples_df.to_csv(sample_file, encoding='utf-8') 202 | 203 | # Write the sample output 204 | sdf = pvalue_df_final.sort(columns=['tpval']) 205 | sdf.to_csv(pval_file, encoding='utf-8') 206 | 207 | if __name__ == "__main__": 208 | parser = ArgumentParser() 209 | parser.add_argument("-f", "--file", dest="filename", help="Input time series file") 210 | parser.add_argument("-v", "--vfile", dest="vocab_file", help="Common Vocab file") 211 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Output pvalue file") 212 | parser.add_argument("-n", "--nfile", dest="sample_file", help="Output sample file") 213 | parser.add_argument("-c", "--col", dest="col", help="column to drop") 214 | parser.add_argument("-d", "--dont_normalize", dest="dont_normalize", action='store_true', default=False, help="Dont normalize") 215 | parser.add_argument("-t", "--threshold", dest="threshold", default=1.75, type=float, help="Threshold to use for mean shift model.") 216 | parser.add_argument("-b", "--bootstrap", dest="B", default=1000, type=int, help="Number of bootstrapped samples to take(default:1000)") 217 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 218 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 219 | args = parser.parse_args() 220 | if args.log == 'DEBUG': 221 | sys.excepthook = debug 222 | numeric_level = getattr(logging, args.log.upper(), None) 223 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 224 | main(args) 225 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | from langchangetrack.utils.dummy_regressor import DummyRegressor 18 | import gensim 19 | 20 | import logging 21 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 22 | logger = logging.getLogger("langchangetrack") 23 | 24 | import psutil 25 | from multiprocessing import cpu_count 26 | 27 | p = psutil.Process(os.getpid()) 28 | p.set_cpu_affinity(list(range(cpu_count()))) 29 | 30 | def normalize_vector(vec): 31 | """ Normalize a vector by its L2 norm. """ 32 | norm = (vec ** 2).sum() ** 0.5 33 | return (vec / norm) 34 | 35 | 36 | def pairwise(iterable): 37 | """ [a,b,c,d]=>[(a,b), (b,c), (c, d)] """ 38 | a, b = itertools.tee(iterable) 39 | next(b, None) 40 | return itertools.izip(a, b) 41 | 42 | 43 | def process_word_source(w, eobj): 44 | """ Calculate displacements of word for source timepoint tuples. """ 45 | return eobj.process_word(w, 0) 46 | 47 | 48 | def process_word_dest(w, eobj): 49 | """ Calculate displacements of word for destination timepoint tuples.""" 50 | return eobj.process_word(w, 1) 51 | 52 | 53 | def process_chunk(chunk, func, *args): 54 | """ Apply a function on each element of a iterable. """ 55 | L = [] 56 | for i, e in enumerate(chunk): 57 | L.append(func(e, *args)) 58 | if i % 10 == 0: 59 | print "Processing chunk", i 60 | return L 61 | 62 | 63 | class Displacements(object): 64 | 65 | def __init__(self): 66 | """ Constructor """ 67 | self.get_vectors = None 68 | self.load_model = None 69 | self.models = {} 70 | self.has_predictors = False 71 | self.load_predictor = None 72 | self.predictors = {} 73 | self.norm_embedding = False 74 | self.words_file = None 75 | self.timepoints = None 76 | self.data_dir = None 77 | self.pred_dir = None 78 | self.num_words = -1 79 | self.method = None 80 | self.win_size = -1 81 | self.fixed_point = -1 82 | self.embedding_suffix = None 83 | self.predictor_suffix = None 84 | 85 | def get_word_list(self): 86 | """ Returns a list of words for which time series needs to be generated. 87 | """ 88 | 89 | words_list = open(self.words_file, 'r').read().split('\n') 90 | if words_list[-1] == '': 91 | words_list = words_list[:-1] 92 | if self.num_words != -1: 93 | return words_list[:num_words] 94 | else: 95 | return words_list 96 | 97 | def get_tuples(self, word, timepoint1, timepoint2): 98 | """ Return what time point pairs we must consider fot the word. 
""" 99 | return [(word, timepoint1, word, timepoint2)] 100 | 101 | def generate_displacement_word(self, word, timepoints): 102 | L = [] 103 | 104 | for ot, nt in timepoints: 105 | modelo = self.get_predictor(ot) 106 | modeln = self.get_predictor(nt) 107 | tuples = self.get_tuples(word, ot, nt) 108 | 109 | for tup in tuples: 110 | word1 = tup[0] 111 | timepoint1 = tup[1] 112 | word2 = tup[2] 113 | timepoint2 = tup[3] 114 | 115 | if self.is_present(timepoint1, word1) and self.is_present(timepoint2, word2): 116 | vec1 = self.get_vector(timepoint1, word1) 117 | vec2 = self.get_vector(timepoint2, word2) 118 | 119 | if self.norm_embedding: 120 | assert(np.isclose(norm(vec1), 1.0)) 121 | assert(np.isclose(norm(vec2), 1.0)) 122 | 123 | vec1_pred = modelo.predict(vec1) 124 | vec2_pred = modeln.predict(vec2) 125 | 126 | if self.norm_embedding: 127 | vec1_pred = normalize_vector(vec1_pred) 128 | vec2_pred = normalize_vector(vec2_pred) 129 | assert(np.isclose(norm(vec1), 1.0)) 130 | assert(np.isclose(norm(vec2), 1.0)) 131 | 132 | d = self.calculate_distance(vec1_pred, vec2_pred) 133 | assert(len(d) == self.number_distance_metrics()) 134 | L.append([word1, timepoint1, word2, timepoint2] + d) 135 | else: 136 | # Word is not present in both time periods 137 | L.append([word1, timepoint1, word2, timepoint2] + list(itertools.repeat(np.nan, self.number_distance_metrics()))) 138 | return L 139 | 140 | def get_timepoints_word(self, w, timepoints): 141 | """ Get the list of timepoints to be considered for a word. """ 142 | for i, t in enumerate(timepoints): 143 | if self.is_present(t, w): 144 | break 145 | # We have foind the first instance of the word at this time point, 146 | timepoints_considered = timepoints[i:] 147 | 148 | # Create the tuples for calculating displacements based on strategy 149 | # used. 150 | if self.method == "polar": 151 | timepoints1 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[0], len(timepoints_considered)))) 152 | timepoints2 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[-1], len(timepoints_considered)))) 153 | elif self.method == 'win': 154 | timepoints1 = zip(timepoints_considered[win_size:], timepoints_considered[:-win_size]) 155 | timepoints2 = zip(timepoints_considered[:-win_size], timepoints_considered[win_size:]) 156 | elif self.method == 'fixed': 157 | timepoints1 = zip(timepoints_considered, list(itertools.repeat(fixed_point, len(timepoints_considered)))) 158 | timepoints2 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[-1], len(timepoints_considered)))) 159 | 160 | # Return the list if tuples 161 | return timepoints1, timepoints2 162 | 163 | def process_word(self, w, index): 164 | """ Calculate displacements of the word at each timepoint tuple. 165 | index: Are we using timepoints1 or timepoints2. 166 | """ 167 | t = self.get_timepoints_word(w, self.timepoints) 168 | return self.generate_displacement_word(w, t[index]) 169 | 170 | def calculate_words_displacement(self, column_names, n_jobs = 1): 171 | """ Calculate word displacements for each word in the Pandas data frame. """ 172 | 173 | words = self.get_word_list() 174 | # Create chunks of the words to be processed. 
175 | chunk_sz = np.ceil(len(words)/float(n_jobs)) 176 | chunks = list(more_itertools.chunked(words, chunk_sz)) 177 | 178 | # Calculate the displacements 179 | chunksL = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_source, self) for chunk in chunks) 180 | chunksH = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_dest, self) for chunk in chunks) 181 | L = more_itertools.flatten(chunksL) 182 | H = more_itertools.flatten(chunksH) 183 | flattendL = [x for sublist in L for x in sublist] 184 | flattendH = [x for sublist in H for x in sublist] 185 | 186 | # Store the results in a nice pandas data frame 187 | dfo, dfn = self.create_data_frames(flattendL, flattendH, column_names) 188 | return flattendL, flattendH, dfo, dfn 189 | 190 | def create_data_frames(self, L, H, column_names): 191 | """ Store the displacement of each word for the pair of timepoints in a 192 | nice Pandas data frame. """ 193 | dfo = pd.DataFrame() 194 | dfo = dfo.from_records(L, columns=column_names) 195 | dfo_clean = dfo.fillna(method='ffill') 196 | dfn = pd.DataFrame() 197 | dfn = dfn.from_records(H, columns=column_names) 198 | dfn_clean = dfn.fillna(method='bfill') 199 | return dfo_clean, dfn_clean 200 | 201 | def get_model(self, timepoint): 202 | """ Return the model corresponding to this timepoint. """ 203 | return self.models[timepoint] 204 | 205 | def get_predictor(self, timepoint): 206 | """ Return the predictor corresponding to this timepoint. """ 207 | return self.predictors[timepoint] 208 | 209 | def number_distance_metrics(self): 210 | """ The number of distance metrics evaluated by calculate_distance. """ 211 | raise NotImplementedError, "Pure virtual function" 212 | 213 | def calculate_distance(self, vec1, vec2): 214 | """ Calculate distances between vector1 and vector2. """ 215 | raise NotImplementedError, "Pure virtual function" 216 | 217 | def load_models_and_predictors(self): 218 | raise NotImplementedError, "Pure virtual function" 219 | 220 | def is_present(self, timepoint, word): 221 | """ Check if the word is present in the vocabulary at this timepoint. """ 222 | raise NotImplementedError, "Pure virtual function" 223 | 224 | def get_vector(self, timepoint, word): 225 | """ Get the embedding for this word at the specified timepoint.""" 226 | raise NotImplementedError, "Pure virtual function" 227 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # langchangetrack documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another 20 | # directory, add these directories to sys.path here. If the directory is 21 | # relative to the documentation root, use os.path.abspath to make it 22 | # absolute, like shown here. 
23 | #sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # Get the project root dir, which is the parent dir of this 26 | cwd = os.getcwd() 27 | project_root = os.path.dirname(cwd) 28 | 29 | # Insert the project root dir as the first element in the PYTHONPATH. 30 | # This lets us ensure that the source package is imported, and that its 31 | # version is used. 32 | sys.path.insert(0, project_root) 33 | 34 | import langchangetrack 35 | 36 | # -- General configuration --------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | #needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 43 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix of source filenames. 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = u'langchangetrack' 59 | copyright = u'2015, Vivek Kulkarni' 60 | 61 | # The version info for the project you're documenting, acts as replacement 62 | # for |version| and |release|, also used in various other places throughout 63 | # the built documents. 64 | # 65 | # The short X.Y version. 66 | version = langchangetrack.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = langchangetrack.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to 75 | # some non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built 106 | # documents. 107 | #keep_warnings = False 108 | 109 | 110 | # -- Options for HTML output ------------------------------------------- 111 | 112 | # The theme to use for HTML and HTML Help pages. See the documentation for 113 | # a list of builtin themes. 114 | html_theme = 'default' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a 117 | # theme further. 
For a list of options available for each theme, see the 118 | # documentation. 119 | #html_theme_options = {} 120 | 121 | # Add any paths that contain custom themes here, relative to this directory. 122 | #html_theme_path = [] 123 | 124 | # The name for this set of Sphinx documents. If None, it defaults to 125 | # " v documentation". 126 | #html_title = None 127 | 128 | # A shorter title for the navigation bar. Default is the same as 129 | # html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the 133 | # top of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon 137 | # of the docs. This file should be a Windows icon file (.ico) being 138 | # 16x16 or 32x32 pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) 142 | # here, relative to this directory. They are copied after the builtin 143 | # static files, so a file named "default.css" will overwrite the builtin 144 | # "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page 148 | # bottom, using the given strftime format. 149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names 159 | # to template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. 175 | # Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. 179 | # Default is True. 180 | #html_show_copyright = True 181 | 182 | # If true, an OpenSearch description file will be output, and all pages 183 | # will contain a tag referring to it. The value of this option 184 | # must be the base URL from which the finished HTML is served. 185 | #html_use_opensearch = '' 186 | 187 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 188 | #html_file_suffix = None 189 | 190 | # Output file base name for HTML help builder. 191 | htmlhelp_basename = 'langchangetrackdoc' 192 | 193 | 194 | # -- Options for LaTeX output ------------------------------------------ 195 | 196 | latex_elements = { 197 | # The paper size ('letterpaper' or 'a4paper'). 198 | #'papersize': 'letterpaper', 199 | 200 | # The font size ('10pt', '11pt' or '12pt'). 201 | #'pointsize': '10pt', 202 | 203 | # Additional stuff for the LaTeX preamble. 204 | #'preamble': '', 205 | } 206 | 207 | # Grouping the document tree into LaTeX files. List of tuples 208 | # (source start file, target name, title, author, documentclass 209 | # [howto/manual]). 
210 | latex_documents = [ 211 | ('index', 'langchangetrack.tex', 212 | u'langchangetrack Documentation', 213 | u'Vivek Kulkarni', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at 217 | # the top of the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings 221 | # are parts, not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | #latex_domain_indices = True 235 | 236 | 237 | # -- Options for manual page output ------------------------------------ 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'langchangetrack', 243 | u'langchangetrack Documentation', 244 | [u'Vivek Kulkarni'], 1) 245 | ] 246 | 247 | # If true, show URL addresses after external links. 248 | #man_show_urls = False 249 | 250 | 251 | # -- Options for Texinfo output ---------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ('index', 'langchangetrack', 258 | u'langchangetrack Documentation', 259 | u'Vivek Kulkarni', 260 | 'langchangetrack', 261 | 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
275 | #texinfo_no_detailmenu = False 276 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/learn_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Benchmark for the quality of the joint space""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | import os 11 | from os import path 12 | from time import time 13 | from glob import glob 14 | from collections import defaultdict 15 | from copy import deepcopy 16 | from random import shuffle 17 | import json 18 | import cPickle as pickle 19 | 20 | from sklearn.linear_model import LinearRegression 21 | from sklearn.neighbors import NearestNeighbors 22 | import numpy 23 | from numpy import asarray 24 | from langchangetrack.utils.LocalLinearRegression import LocalLinearRegression 25 | 26 | __author__ = "Rami Al-Rfou" 27 | __email__ = "rmyeid@gmail.com" 28 | 29 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 30 | 31 | reg_model = None 32 | K_NN = 1000 33 | 34 | 35 | class Mapping(object): 36 | 37 | """ Mapping between terms/phrases.""" 38 | 39 | def __init__(self, source=None, target=None): 40 | self.s_lang = source 41 | self.t_lang = target 42 | self.map = None 43 | 44 | 45 | class IdentityTranslations(Mapping): 46 | 47 | def __init__(self, source, target, se, te): 48 | super(IdentityTranslations, self).__init__(source, target) 49 | words = set(se.word_id.keys()) & set(te.word_id.keys()) 50 | D = {} 51 | for word in words: 52 | D[word] = word 53 | self.map = D 54 | 55 | 56 | class Embeddings(object): 57 | 58 | """ A list of words and their vector representatoins. 59 | 60 | We assume that the given words are sorted by their frequency. 
61 | """ 62 | 63 | def __init__(self, lang, filename=None, vectors=None, words=None): 64 | 65 | self.lang = lang 66 | if filename: 67 | self.filename = filename 68 | self.read_file() 69 | 70 | if vectors != None: 71 | self.vectors = asarray(vectors) 72 | if words: 73 | if len(set(words)) == len(words): 74 | self.word_id = {w: i for i, w in enumerate(words)} 75 | else: 76 | logging.debug("We have duplicate words.") 77 | self.word_id = {u'{}_{}'.format(w, i): i for i, w in enumerate(words)} 78 | self.id_word = {i: w for w, i in self.word_id.iteritems()} 79 | self.words = [w for w, i in Embeddings.sorted_words(self.word_id)] 80 | 81 | def read_file(self): 82 | raise NotImplementedError("Implement an embeddings reader.") 83 | 84 | def get_vectors(self, words=None): 85 | if words: 86 | return asarray([self.vectors[self.word_id[w]] for w in words]) 87 | return self.vectors 88 | 89 | def __most_frequent(self, n, start=0): 90 | return [x for x, y in sorted(self.word_id.iteritems(), key=lambda(x, y): y)[start:n]] 91 | 92 | def most_frequent(self, n, start=0): 93 | return Embeddings(lang=self.lang, words=self.words[start:n], 94 | vectors=self.vectors[start:n]) 95 | 96 | def least_frequent_n(self, n): 97 | return [x for x, y in sorted(self.word_id.iteritems(), 98 | key=lambda(x, y): y, reverse=True)[:n]] 99 | 100 | def words_translations(self, other, mapping, segment): 101 | start, end = segment 102 | s_words = self.__most_frequent(n=end, start=start) 103 | 104 | map_ = mapping.map 105 | t_words = [map_[w] for w in s_words] 106 | exact = [(w1, w2) for (w1, w2) in zip(s_words, t_words) if w1.lower() == w2.lower()] 107 | logging.info("{} exact words translations in between {}-{} for " 108 | "{}-{} languages.".format(len(exact), start, end, mapping.s_lang, mapping.t_lang)) 109 | 110 | s_new_vectors = self.vectors[start:end] 111 | t_new_vectors = asarray([other.vectors[other.word_id[w]] for w in t_words]) 112 | 113 | source = Embeddings(vectors=s_new_vectors, words=s_words, lang=self.lang) 114 | target = Embeddings(vectors=t_new_vectors, words=t_words, lang=other.lang) 115 | return (source, target) 116 | 117 | @staticmethod 118 | def sorted_words(word_id): 119 | return sorted(word_id.iteritems(), key=lambda(x, y): y) 120 | 121 | def get_common(self, other, mapping): 122 | """ Limit the two embeddings to the terms that are covered by the mapping.""" 123 | 124 | self_oov = defaultdict(lambda: 0) 125 | other_oov = defaultdict(lambda: 0) 126 | self_word_id = deepcopy(self.word_id) 127 | other_word_id = deepcopy(other.word_id) 128 | new_words = [] 129 | map_ = mapping.map 130 | for i, w in enumerate(self.word_id): 131 | if w not in map_: 132 | self_oov[w] += 1 133 | del self_word_id[w] 134 | continue 135 | 136 | if map_[w] not in other.word_id: 137 | other_oov[map_[w]] += 1 138 | del self_word_id[w] 139 | 140 | for i, w in enumerate(other.word_id): 141 | if w not in map_: 142 | del other_word_id[w] 143 | 144 | logging.info("We could not find {} {} words in our dictionary.".format( 145 | len(self_oov), self.lang)) 146 | logging.info("We could not find {} {} words in our target words.".format( 147 | len(other_oov), other.lang)) 148 | logging.info("Our {} vocabulary has {} valid words.".format( 149 | self.lang, len(self_word_id))) 150 | 151 | sorted_self_word_id = Embeddings.sorted_words(self_word_id) 152 | self_vectors = asarray([self.vectors[i] for w, i in sorted_self_word_id]) 153 | self_words = [w for w, i in sorted_self_word_id] 154 | new_self = Embeddings(lang=self.lang, vectors=self_vectors, 
words=self_words) 155 | 156 | sorted_other_word_id = Embeddings.sorted_words(other_word_id) 157 | other_vectors = asarray([other.vectors[i] for w, i in sorted_other_word_id]) 158 | other_words = [w for w, i in sorted_other_word_id] 159 | new_other = Embeddings(lang=self.lang, vectors=other_vectors, words=other_words) 160 | 161 | return (new_self, new_other) 162 | 163 | def split(self, mapping, ignore_exact=True): 164 | """ Generates two embeddings that cover the mapping terms. 165 | 166 | If we have a1: b1, a2: b2 mappings in an embeddings space where {a1, b1, 167 | a2, b2} exists, we would like to generates two embeddings spaces one for 168 | {a1, a2} and another for {b1, b2}. 169 | 170 | Sometimes it is not desirable to include exact terms a3:a3 in the new 171 | embeddings. Hence, you need to ignore the exact terms. 172 | """ 173 | 174 | source_oov = defaultdict(lambda: 0) 175 | target_oov = defaultdict(lambda: 0) 176 | w_exact = defaultdict(lambda: 0) 177 | 178 | source_words = [] 179 | target_words = [] 180 | map_ = mapping.map 181 | for w, id_ in self.word_id.iteritems(): 182 | if w not in map_: 183 | source_oov[w] += 1 184 | continue 185 | 186 | if map_[w] not in self.word_id: 187 | target_oov[map_[w]] += 1 188 | continue 189 | 190 | if w.lower() == map_[w].lower(): 191 | w_exact[w] += 1 192 | if ignore_exact: 193 | continue 194 | 195 | source_words.append(w) 196 | target_words.append(map_[w]) 197 | 198 | logging.debug("We could not find {} source words in our dictionary.".format( 199 | len(source_oov))) 200 | logging.debug("We could not find {} target words in our target words.".format( 201 | len(target_oov))) 202 | logging.debug("{} words are exact between languages".format(len(w_exact))) 203 | logging.debug("We found {} pairs of words valid for testing.".format(len(source_words))) 204 | 205 | new_s_vectors = asarray([self.vectors[self.word_id[w]] for w in source_words]) 206 | source = Embeddings(vectors=new_s_vectors, words=source_words, 207 | lang=mapping.s_lang) 208 | 209 | new_t_vectors = asarray([self.vectors[self.word_id[w]] for w in target_words]) 210 | target = Embeddings(vectors=new_t_vectors, words=target_words, 211 | lang=mapping.t_lang) 212 | new_mapping = Mapping(source=mapping.s_lang, target=mapping.t_lang) 213 | new_mapping.map = dict(zip(source.words, target.words)) 214 | return (source, target, new_mapping) 215 | 216 | def common(self, other): 217 | """ Find common terms between languages. 218 | 219 | The post condition is that both embeddings vocabulary are in the same 220 | order. 
221 | """ 222 | 223 | common_words = [] 224 | for word in self.word_id: 225 | if word in other.word_id: 226 | common_words.append(word) 227 | 228 | new_self_vectors = [] 229 | new_other_vectors = [] 230 | for word in common_words: 231 | new_self_vectors.append(self.vectors[self.word_id[word]]) 232 | new_other_vectors.append(other.vectors[other.word_id[word]]) 233 | 234 | new_self = Embeddings(vectors=asarray(new_self_vectors), words=common_words, 235 | lang=self.lang) 236 | 237 | new_other = Embeddings(vectors=asarray(new_other_vectors), words=common_words, 238 | lang=self.lang) 239 | 240 | return (new_self, new_other) 241 | 242 | 243 | class Word2VecEmbeddings(Embeddings): 244 | 245 | """ Word2Vec embeddings reader.""" 246 | 247 | def read_file(self, limit=-1): 248 | words = [] 249 | embeddings = [] 250 | with open(self.filename, 'rb') as f: 251 | words_number, size = [int(x) for x in f.readline().strip().split()][:2] 252 | for i, line in enumerate(f): 253 | try: 254 | ws = line.decode('utf-8').strip().split() 255 | words.append(' '.join(ws[:-size])) 256 | embeddings.append([float(x) for x in ws[-size:]]) 257 | if i == limit: 258 | break 259 | except Exception, e: 260 | print "Exception", i 261 | print "Exception", line 262 | self.word_id = {w: i for i, w in enumerate(words)} 263 | self.vectors = asarray(embeddings) 264 | assert len(self.word_id) == self.vectors.shape[0] 265 | 266 | 267 | class Evaluator(object): 268 | 269 | """ Evaluator of the alignment between two languages.""" 270 | 271 | def __init__(self, source_embeddings, target_embeddings, metric='l2', k=5): 272 | self.metric = metric 273 | self.source_embeddings = source_embeddings 274 | self.target_embeddings = target_embeddings 275 | self.k = k 276 | self.row_normalize = True 277 | self.col_normalize = False 278 | 279 | @staticmethod 280 | def cosine_knn(vectors, point, k): 281 | distances = numpy.dot(vectors, point) 282 | indices = list(reversed(distances.argsort()))[:k] 283 | return distances[indices], [indices] 284 | 285 | def norm(self, vectors): 286 | out = vectors 287 | if self.row_normalize: 288 | norms = (vectors ** 2).sum(axis=1) ** 0.5 289 | out = (vectors.T / norms).T 290 | 291 | if self.col_normalize: 292 | norms = (vectors ** 2).sum(axis=0) ** 0.5 293 | norms[norms == 0] = 1 294 | out = vectors / norms 295 | return out 296 | 297 | def precision_at_k(self, test_pairs): 298 | if self.metric == 'cosine': 299 | return self.precision_at_k_cosine(test_pairs) 300 | return self.precision_at_k_l2(test_pairs) 301 | 302 | def precision_at_k_l2(self, test_pairs): 303 | t_knn = NearestNeighbors(n_neighbors=self.k, algorithm='ball_tree', p=2) 304 | t_knn.fit(self.target_embeddings.vectors) 305 | 306 | right = 0 307 | index = 0 308 | for s, t in test_pairs: 309 | assert(s == t) 310 | point = self.source_embeddings.vectors[self.source_embeddings.word_id[s]] 311 | distances, indices = t_knn.kneighbors(point) 312 | 313 | t_words = [self.target_embeddings.id_word[i] for i in indices[0]] 314 | t = t.rsplit('_', 1)[0] 315 | t_words = [x.rsplit('_', 1)[0] for x in t_words] 316 | 317 | line = u"{: <20}{:<20}{:<50}".format(s, t, u' '.join(t_words)) 318 | logging.debug(line.encode('utf-8')) 319 | if t in t_words: 320 | right += 1 321 | index = index + 1 322 | return right / float(len(test_pairs)) 323 | 324 | def precision_at_k_cosine(self, test_pairs): 325 | s_vectors = self.norm(self.source_embeddings.vectors) 326 | t_vectors = self.norm(self.target_embeddings.vectors) 327 | 328 | right = 0 329 | for s, t in test_pairs: 330 | point = 
self.source_embeddings.vectors[self.source_embeddings.word_id[s]] 331 | distances, indices = Evaluator.cosine_knn(t_vectors, point, self.k) 332 | 333 | t_words = [self.target_embeddings.id_word[i] for i in indices[0]] 334 | 335 | t = t.rsplit('_', 1)[0] 336 | t_words = [x.rsplit('_', 1)[0] for x in t_words] 337 | 338 | line = u"{: <20}{:<20}{:<50}".format(s, t, u' '.join(t_words)) 339 | logging.debug(line.encode('utf-8')) 340 | if t in t_words: 341 | right += 1 342 | return right / float(len(test_pairs)) 343 | 344 | def evaluate(self, mapping, operation, training_segment, test_segment): 345 | 346 | (s_train, t_train) = self.source_embeddings.words_translations(self.target_embeddings, mapping, training_segment) 347 | (s_test, t_test) = self.source_embeddings.words_translations(self.target_embeddings, mapping, test_segment) 348 | 349 | s_train.vectors = self.norm(s_train.vectors) 350 | t_train.vectors = self.norm(t_train.vectors) 351 | s_test.vectors = self.norm(s_test.vectors) 352 | t_test.vectors = self.norm(t_test.vectors) 353 | 354 | if set(s_train.words).intersection(set(s_test.words)): 355 | print (u"Train and test words are overlapping") 356 | 357 | s_new, t_new = operation((s_train, t_train), (s_test, t_test)) 358 | 359 | return None 360 | 361 | 362 | def linear_regression(train_embeddings, test_embeddings): 363 | global reg_model 364 | s_embeddings, t_embeddings = train_embeddings 365 | s_test, t_test = test_embeddings 366 | 367 | reg = LinearRegression() 368 | reg.fit(s_embeddings.vectors, t_embeddings.vectors) 369 | pickle.dump(reg, open(reg_model, 'wb')) 370 | s = Embeddings(vectors=reg.predict(s_test.vectors), 371 | words=s_test.words, lang=s_embeddings.lang) 372 | return s, t_test 373 | 374 | 375 | def local_linear_regression(train_embeddings, test_embeddings): 376 | global reg_model 377 | print "Using local linear regression with k = ", K_NN 378 | s_embeddings, t_embeddings = train_embeddings 379 | s_test, t_test = test_embeddings 380 | reg = LocalLinearRegression(k_nn=K_NN) 381 | reg.fit(s_embeddings.vectors, t_embeddings.vectors) 382 | pickle.dump(reg, open(reg_model, 'wb')) 383 | return None, None 384 | 385 | 386 | def identity(train_vectors, all_vectors): 387 | return all_vectors 388 | 389 | 390 | def evaluate_word2vec(sl, tl, source_file, target_file, method): 391 | print "Proceeding to load embeddings" 392 | s_ = Word2VecEmbeddings(lang=sl, filename=source_file) 393 | t_ = Word2VecEmbeddings(lang=tl, filename=target_file) 394 | print "Loaded word embeddings" 395 | mapping = IdentityTranslations(source=sl, target=tl, se=s_, te=t_) 396 | print "Mapping done" 397 | s, t = s_.get_common(t_, mapping) 398 | print "Common vocab done" 399 | evaluator = Evaluator(source_embeddings=s, target_embeddings=t, metric='l2') 400 | print "Evaluator constructed" 401 | assert(s.vectors.shape == t.vectors.shape) 402 | print "Evaluating" 403 | if method == 'linear': 404 | p1 = evaluator.evaluate(mapping, linear_regression, (0, s.vectors.shape[0]), (0, s.vectors.shape[0])) 405 | elif method == 'locallinear': 406 | p1 = evaluator.evaluate(mapping, local_linear_regression, (0, s.vectors.shape[0]), (0, s.vectors.shape[0])) 407 | 408 | 409 | def main(args): 410 | global reg_model 411 | global K_NN 412 | reg_model = args.filename 413 | if args.method == 'linear': 414 | evaluate_word2vec('old', 'new', args.old_model, args.new_model, 'linear') 415 | elif args.method == 'locallinear': 416 | K_NN = int(args.knn_val) 417 | evaluate_word2vec('old', 'new', args.old_model, args.new_model, 'locallinear') 
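# Example invocation (hypothetical file names): learn a locally linear map from an earlier epoch's
# embeddings to a later one and pickle the fitted regressor to the path given with -f:
#
#   python learn_map.py -o 1900_embeddings.w2v -n 2000_embeddings.w2v \
#       -f predictor_1900.pkl -m locallinear -k 1000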
418 | 419 | if __name__ == "__main__": 420 | parser = ArgumentParser() 421 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 422 | parser.add_argument("-o", "--old_model", dest="old_model", help="old model") 423 | parser.add_argument("-n", "--new_model", dest="new_model", help="new model") 424 | parser.add_argument("-k", "--knn", dest="knn_val", default=1000, type=int, help="K in KNN for local linear regression") 425 | parser.add_argument("-m", "--method", dest="method", help="method") 426 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 427 | default="INFO") 428 | args = parser.parse_args() 429 | if args.log == 'DEBUG': 430 | pass  # NOTE: no 'debug' post-mortem hook is defined in this module 431 | numeric_level = getattr(logging, args.log.upper(), None) 432 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 433 | main(args) 434 | --------------------------------------------------------------------------------
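A note on inspecting the change point output: both detect_changepoints_word_ts.py and demostrate_cp.py write a CSV whose rows carry, per word, the minimum bootstrap p-value and its time point ('min_pval', 'cp') together with the threshold-filtered variants ('tpval', 'tcp'), already sorted by 'tpval'. The sketch below shows one way that file might be examined; the file name is hypothetical and only pandas is assumed.

import pandas as pd

# Hypothetical path passed to the script via -p/--pfile.
pvals = pd.read_csv("pvals.csv")

# Rows are already sorted by the threshold-filtered p-value ('tpval');
# show the ten words with the strongest change point evidence.
top = pvals[["word", "tcp", "tpval", "cp", "min_pval"]].head(10)
print(top.to_string(index=False))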