├── docs ├── authors.rst ├── history.rst ├── readme.rst ├── contributing.rst ├── usage.rst ├── installation.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── tests ├── __init__.py └── test_langchangetrack.py ├── langchangetrack ├── langchangetrack.py ├── examples │ ├── pos │ │ └── pos_tag_dist_example.sh │ ├── freq │ │ └── freq_count_example.sh │ ├── data │ │ ├── temporal_corpus │ │ │ └── create_temporal_corpora.sh │ │ ├── test_pval.csv │ │ ├── test_sample.csv │ │ └── normalized_timeseries_sample.csv │ └── distributional │ │ └── findNearest.py ├── images │ └── gay_invisible.png ├── __init__.py ├── utils │ ├── __init__.py │ ├── scripts │ │ ├── calculate_freq_counts.sh │ │ ├── calculate_pos_dist.sh │ │ ├── train_models.sh │ │ ├── freq_count.py │ │ ├── common_vocab.py │ │ └── pos_tag.py │ ├── dummy_regressor.py │ ├── LocalLinearRegression.py │ └── entropy.py ├── corpusreaders │ ├── __init__.py │ └── plainngramscorpus.py ├── tsconstruction │ ├── __init__.py │ ├── distributional │ │ ├── __init__.py │ │ ├── corpustoembeddings.py │ │ └── scripts │ │ │ ├── train_embeddings_ngrams.py │ │ │ ├── embedding_displacements.py │ │ │ └── learn_map.py │ ├── freq │ │ └── scripts │ │ │ └── create_freq_timeseries.py │ ├── syntactic │ │ └── scripts │ │ │ └── pos_displacements.py │ ├── dump_timeseries.py │ └── displacements.py ├── scripts │ ├── detect_cp_freq.sh │ ├── detect_cp_pos.sh │ ├── detect_cp_distributional.sh │ ├── freq_pipeline.py │ ├── pos_pipeline.py │ └── ngrams_pipeline.py └── cpdetection │ ├── detect_changepoints_word_ts_r.py │ ├── demostrate_cp.py │ └── detect_changepoints_word_ts.py ├── setup.cfg ├── HISTORY.rst ├── AUTHORS.rst ├── tox.ini ├── MANIFEST.in ├── requirements.txt ├── .travis.yml ├── .gitignore ├── LICENSE ├── Makefile ├── setup.py ├── CONTRIBUTING.rst └── README.rst /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /langchangetrack/langchangetrack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | description-file = README.rst 4 | -------------------------------------------------------------------------------- /langchangetrack/examples/pos/pos_tag_dist_example.sh: -------------------------------------------------------------------------------- 1 | pos_tag.py -f ./gutenberg.txt -o gutenberg.posdist 2 | -------------------------------------------------------------------------------- /langchangetrack/examples/freq/freq_count_example.sh: -------------------------------------------------------------------------------- 1 | freq_count.py -f ../data/sample_corpora/gutenberg.txt > gutenberg.freq 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use langchangetrack in a project:: 6 | 7 | import langchangetrack 8 | -------------------------------------------------------------------------------- /langchangetrack/images/gay_invisible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/viveksck/langchangetrack/HEAD/langchangetrack/images/gay_invisible.png -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 0.1.0 (2015-02-20) 7 | --------------------- 8 | 9 | * First release on PyPI. 
10 | -------------------------------------------------------------------------------- /langchangetrack/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/corpusreaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Vivek Kulkarni' 4 | __email__ = 'viveksck@gmail.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Vivek Kulkarni 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 
14 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/temporal_corpus/create_temporal_corpora.sh: -------------------------------------------------------------------------------- 1 | SAMPLESIZE=100000 2 | ls /scratch2/vvkulkarni/new_semantic/ngrams_expanded/eng-fiction/19*[0,5].ngrams | parallel -j16 --progress shuf -n $SAMPLESIZE {} -o {/} 3 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/test_pval.csv: -------------------------------------------------------------------------------- 1 | ,word,min_pval,cp,tpval,tcp 2 | 3,gay,0.0,1975,0.0,1980 3 | 2,bitch,0.0,1950,0.0001,1955 4 | 1,sex,0.0,1955,0.0007,1965 5 | 4,recording,0.0284,1990,0.0284,1990 6 | 0,tree,0.7833,1910,1.0, 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/langchangetrack 7 | commands = python setup.py test 8 | deps = 9 | -r{toxinidir}/requirements.txt 10 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | At the command line:: 6 | 7 | $ easy_install langchangetrack 8 | 9 | Or, if you have virtualenvwrapper installed:: 10 | 11 | $ mkvirtualenv langchangetrack 12 | $ pip install langchangetrack 13 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/calculate_freq_counts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=$3 5 | WORKERS=$4 6 | mkdir -p $WORKING_DIR 7 | mkdir -p $WORKING_DIR/counts/ 8 | ls $CORPUS_DIR/*.$EXT | parallel -j${WORKERS} "freq_count.py -f {} > $WORKING_DIR/counts/{/.}.freq" 9 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/calculate_pos_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=$3 5 | WORKERS=$4 6 | mkdir -p $WORKING_DIR 7 | mkdir -p $WORKING_DIR/posdist/ 8 | ls $CORPUS_DIR/*.$EXT | parallel -j${WORKERS} "pos_tag.py -f {} -o $WORKING_DIR/posdist/{/.}.posdist" 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel==0.23.0 2 | argparse>=1.2.1 3 | numpy>=0.9.1 4 | scipy>=0.15.1 5 | more_itertools>=2.2 6 | joblib>=0.8.3-r1 7 | gensim==0.10.3 8 | six>=1.7.0 9 | statsmodels>=0.5.0 10 | changepoint>=0.1.0 11 | nltk>=3.0.0 12 | textblob>=0.9.0 13 | textblob-aptagger>=0.2.0 14 | psutil>=2.1.1 15 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "2.7" 7 | - "2.6" 8 | - "pypy" 9 | 10 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 11 | install: pip install -r requirements.txt 12 | 13 | # command to run tests, e.g. python setup.py test 14 | script: python setup.py test 15 | -------------------------------------------------------------------------------- /langchangetrack/corpusreaders/plainngramscorpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import gensim 7 | 8 | 9 | class PlainNGRAMSCorpus(object): 10 | 11 | """Iterate over sentences(ngram) of plain ngram file""" 12 | 13 | def __init__(self, filename): 14 | self.filename = filename 15 | 16 | def __iter__(self): 17 | text = open(self.filename) 18 | for sentence in text: 19 | yield gensim.utils.simple_preprocess(sentence, deacc=True) 20 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/train_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CORPUS_DIR=$1 3 | WORKING_DIR=$2 4 | EXT=${3} 5 | WINDOW=${4} 6 | EPOCHS=${5} 7 | WORKERS=${6} 8 | EMBEDDINGS_TYPE=skipgram 9 | arr=("$CORPUS_DIR/*.$EXT") 10 | echo "Processing files", $arr 11 | echo "Training embeddings" 12 | mkdir -p $WORKING_DIR/models 13 | echo "Models will be stored in", $WORKING_DIR/models 14 | parallel -vv -j ${WORKERS} --progress train_embeddings_ngrams.py -f {} -o $WORKING_DIR/models -p {/.} -e $EMBEDDINGS_TYPE -workers ${WORKERS} --epochs ${EPOCHS} -w ${WINDOW} ::: $arr 15 | -------------------------------------------------------------------------------- /tests/test_langchangetrack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_langchangetrack 6 | ---------------------------------- 7 | 8 | Tests for `langchangetrack` module. 
9 | """ 10 | 11 | import unittest 12 | 13 | from langchangetrack import langchangetrack 14 | 15 | 16 | class TestLangchangetrack(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build 44 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/test_sample.csv: -------------------------------------------------------------------------------- 1 | ,word,1905,1910,1915,1920,1925,1930,1935,1940,1945,1950,1955,1960,1965,1970,1975,1980,1985,1990,1995,2000 2 | 0,tree,8089,7833,9362,9431,9929,9890,9763,9700,9989,9994,10000,10000,9999,9997,9994,9998,9951,9951,9848,9966 3 | 1,sex,4518,1986,2271,2110,661,269,95,52,9,6,0,0,7,19,11,3,10,63,471,1382 4 | 2,bitch,1272,3324,2695,484,37,23,6,1,3,0,1,7,2,33,33,576,582,2166,3117,4494 5 | 3,gay,975,2312,968,1479,895,535,460,559,517,206,139,7,10,1,0,0,3,10,127,1835 6 | 4,recording,9998,9076,7201,7725,4498,3293,1808,2149,2661,2259,2433,478,670,618,586,549,559,284,376,1375 7 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. langchangetrack documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to langchangetrack's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | usage 17 | contributing 18 | authors 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_freq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | FILTER_VOCAB_FILE=$7 9 | BOOTSTRAP=${8} 10 | THRESHOLD=${9} 11 | WORKERS=${10} 12 | 13 | mkdir -p $WORKING_DIR 14 | mkdir -p $OUTPUT_DIR 15 | mkdir -p $WORKING_DIR/timeseries 16 | 17 | create_freq_timeseries.py -d $INPUT_DIR -s $STARTTIMEPOINT -e $ENDTIMEPOINT -p $STEP -f $WORKING_DIR/timeseries/freq_timeseries.csv --log10 18 | 19 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/freq_timeseries.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -d -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 20 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_pos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | FILTER_VOCAB_FILE=$7 9 | BOOTSTRAP=${8} 10 | THRESHOLD=${9} 11 | WORKERS=${10} 12 | 13 | mkdir -p $WORKING_DIR 14 | mkdir -p $OUTPUT_DIR 15 | 16 | mkdir -p $WORKING_DIR/displacements/ 17 | pos_displacements.py -f $FILTER_VOCAB_FILE -d $INPUT_DIR/ -p "" -os pos -es ".posdist" -ps "" -sy $STARTTIMEPOINT -ey $ENDTIMEPOINT -s $STEP -e "pos" -o $WORKING_DIR/displacements -workers ${WORKERS} 18 | 19 | mkdir -p $WORKING_DIR/timeseries/ 20 | dump_timeseries.py -f $WORKING_DIR/displacements/timeseries_s_t_pos.pkl -s $WORKING_DIR/timeseries/source.csv -e $WORKING_DIR/timeseries/dest.csv -m $STARTTIMEPOINT -n $ENDTIMEPOINT -st $STEP -me "polar" -metric "jsd" -workers ${WORKERS} 21 | 22 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/source.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Vivek Kulkarni 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | * Neither the name of langchangetrack nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /langchangetrack/examples/distributional/findNearest.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import sys 4 | import math 5 | import operator 6 | 7 | import numpy as np 8 | from numpy import linalg as LA 9 | import gensim 10 | 11 | __author__ = "Vivek Kulkarni" 12 | __email__ = "viveksck@gmail.com" 13 | 14 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 15 | 16 | def main(args): 17 | return process(args.filename) 18 | 19 | def process(filename): 20 | m = gensim.models.Word2Vec.load_word2vec_format(filename) 21 | print "query (ctrl-c to quit): ", 22 | line = sys.stdin.readline() 23 | while line: 24 | word = line.rstrip() 25 | print word 26 | tuples = m.most_similar(word, topn=10) 27 | for w, s in tuples: 28 | print w, s 29 | print "----------------------------------" 30 | print "query (ctrl-c to quit): ", 31 | line = sys.stdin.readline() 32 | 33 | if __name__ == "__main__": 34 | parser = ArgumentParser() 35 | parser.add_argument("--embeddings-file", dest="filename", help="embeddings file") 36 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 37 | default="INFO") 38 | args = parser.parse_args() 39 | if args.log == 'DEBUG': 40 | sys.excepthook = debug 41 | numeric_level = getattr(logging, args.log.upper(), None) 42 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 43 | main(args) 44 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "coverage - check code coverage quickly with the default Python" 12 | @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | 16 | clean: clean-build clean-pyc clean-test 17 | 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr *.egg-info 22 | 23 | clean-pyc: 24 | find . -name '*.pyc' -exec rm -f {} + 25 | find . -name '*.pyo' -exec rm -f {} + 26 | find . -name '*~' -exec rm -f {} + 27 | find . 
-name '__pycache__' -exec rm -fr {} + 28 | 29 | clean-test: 30 | rm -fr .tox/ 31 | rm -f .coverage 32 | rm -fr htmlcov/ 33 | 34 | lint: 35 | flake8 langchangetrack tests 36 | 37 | test: 38 | python setup.py test 39 | 40 | test-all: 41 | tox 42 | 43 | coverage: 44 | coverage run --source langchangetrack setup.py test 45 | coverage report -m 46 | coverage html 47 | open htmlcov/index.html 48 | 49 | docs: 50 | rm -f docs/langchangetrack.rst 51 | rm -f docs/modules.rst 52 | sphinx-apidoc -o docs/ langchangetrack 53 | $(MAKE) -C docs clean 54 | $(MAKE) -C docs html 55 | open docs/_build/html/index.html 56 | 57 | release: clean 58 | python setup.py sdist upload 59 | python setup.py bdist_wheel upload 60 | 61 | dist: clean 62 | python setup.py sdist 63 | python setup.py bdist_wheel 64 | ls -l dist 65 | -------------------------------------------------------------------------------- /langchangetrack/scripts/detect_cp_distributional.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=$1 3 | WORKING_DIR=$2 4 | OUTPUT_DIR=$3 5 | STARTTIMEPOINT=$4 6 | ENDTIMEPOINT=$5 7 | STEP=$6 8 | MODEL_FAMILY=$7 9 | KNN=$8 10 | FILTER_VOCAB_FILE=${9} 11 | BOOTSTRAP=${10} 12 | THRESHOLD=${11} 13 | WORKERS=${12} 14 | 15 | EMBEDDINGS_TYPE=skipgram 16 | echo "Output directory is", $OUTPUT_DIR 17 | 18 | mkdir -p $WORKING_DIR 19 | mkdir -p $OUTPUT_DIR 20 | 21 | echo "Mapping to joint space" 22 | mkdir -p $WORKING_DIR/predictors 23 | echo "Predictors will be stored in", $WORKING_DIR/predictors 24 | arr=("$INPUT_DIR/*.model") 25 | ((FINALTIMEPOINT=$ENDTIMEPOINT-$STEP)) 26 | parallel -j${WORKERS} learn_map.py -k ${KNN} -f $WORKING_DIR/predictors/{/.}.predictor -o {} -n {//}/${FINALTIMEPOINT}_*.model -m $MODEL_FAMILY ::: $arr 27 | 28 | WORDS_FILE=${FILTER_VOCAB_FILE} 29 | 30 | echo "Computing displacements" 31 | mkdir -p $WORKING_DIR/displacements/ 32 | export MKL_NUM_THREADS=1 33 | export NUMEXPR_NUM_THREADS=1 34 | export OMP_NUM_THREADS=1 35 | export MKL_DYNAMIC=FALSE 36 | embedding_displacements.py -f $WORDS_FILE -d $INPUT_DIR/ -p $WORKING_DIR/predictors/ -os words -es ".model" -ps ".predictor" -sy $STARTTIMEPOINT -ey $ENDTIMEPOINT -s $STEP -e $EMBEDDINGS_TYPE -o $WORKING_DIR/displacements/ -workers ${WORKERS} 37 | 38 | echo "Creating time series" 39 | mkdir -p $WORKING_DIR/timeseries/ 40 | dump_timeseries.py -f $WORKING_DIR/displacements/timeseries_s_t_words.pkl -s $WORKING_DIR/timeseries/source.csv -e $WORKING_DIR/timeseries/dest.csv -m $STARTTIMEPOINT -n $ENDTIMEPOINT -st $STEP -me "polar" -metric "cosine" -workers ${WORKERS} 41 | 42 | detect_changepoints_word_ts.py -f $WORKING_DIR/timeseries/source.csv -v $FILTER_VOCAB_FILE -p $OUTPUT_DIR/pvals.csv -n $OUTPUT_DIR/samples.csv -c $STARTTIMEPOINT -w ${WORKERS} -b ${BOOTSTRAP} -t ${THRESHOLD} 43 | -------------------------------------------------------------------------------- /langchangetrack/utils/dummy_regressor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """dummy_regressor.py: Regressor that is the identity""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import pickle 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | class 
DummyRegressor(object): 22 | 23 | def predict(self, X): 24 | return X 25 | 26 | 27 | def main(args): 28 | d = DummyRegressor() 29 | pickle.dump(d, open('dummy_regressor.pkl', 'wb')) 30 | 31 | 32 | def debug(type_, value, tb): 33 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 34 | # we are in interactive mode or we don't have a tty-like 35 | # device, so we call the default hook 36 | sys.__excepthook__(type_, value, tb) 37 | else: 38 | import traceback 39 | import pdb 40 | # we are NOT in interactive mode, print the exception... 41 | traceback.print_exception(type_, value, tb) 42 | print("\n") 43 | # ...then start the debugger in post-mortem mode. 44 | pdb.pm() 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 49 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 50 | default="INFO") 51 | args = parser.parse_args() 52 | if args.log == 'DEBUG': 53 | sys.excepthook = debug 54 | numeric_level = getattr(logging, args.log.upper(), None) 55 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 56 | main(args) 57 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/freq_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """freq_count.py: Dumps the frequency distribution of a corpus in desc order""" 5 | 6 | 7 | from argparse import ArgumentParser 8 | import logging 9 | import sys 10 | from io import open 11 | from os import path 12 | from time import time 13 | from glob import glob 14 | import nltk 15 | 16 | __author__ = "Vivek Kulkarni" 17 | __email__ = "viveksck@gmail.com" 18 | 19 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 20 | 21 | 22 | def main(args): 23 | encoding = sys.stdout.encoding or 'utf-8' 24 | f = open(args.filename) 25 | fd = nltk.FreqDist() 26 | for line in f: 27 | for sent in nltk.sent_tokenize(line): 28 | for word in nltk.word_tokenize(sent): 29 | fd[word] += 1 30 | 31 | for w, count in fd.most_common(): 32 | tup = u"{} {}".format(w, count) 33 | print tup.encode(encoding) 34 | 35 | 36 | def debug(type_, value, tb): 37 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 38 | # we are in interactive mode or we don't have a tty-like 39 | # device, so we call the default hook 40 | sys.__excepthook__(type_, value, tb) 41 | else: 42 | import traceback 43 | import pdb 44 | # we are NOT in interactive mode, print the exception... 45 | traceback.print_exception(type_, value, tb) 46 | print("\n") 47 | # ...then start the debugger in post-mortem mode. 
48 | pdb.pm() 49 | 50 | if __name__ == "__main__": 51 | parser = ArgumentParser() 52 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 53 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 54 | default="INFO") 55 | args = parser.parse_args() 56 | if args.log == 'DEBUG': 57 | sys.excepthook = debug 58 | numeric_level = getattr(logging, args.log.upper(), None) 59 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 60 | main(args) 61 | -------------------------------------------------------------------------------- /langchangetrack/examples/data/normalized_timeseries_sample.csv: -------------------------------------------------------------------------------- 1 | Unnamed: 0,word,1900,1905,1910,1915,1920,1925,1930,1935,1940,1945,1950,1955,1960,1965,1970,1975,1980,1985,1990,1995,2000,2005 2 | 830,tree,0.15327714247103288,-0.19885247327831865,-0.3683410408490255,0.09558896968329192,-0.3490389607298094,0.2565163581969198,-0.527302927985289,-0.7428741306743359,-0.574162051879182,0.34184472338303645,-0.3714711433188301,-0.1406226862557359,-0.5567142692737548,-0.8473020213018153,-0.7812379046311363,-0.7237730159955219,-0.6133051538066998,-1.0328756603907936,-0.7434065484530583,-0.9341274372494442,-0.8209303655079251,-1.5439039179382728 3 | 1232,sex,0.15327714247103288,0.8422974209765391,0.4670831714915557,1.2004514277740952,1.089117575545353,-0.042869473512371865,0.4905971357936949,0.4420351237685775,0.6754477335596171,0.3616949047487767,0.7148357324968245,0.4483215569154581,0.6638101589107545,2.8213000472467233,1.6867586375636152,1.2839351704861663,1.026826051998202,1.9734392345129588,2.3191684380330164,2.5913956386522274,2.266890620494351,2.5228906797208204 4 | 1280,bitch,0.15327714247103288,1.142182904418651,2.629857299421962,1.5050173023356772,-0.19664099967373186,-0.1816506923943963,1.3944691232705158,1.3129137681032181,1.698648850193928,2.3741872415908647,1.5339619507724442,2.4752087951820902,2.6833522170360724,1.9358190746841688,3.133418509779865,2.4097395640333708,4.423179526228856,2.375519385842317,3.8408415271690113,2.6895338913639515,2.6321814385912887,2.407180122647251 5 | 2008,gay,0.15327714247103288,0.4420047305142134,1.6090753807905729,0.6967969687421449,1.6586549543356486,0.9196378440898209,0.9731171389763618,1.28614418926982,1.5990551088842633,1.4001333223409744,0.7467762811647296,1.1979101086635646,0.3335700353267224,1.5371247773359915,0.7701374463207338,0.7797479007969157,2.4813433922784727,1.946577671429413,2.7217396855527105,3.0896267594069604,3.5749294003565684,2.5136765443816858 6 | 3281,recording,0.15327714247103288,4.177000925947883,1.9405949335453256,1.5249572014401584,2.698653478348632,0.9964745894159096,1.7323864289598985,1.4656472143368606,2.4480439305566315,2.6008059328492257,2.040880904439721,2.3918680858825256,0.5625011152828968,2.6666885636321247,2.3098860150844365,2.3988788624164106,2.389346840226949,2.4943894017849337,2.2083815531437043,2.73865670378146,3.610134796433214,3.0174046962457997 7 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/common_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """common_vocab.py: Dumps the common vocabulary between a set of text files.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob 
import glob 13 | import nltk 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | encoding = sys.stdout.encoding or 'utf-8' 23 | common_vocab = None 24 | list_of_files = glob(args.filepattern) 25 | for fname in list_of_files: 26 | file_vocab = set() 27 | f = open(fname) 28 | for line in f: 29 | for sent in nltk.sent_tokenize(line): 30 | for word in nltk.word_tokenize(sent): 31 | file_vocab.add(word) 32 | if common_vocab == None: 33 | common_vocab = file_vocab 34 | else: 35 | common_vocab = common_vocab & file_vocab 36 | f.close() 37 | 38 | for w in common_vocab: 39 | print w.encode(encoding) 40 | 41 | 42 | def debug(type_, value, tb): 43 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 44 | # we are in interactive mode or we don't have a tty-like 45 | # device, so we call the default hook 46 | sys.__excepthook__(type_, value, tb) 47 | else: 48 | import traceback 49 | import pdb 50 | # we are NOT in interactive mode, print the exception... 51 | traceback.print_exception(type_, value, tb) 52 | print("\n") 53 | # ...then start the debugger in post-mortem mode. 54 | pdb.pm() 55 | 56 | if __name__ == "__main__": 57 | parser = ArgumentParser() 58 | parser.add_argument("-f", "--filepattern", 59 | dest="filepattern", help="Input file pattern") 60 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 61 | default="INFO") 62 | args = parser.parse_args() 63 | if args.log == 'DEBUG': 64 | sys.excepthook = debug 65 | numeric_level = getattr(logging, args.log.upper(), None) 66 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 67 | main(args) 68 | -------------------------------------------------------------------------------- /langchangetrack/scripts/freq_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """template.py: Description of what the module does.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "calculate_freq_counts.sh {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_freq.sh {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'counts') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, 28 | args.end, args.step, args.vocab_file, args.bootstrap, args.threshold, args.workers) 29 | subprocess.check_call(cmd, shell=True) 30 | 31 | if __name__ == "__main__": 32 | parser = ArgumentParser() 33 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 34 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 35 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 36 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 37 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 38 | parser.add_argument("--end-time-point", dest="end", help="End time 
point") 39 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 40 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 41 | parser.add_argument("--threshold", dest="threshold", default=0.0, type=float, help="Threshold for mean shift model for change point detection") 42 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 43 | parser.add_argument("--workers", dest="workers", default=1, type=int, help="Maximum number of workers") 44 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 45 | default="INFO") 46 | args = parser.parse_args() 47 | if args.log == 'DEBUG': 48 | sys.excepthook = debug 49 | numeric_level = getattr(logging, args.log.upper(), None) 50 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 51 | main(args) 52 | -------------------------------------------------------------------------------- /langchangetrack/scripts/pos_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Pipeline to detect language change using part of speech.""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "calculate_pos_dist.sh {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_pos.sh {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'posdist') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, 28 | args.end, args.step, args.vocab_file, args.bootstrap, args.threshold, args.workers) 29 | subprocess.check_call(cmd, shell=True) 30 | 31 | if __name__ == "__main__": 32 | parser = ArgumentParser() 33 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 34 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 35 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 36 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 37 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 38 | parser.add_argument("--end-time-point", dest="end", help="End time point") 39 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 40 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 41 | parser.add_argument("--threshold", dest="threshold", default=1.75, type=float, help="Threshold for mean shift model for change point detection") 42 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 43 | parser.add_argument("--workers", dest="workers", default=1, type=int, help="Maximum number of workers") 44 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 45 | default="INFO") 46 | args = parser.parse_args() 47 | if args.log == 'DEBUG': 48 | sys.excepthook = debug 49 | numeric_level = getattr(logging, args.log.upper(), 
None) 50 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 51 | main(args) 52 | -------------------------------------------------------------------------------- /langchangetrack/utils/scripts/pos_tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import sys 7 | from io import open 8 | from os import path 9 | from time import time 10 | from glob import glob 11 | 12 | from textblob import Blobber 13 | from textblob_aptagger import PerceptronTagger 14 | 15 | from collections import Counter, defaultdict 16 | import numpy as np 17 | import pandas as pd 18 | 19 | __author__ = "Vivek Kulkarni" 20 | __email__ = "viveksck@gmail.com" 21 | 22 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 23 | 24 | 25 | def main(args): 26 | f = open(args.filename) 27 | D = {} 28 | tag_set = set([]) 29 | tb = Blobber(pos_tagger=PerceptronTagger()) 30 | for i, line in enumerate(f): 31 | b1 = tb(line) 32 | for w, t in b1.tags: 33 | tag_set.add(t) 34 | if w not in D: 35 | D[w] = Counter() 36 | D[w][t] = float(D[w][t] + 1) 37 | 38 | sorted_pos_tags = sorted(list(tag_set)) 39 | rows = [] 40 | for w in D.keys(): 41 | row = [w] 42 | pos_counts_word = np.array([float(D[w][t]) for t in sorted_pos_tags]) 43 | pos_dist_word = pos_counts_word / float(np.sum(pos_counts_word)) 44 | assert(np.isclose(np.sum(pos_dist_word), 1.0)) 45 | row = row + list(pos_dist_word) 46 | rows.append(row) 47 | 48 | header = ['word'] + sorted_pos_tags 49 | print "Set of POS tags in sorted order", header 50 | df = pd.DataFrame().from_records(rows, columns=header) 51 | print "Dumping the POS distribution." 52 | df.to_csv(args.outputfile, index=None, encoding='utf-8') 53 | 54 | 55 | def debug(type_, value, tb): 56 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 57 | # we are in interactive mode or we don't have a tty-like 58 | # device, so we call the default hook 59 | sys.__excepthook__(type_, value, tb) 60 | else: 61 | import traceback 62 | import pdb 63 | # we are NOT in interactive mode, print the exception... 64 | traceback.print_exception(type_, value, tb) 65 | print("\n") 66 | # ...then start the debugger in post-mortem mode. 
67 | pdb.pm() 68 | 69 | if __name__ == "__main__": 70 | parser = ArgumentParser() 71 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 72 | parser.add_argument("-o", "--outputfile", dest="outputfile", help="Output file") 73 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 74 | default="INFO") 75 | args = parser.parse_args() 76 | if args.log == 'DEBUG': 77 | sys.excepthook = debug 78 | numeric_level = getattr(logging, args.log.upper(), None) 79 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 80 | main(args) 81 | -------------------------------------------------------------------------------- /langchangetrack/scripts/ngrams_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Pipeline to train a corpus of ngrams""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import subprocess 14 | 15 | __author__ = "Vivek Kulkarni" 16 | __email__ = "viveksck@gmail.com" 17 | 18 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 19 | 20 | 21 | def main(args): 22 | train_cmd = "train_models.sh {} {} {} {} {} {}".format(args.corpus_dir, args.working_dir, args.ext, args.window, args.epochs, args.workers) 23 | subprocess.check_call(train_cmd, shell=True) 24 | 25 | cmd = "detect_cp_distributional.sh {} {} {} {} {} {} {} {} {} {} {} {}" 26 | input_dir = path.join(args.working_dir, 'models') 27 | cmd = cmd.format(input_dir, args.working_dir, args.output_dir, args.start, args.end, args.step, args.model_family, args.knn, args.vocab_file, args.bootstrap, args.threshold, args.workers) 28 | subprocess.check_call(cmd, shell=True) 29 | 30 | if __name__ == "__main__": 31 | parser = ArgumentParser() 32 | parser.add_argument("--corpus-dir", dest="corpus_dir", help="Corpus directory") 33 | parser.add_argument("--file-extension", dest="ext", help="Corpus file extension") 34 | parser.add_argument("--working-dir", dest="working_dir", help="Working directory") 35 | parser.add_argument("--output-dir", dest="output_dir", help="Output directory") 36 | parser.add_argument("--context-size", dest="window", default=5, type=int, help="Context size to use for training embeddings") 37 | parser.add_argument("--epochs", dest="epochs", default=3, type=int, help="Number of epochs to training embeddings") 38 | parser.add_argument("--start-time-point", dest="start", help="Start time point") 39 | parser.add_argument("--end-time-point", dest="end", help="End time point") 40 | parser.add_argument("--step-size", dest="step", help="Step size for timepoints") 41 | parser.add_argument("--model-family", dest="model_family", default="locallinear", help="Model family default (locallinear)") 42 | parser.add_argument("--number-nearest-neighbors", dest="knn", default=1000, 43 | type=int, help="Number of nearest neighbors to use for mapping to joint space (default:1000)") 44 | parser.add_argument("--vocabulary-file", dest="vocab_file", help="Common vocabulary file") 45 | parser.add_argument("--threshold", dest="threshold", default=1.75, type=float, help="Threshold for mean shift model for change point detection") 46 | parser.add_argument("--bootstrap-samples", dest="bootstrap", default=1000, type=int, help="Number of bootstrap samples to draw") 47 | parser.add_argument("--workers", dest="workers", default=1, type=int, 
help="Maximum number of workers") 48 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 49 | default="INFO") 50 | args = parser.parse_args() 51 | if args.log == 'DEBUG': 52 | sys.excepthook = debug 53 | numeric_level = getattr(logging, args.log.upper(), None) 54 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 55 | main(args) 56 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/freq/scripts/create_freq_timeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | import logging 6 | import sys 7 | from io import open 8 | from os import path 9 | from time import time 10 | from glob import glob 11 | import numpy as np 12 | import pandas as pd 13 | 14 | __author__ = "Vivek Kulkarni" 15 | __email__ = "viveksck@gmail.com" 16 | 17 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 18 | 19 | 20 | def main(args): 21 | # Read the input arguments. 22 | inputdir = args.inputdir 23 | start = args.start 24 | end = args.end 25 | step = args.step 26 | timepoints = np.arange(start, end, step) 27 | timepoints = [str(timepoint) for timepoint in timepoints] 28 | num = int(args.num) 29 | freq = args.freq 30 | 31 | # Normalize the frequencies. 32 | normdf = None 33 | dfs = (pd.read_table(path.join(inputdir, timepoint + '.freq'), sep=' ', 34 | quotechar=' ', names=['word', timepoint]) for timepoint in (timepoints)) 35 | for i, df in enumerate(dfs): 36 | df[str(timepoints[i])] = df[str(timepoints[i])] / df[str(timepoints[i])].sum() 37 | if normdf is None: 38 | normdf = df[:num] 39 | continue 40 | df = df[:num] 41 | normdf = pd.merge(normdf, df, on='word', how='outer') 42 | 43 | # Convert them to log scale becoz that is what matters ! 44 | if args.log10: 45 | for timepoint in timepoints: 46 | normdf[timepoint] = np.log10(normdf[timepoint]) 47 | 48 | normdf.to_csv(freq, encoding='utf-8') 49 | 50 | 51 | def debug(type_, value, tb): 52 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 53 | # we are in interactive mode or we don't have a tty-like 54 | # device, so we call the default hook 55 | sys.__excepthook__(type_, value, tb) 56 | else: 57 | import traceback 58 | import pdb 59 | # we are NOT in interactive mode, print the exception... 60 | traceback.print_exception(type_, value, tb) 61 | print("\n") 62 | # ...then start the debugger in post-mortem mode. 
63 | pdb.pm() 64 | 65 | if __name__ == "__main__": 66 | parser = ArgumentParser() 67 | parser.add_argument("-d", "--inputdir", dest="inputdir", help="Input file") 68 | parser.add_argument("-s", "--start", dest="start", help="start time", type=int) 69 | parser.add_argument("-e", "--end", dest="end", help="end time(not included)", type=int) 70 | parser.add_argument("-p", "--step", dest="step", help="step", type=int) 71 | parser.add_argument("-num", "--num", dest="num", help="Number of words topN", type=int, default=30000) 72 | parser.add_argument("-f", "--freq", dest="freq", help="Output freq dist file") 73 | parser.add_argument("-log", "--log10", dest="log10", action="store_true", default=False, help="freq") 74 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 75 | default="INFO") 76 | args = parser.parse_args() 77 | if args.log == 'DEBUG': 78 | sys.excepthook = debug 79 | numeric_level = getattr(logging, args.log.upper(), None) 80 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 81 | main(args) 82 | -------------------------------------------------------------------------------- /langchangetrack/utils/LocalLinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import statsmodels.api as sm 3 | from sklearn.neighbors import NearestNeighbors 4 | from sklearn.base import BaseEstimator 5 | from statsmodels.sandbox.regression.predstd import wls_prediction_std 6 | 7 | import pickle 8 | 9 | # Code to pickle a VW model 10 | import copy_reg 11 | from types import FunctionType, FileType, MethodType 12 | 13 | 14 | def stub_pickler(obj): 15 | return stub_unpickler, () 16 | 17 | 18 | def stub_unpickler(): 19 | return "STUB" 20 | 21 | copy_reg.pickle(MethodType, stub_pickler, stub_unpickler) 22 | copy_reg.pickle(FileType, stub_pickler, stub_unpickler) 23 | copy_reg.pickle(FunctionType, stub_pickler, stub_unpickler) 24 | 25 | 26 | ''' 27 | Given a list of numbers, produce a list of weights using the specified kernel 28 | ''' 29 | 30 | 31 | class KernelFunctions: 32 | 33 | @staticmethod 34 | def uniform(distances): 35 | return numpy.ones(len(distances)) 36 | 37 | @staticmethod 38 | def gauss(distances): 39 | dist_norm = distances / distances[len(distances) - 1] 40 | weights = [math.exp(-dist * dist) for dist in dist_norm] 41 | return weights 42 | 43 | @staticmethod 44 | def linear(distances): 45 | dist_norm = distances / distances[len(distances) - 1] 46 | weights = [1.0001 - dist for dist in dist_norm] 47 | return weights 48 | 49 | @staticmethod 50 | def epanechnikov(distances): 51 | dist_norm = distances / distances[len(distances) - 1] 52 | weights = [(3. / 4.) 
* (1.0001 - dist * dist) for dist in dist_norm] 53 | return weights 54 | 55 | @staticmethod 56 | def tricube(distances): 57 | dist_norm = distances / distances[len(distances) - 1] 58 | weights = [pow((1.0001 - pow(dist, 3)), 3) for dist in dist_norm] 59 | return weights 60 | 61 | 62 | class LocalLinearRegression(BaseEstimator): 63 | 64 | def __init__(self, k_nn, weight_func=KernelFunctions.uniform): 65 | self.k_nn = k_nn 66 | self.weight_func = weight_func 67 | print self.k_nn, self.weight_func 68 | 69 | ''' 70 | X: A list of points to transform 71 | Y: The corresponding target points 72 | ''' 73 | 74 | def fit(self, X, Y): 75 | if len(X) != len(Y): 76 | raise ValueError("len(X) != len(Y)") 77 | if len(X) < self.k_nn: 78 | raise ValueError("Not enough points for local linear regression for the specified number of neighbors (" + 79 | str(len(X)) + " < " + str(self.k_nn) + ")") 80 | self.X = numpy.array(X) 81 | self.Y = numpy.array(Y) 82 | self.nn = NearestNeighbors(n_neighbors=self.k_nn, algorithm='ball_tree', p=2) 83 | self.nn.fit(self.X) 84 | print "Fit the model" 85 | 86 | ''' 87 | X: The point to transform based on its neighbors 88 | ''' 89 | 90 | def predict(self, X): 91 | neighbors = self.nn.kneighbors(X) 92 | distances = neighbors[0][0] 93 | neighbor_indices = neighbors[1][0] 94 | local_X = self.X.take(neighbor_indices, axis=0) 95 | local_Y = self.Y.take(neighbor_indices, axis=0) 96 | wls = sm.WLS(local_Y, local_X, weights=self.weight_func(distances)).fit() 97 | return wls.predict(X) 98 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/corpustoembeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | 7 | import gensim 8 | 9 | import logging 10 | logger = logging.getLogger("langchangetrack") 11 | 12 | 13 | class CorpusToEmbeddings(object): 14 | 15 | """ Class that encapsulates functionality for obtaining embeddings from a corpus.""" 16 | 17 | def __init__(self, corpus_iter, embeddings_type, lang='en', 18 | model_config={}, save_model_file=None): 19 | """ Initialize the object with the corpus iterator and 20 | the type of embeddings. 21 | 22 | The corpus iterator should just support iterating over 23 | sentences. It can be a list or a generator which yields 24 | sentences. The embeddings type can be one of the supported 25 | embedding types: 'skipgram' 26 | 27 | The model_config is an optional named tuple containing specific 28 | configurations parameters to be passed when training the model. 29 | """ 30 | 31 | assert(corpus_iter) 32 | assert(embeddings_type in CorpusToEmbeddings.supported_embedding_types()) 33 | 34 | self.corpus_iter = corpus_iter 35 | self.lang = lang 36 | self.embeddings_type = embeddings_type 37 | self.model_config = model_config 38 | 39 | self.embeddings_builder_map = { 40 | 'skipgram': self.buildword2vec 41 | } 42 | self.model = None 43 | self.save_model_file = save_model_file 44 | return 45 | 46 | @staticmethod 47 | def supported_embedding_types(): 48 | """ Embedding types we support. """ 49 | return ['skipgram'] 50 | 51 | def buildword2vec(self): 52 | """ Trains a word2vec model on the corpus. 
""" 53 | 54 | cfg_size = self.model_config.get('size', 200) 55 | cfg_window = self.model_config.get('window', 5) 56 | cfg_min_count = self.model_config.get('min_count', 10) 57 | cfg_workers = self.model_config.get('workers', 16) 58 | cfg_alpha = self.model_config.get('alpha', 0.01) 59 | logger.info('window size:{}, alpha:{}, embedding size:{}, min_count:{}, workers:{}'.format(cfg_window, cfg_alpha, cfg_size, cfg_min_count, cfg_workers)) 60 | self.model = gensim.models.Word2Vec(self.corpus_iter, 61 | size=cfg_size, 62 | window=cfg_window, 63 | min_count=cfg_min_count, 64 | alpha=cfg_alpha, 65 | workers=cfg_workers, 66 | sample=1e-5, 67 | negative=0) 68 | 69 | if self.save_model_file: 70 | self.model.save_word2vec_format(self.save_model_file) 71 | 72 | def build(self): 73 | """ Trains a model on the corpus to obtain embeddings.""" 74 | sys.stdout.write("Building a model from the corpus.\n") 75 | sys.stdout.flush() 76 | self.embeddings_builder_map[self.embeddings_type]() 77 | sys.stdout.write("Model built.\n") 78 | sys.stdout.flush() 79 | 80 | def save_model(self, model_file): 81 | """ Saves the model file. """ 82 | self.model.save_word2vec_format(model_file) 83 | return 84 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/train_embeddings_ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | from argparse import ArgumentParser 8 | import sys 9 | from io import open 10 | from os import path 11 | from time import time 12 | import itertools 13 | 14 | from langchangetrack.corpusreaders.plainngramscorpus import PlainNGRAMSCorpus 15 | from langchangetrack.tsconstruction.distributional.corpustoembeddings import CorpusToEmbeddings 16 | 17 | import logging 18 | logger = logging.getLogger("langchangetrack") 19 | 20 | __author__ = "Vivek Kulkarni" 21 | __email__ = "viveksck@gmail.com" 22 | 23 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 24 | 25 | import psutil 26 | from multiprocessing import cpu_count 27 | 28 | p = psutil.Process(os.getpid()) 29 | p.set_cpu_affinity(list(range(cpu_count()))) 30 | 31 | 32 | class RepeatCorpusNTimes(object): 33 | 34 | def __init__(self, corpus, n): 35 | """ 36 | Repeat a `corpus` `n` times. 37 | >>> corpus = [[(1, 0.5)], []] 38 | >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times 39 | [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []] 40 | """ 41 | self.corpus = corpus 42 | self.n = n 43 | 44 | def __iter__(self): 45 | return itertools.chain.from_iterable(itertools.repeat(tuple(self.corpus), self.n)) 46 | 47 | 48 | def run(filename, output_dir, file_prefix, window_size, embedding_type, embedding_size, workers, num_epochs): 49 | corpus_reader = RepeatCorpusNTimes(PlainNGRAMSCorpus(args.filename), num_epochs) 50 | model_config = {} 51 | model_config['window'] = window_size 52 | model_config['size'] = embedding_size 53 | model_config['workers'] = workers 54 | model_file = path.join(output_dir, '_'. 
join([file_prefix, 'embeddings.model'])) 55 | c = CorpusToEmbeddings(corpus_reader, embedding_type, model_config=model_config, save_model_file=model_file) 56 | c.build() 57 | 58 | 59 | def main(args): 60 | filename = args.filename 61 | output_dir = args.output_dir 62 | file_prefix = args.file_prefix 63 | window_size = int(args.window_size) 64 | embedding_type = args.embedding_type 65 | embedding_size = args.embedding_size 66 | workers = args.workers 67 | num_epochs = args.epochs 68 | run(filename, output_dir, file_prefix, window_size, embedding_type, embedding_size, workers, num_epochs) 69 | 70 | if __name__ == "__main__": 71 | parser = ArgumentParser() 72 | parser.add_argument("-f", "--file", dest="filename", help="Input file for ngrams") 73 | parser.add_argument("-o", "--output_dir", dest="output_dir", help="Output directory") 74 | parser.add_argument("-p", "--file-prefix", dest="file_prefix", default='exp', help="File prefix") 75 | parser.add_argument("-w", "--window_size", dest="window_size", default=5, help="Window size for word2vec") 76 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default='skipgram', help="Embedding type") 77 | parser.add_argument("-s", "--embedding_size", dest="embedding_size", default=200, type=int, help="Window size for word2vec") 78 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 79 | parser.add_argument("-epochs", "--epochs", dest="epochs", default=1, help="Number of epochs", type=int) 80 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 81 | args = parser.parse_args() 82 | main(args) 83 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup 6 | except ImportError: 7 | from distutils.core import setup 8 | 9 | 10 | readme = open('README.rst').read() 11 | history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') 12 | 13 | requirements = [ 14 | 'wheel==0.23.0', 15 | 'argparse>=1.2.1', 16 | 'numpy>=0.9.1', 17 | 'scipy>=0.15.1', 18 | 'more_itertools>=2.2', 19 | 'joblib>=0.8.3-r1', 20 | 'gensim==0.10.3', 21 | 'statsmodels>=0.5.0', 22 | 'changepoint>=0.1.1', 23 | 'nltk>=3.0.0', 24 | 'textblob>=0.9.0', 25 | 'textblob-aptagger>=0.2.0', 26 | 'psutil>=2.1.1', 27 | ] 28 | 29 | test_requirements = [ 30 | # TODO: put package test requirements here 31 | ] 32 | 33 | setup( 34 | name='langchangetrack', 35 | version='0.1.0', 36 | description='Package for statistically significant language change.', 37 | long_description=readme + '\n\n' + history, 38 | author='Vivek Kulkarni', 39 | author_email='viveksck@gmail.com', 40 | url='https://github.com/viveksck/langchangetrack', 41 | packages=[ 42 | 'langchangetrack', 43 | 'langchangetrack.utils', 44 | 'langchangetrack.corpusreaders', 45 | 'langchangetrack.tsconstruction', 46 | 'langchangetrack.tsconstruction.distributional' 47 | ], 48 | package_dir={'langchangetrack': 49 | 'langchangetrack'}, 50 | include_package_data=True, 51 | install_requires=requirements, 52 | license="BSD", 53 | zip_safe=False, 54 | keywords='langchangetrack', 55 | classifiers=[ 56 | 'Development Status :: 2 - Pre-Alpha', 57 | 'Intended Audience :: Developers', 58 | 'License :: OSI Approved :: BSD License', 59 | 'Natural Language :: English', 60 | "Programming Language :: Python :: 2", 61 | 'Programming Language :: Python :: 2.6', 62 | 'Programming Language :: Python :: 2.7', 63 | 'Programming Language :: Python :: 3', 64 | 'Programming Language :: Python :: 3.3', 65 | 'Programming Language :: Python :: 3.4', 66 | ], 67 | scripts=[ 68 | 'langchangetrack/tsconstruction/freq/scripts/create_freq_timeseries.py', 69 | 'langchangetrack/tsconstruction/syntactic/scripts/pos_displacements.py', 70 | 'langchangetrack/tsconstruction/distributional/scripts/train_embeddings_ngrams.py', 71 | 'langchangetrack/tsconstruction/distributional/scripts/learn_map.py', 72 | 'langchangetrack/tsconstruction/distributional/scripts/embedding_displacements.py', 73 | 'langchangetrack/tsconstruction/dump_timeseries.py', 74 | 'langchangetrack/cpdetection/detect_changepoints_word_ts.py', 75 | 'langchangetrack/cpdetection/detect_changepoints_word_ts_r.py', 76 | 'langchangetrack/scripts/detect_cp_freq.sh', 77 | 'langchangetrack/scripts/detect_cp_pos.sh', 78 | 'langchangetrack/scripts/detect_cp_distributional.sh', 79 | 'langchangetrack/scripts/ngrams_pipeline.py', 80 | 'langchangetrack/scripts/pos_pipeline.py', 81 | 'langchangetrack/scripts/freq_pipeline.py', 82 | 'langchangetrack/utils/scripts/freq_count.py', 83 | 'langchangetrack/utils/scripts/common_vocab.py', 84 | 'langchangetrack/utils/scripts/pos_tag.py', 85 | 'langchangetrack/utils/scripts/calculate_pos_dist.sh', 86 | 'langchangetrack/utils/scripts/calculate_freq_counts.sh', 87 | 'langchangetrack/utils/scripts/train_models.sh', 88 | ], 89 | test_suite='tests', 90 | tests_require=test_requirements 91 | ) 92 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 
7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/viveksck/langchangetrack/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | langchangetrack could always use more documentation, whether as part of the 40 | official langchangetrack docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/viveksck/langchangetrack/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `langchangetrack` for local development. 59 | 60 | 1. Fork the `langchangetrack` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/langchangetrack.git 64 | 65 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 66 | 67 | $ mkvirtualenv langchangetrack 68 | $ cd langchangetrack/ 69 | $ python setup.py develop 70 | 71 | 4. Create a branch for local development:: 72 | 73 | $ git checkout -b name-of-your-bugfix-or-feature 74 | 75 | Now you can make your changes locally. 76 | 77 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 78 | 79 | $ flake8 langchangetrack tests 80 | $ python setup.py test 81 | $ tox 82 | 83 | To get flake8 and tox, just pip install them into your virtualenv. 84 | 85 | 6. Commit your changes and push your branch to GitHub:: 86 | 87 | $ git add . 88 | $ git commit -m "Your detailed description of your changes." 89 | $ git push origin name-of-your-bugfix-or-feature 90 | 91 | 7. Submit a pull request through the GitHub website. 92 | 93 | Pull Request Guidelines 94 | ----------------------- 95 | 96 | Before you submit a pull request, check that it meets these guidelines: 97 | 98 | 1. The pull request should include tests. 99 | 2. If the pull request adds functionality, the docs should be updated. Put 100 | your new functionality into a function with a docstring, and add the 101 | feature to the list in README.rst. 102 | 3. The pull request should work for Python 2.6, 2.7, 3.3, and 3.4, and for PyPy. Check 103 | https://travis-ci.org/viveksck/langchangetrack/pull_requests 104 | and make sure that the tests pass for all supported Python versions. 
105 | 106 | Tips 107 | ---- 108 | 109 | To run a subset of tests:: 110 | 111 | $ python -m unittest tests.test_langchangetrack 112 | -------------------------------------------------------------------------------- /langchangetrack/utils/entropy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | :mod:`pynchon.bio.alg.entropy` 4 | =========================== 5 | 6 | Algorithms on entropies. 7 | """ 8 | from itertools import izip 9 | import numpy as np 10 | import scipy as sp 11 | import math 12 | 13 | 14 | def get_base(unit='bit'): 15 | if unit == 'bit': 16 | log = sp.log2 17 | elif unit == 'nat': 18 | log = sp.log 19 | elif unit in ('digit', 'dit'): 20 | log = sp.log10 21 | else: 22 | raise ValueError('The "unit" "%s" not understood' % unit) 23 | return log 24 | 25 | 26 | def shannon_entropy(freq, unit='bit'): 27 | """Calculates the Shannon Entropy (H) of a frequency. 28 | 29 | Arguments: 30 | 31 | - freq (``numpy.ndarray``) A ``Freq`` instance or ``numpy.ndarray`` with 32 | frequency vectors along the last axis. 33 | - unit (``str``) The unit of the returned entropy one of 'bit', 'digit' 34 | or 'nat'. 35 | """ 36 | log = get_base(unit) 37 | shape = freq.shape # keep shape to return in right shape 38 | Hs = np.ndarray(freq.size / shape[-1]) # place to keep entropies 39 | # this returns an array of vectors or just a vector of frequencies 40 | freq = freq.reshape((-1, shape[-1])) 41 | # this makes sure we have an array of vectors of frequencies 42 | freq = np.atleast_2d(freq) 43 | # get fancy indexing 44 | positives = freq != 0. 45 | for i, (freq, idx) in enumerate(izip(freq, positives)): 46 | freq = freq[idx] # keep only non-zero 47 | logs = [math.log(f, 2) for f in freq] # logarithms of non-zero frequencies 48 | Hs[i] = -np.sum(freq * logs) 49 | Hs.reshape(shape[:-1]) 50 | return Hs 51 | 52 | 53 | def relative_entropy(freq, background, unit='bit'): 54 | """ 55 | Calculates the Releative Entropy (D), which is the Kullback-Leibler 56 | divergence between two frequencies. The two arrays "freq" and "background" 57 | need to broadcast to a single shape. 58 | 59 | Arguments: 60 | 61 | - freq (``numpy.ndarray``) A ``Freq`` instance or ``numpy.ndarray`` with 62 | frequency vectors along the last axis. 63 | - background (``numpy.ndarray``) ``Freq`` instance or ``numpy.ndarray`` 64 | with frequency vectors along the last axis. This typically is a 65 | rank-1 array. 66 | 67 | Could be normalized?: Dkl = Dkl / log(len(background)) 68 | """ 69 | log = get_base(unit) 70 | shape = freq.shape 71 | freq = freq.reshape((-1, shape[-1])) 72 | freq = np.atleast_2d(freq) 73 | Dkls = np.ndarray(freq.size / shape[-1]) 74 | positives = (freq != 0.) & (background != 0.) 75 | for i, (freq, idx) in enumerate(izip(freq, positives)): 76 | freq = freq[idx] 77 | bg = background[idx] 78 | logs = log(freq / bg) 79 | Dkls[i] = np.sum(freq * logs) 80 | Dkls.reshape(shape[:-1]) 81 | return Dkls 82 | 83 | 84 | def mutual_information(jointfreq, rowfreq=None, colfreq=None, unit='bit'): 85 | """ 86 | Calculates the Mutual Information (I) of a joint frequency. The marginal 87 | frequencies can be given or are calculated from the joint frequency. 88 | 89 | Arguments: 90 | 91 | - jointfreq (``numpy.ndarray``) A normalized ``JointFreq`` instance or 92 | ``numpy.ndarray`` of rank-2, which is a joint probability distribution 93 | function of two random variables. 
94 | - rowfreq (``numpy.ndarray``) [default: ``None``] A normalized marginal 95 | probability distribution function for the variable along the axis =0. 96 | - colfreq (``numpy.ndarray``) [default: ``None``] A normalized marginal 97 | probability distribution function for the variable along the axis =1. 98 | - unit (``str``) [defualt: ``"bit"``] Unit of the returned information. 99 | """ 100 | log = get_base(unit) 101 | rowfreq = rowfreq or np.sum(jointfreq, axis=1) 102 | colfreq = colfreq or np.sum(jointfreq, axis=0) 103 | indfreq = np.dot(rowfreq[None].transpose(), colfreq[None]) 104 | non_zero = jointfreq != 0. 105 | jntf = jointfreq[non_zero] 106 | indf = indfreq[non_zero] 107 | return np.sum(jntf * log(jntf / indf)) 108 | 109 | 110 | def jensen_shannon_divergence(freq, weights=None, unit='bit'): 111 | """ 112 | Calculates the Jensen-Shannon Divergence (Djs) of two or more frequencies. 113 | The weights are for the relative contribution of each frequency vector. 114 | 115 | Arguments: 116 | 117 | - freq (``numpy.ndarray``) A ``Prof`` instance or a rank-2 array of 118 | frequencies along the last dimension. 119 | - weights (``numpy.ndarray``) An array with a weight for each 120 | frequency vector. Rank-1. 121 | - unit (``str``) see: the function ``shannon_entropy``. 122 | """ 123 | if weights is not None: 124 | if len(freq) != len(weights): 125 | raise ValueError('The number of frequencies and weights do not match.') 126 | if (freq.ndim != 2) or (len(freq) < 2): 127 | raise ValueError('At least two frequencies in a rank-2 array expected.') 128 | weighted_average = np.average(freq, axis=0, weights=weights) 129 | H_avg_freq = shannon_entropy(weighted_average, unit) 130 | H_freq = shannon_entropy(freq, unit) 131 | avg_H_freq = np.average(H_freq, weights=weights) 132 | JSD = H_avg_freq - avg_H_freq 133 | return JSD 134 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | langchangetrack 3 | =============================== 4 | 5 | .. image:: https://badge.fury.io/py/langchangetrack.png 6 | :target: http://badge.fury.io/py/langchangetrack 7 | 8 | .. image:: https://travis-ci.org/viveksck/langchangetrack.png?branch=master 9 | :target: https://travis-ci.org/viveksck/langchangetrack 10 | 11 | .. image:: https://pypip.in/d/langchangetrack/badge.png 12 | :target: https://pypi.python.org/pypi/langchangetrack 13 | 14 | .. image:: https://github.com/viveksck/langchangetrack/blob/master/langchangetrack/images/gay_invisible.png 15 | 16 | 17 | Package for Statistically Significant Language Change. 18 | 19 | * Free software: BSD license 20 | * Documentation: https://langchangetrack.readthedocs.org. 21 | 22 | Features 23 | -------- 24 | 25 | * This package provides tools to detect linguistic change in temporal corpora. 26 | 27 | * The meta algorithm works in 2 main steps 28 | 29 | #. **Time series construction**:Given a word, we construct a time series that tracks the displacement of a word through time. We track the displacement of a word using either Frequency, Part of Speech Distribution or Co-occurrences. 30 | 31 | #. **Change point detection**: We then use change point detection methods to detect if the time series contains a change point and if so what the change point is. 
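For intuition, the two steps above can be sketched on a synthetic series with plain numpy. The snippet below is only an illustration of the idea (a mean-shift statistic with a permutation p-value); it is not the package's own mean shift implementation, and all numbers are synthetic::

    import numpy as np

    rng = np.random.RandomState(0)

    # Step 1 (stand-in): a toy displacement time series for one word,
    # sampled every 5 years, with a behavioural shift around 1950.
    timepoints = np.arange(1900, 2000, 5)
    series = np.concatenate([rng.normal(0.1, 0.02, 10),   # before the change
                             rng.normal(0.4, 0.02, 10)])  # after the change

    # Normalize the series (zero mean, unit variance) before detection.
    z = (series - series.mean()) / series.std()

    # Step 2: mean-shift statistic at every candidate split point, plus a
    # permutation-based p-value for its significance.
    def mean_shift(ts):
        return np.array([ts[i:].mean() - ts[:i].mean() for i in range(1, len(ts))])

    observed = np.abs(mean_shift(z)).max()
    null = [np.abs(mean_shift(rng.permutation(z))).max() for _ in range(1000)]
    pval = np.mean([n >= observed for n in null])
    cp = timepoints[np.abs(mean_shift(z)).argmax() + 1]
    print("changepoint: %s, p-value: %.3f" % (cp, pval))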
32 | 33 | The details of the above steps are outlined in : http://arxiv.org/abs/1411.3315 34 | 35 | 36 | Visualization Demo 37 | ------------------- 38 | 39 | Please see this for a cool visualization of words moving through time: http://tinyurl.com/wordvis 40 | 41 | Usage 42 | ------ 43 | 44 | Input 45 | ------ 46 | 47 | We assume a temporal corpus of text files (appropriately tokenized) to be present in a directory. In addition we assume list of words in a single text file that one is interested in tracking. 48 | This could just be the set of words in the common vocabulary of the temporal corpus. 49 | 50 | Output 51 | ------ 52 | 53 | The output consists of the pvalues for each word indicating the significance of the changepoint detected. 54 | 55 | Sample Usage 56 | ------------ 57 | ``$ngrams_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --context-size 5 --epochs 3 --start-time-point 1900 --end-time-point 2000 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 58 | 59 | ``$pos_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --start-time-point 1900 --end-time-point 1930 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 60 | 61 | ``$freq_pipeline.py --corpus-dir data/temporal_corpus/ --file-extension "ngrams" --working-dir ./working --output-dir ./output --start-time-point 1900 --end-time-point 2000 --step-size 5 --vocabulary-file data/temporal_corpus/common_vocab.txt --workers 16`` 62 | 63 | **You might need to tune the hyper parameters as per your specific need.** 64 | 65 | Detailed Usage 66 | --------------- 67 | **Usage: ngrams_pipeline.py** 68 | 69 | optional arguments: 70 | -h, --help show this help message and exit 71 | --corpus-dir CORPUS_DIR 72 | Corpus directory 73 | --file-extension EXT Corpus file extension 74 | --working-dir WORKING_DIR 75 | Working directory 76 | --output-dir OUTPUT_DIR 77 | Output directory 78 | --context-size WINDOW 79 | Context size to use for training embeddings 80 | --epochs EPOCHS Number of epochs to training embeddings 81 | --start-time-point START 82 | Start time point 83 | --end-time-point END End time point 84 | --step-size STEP Step size for timepoints 85 | --model-family MODEL_FAMILY 86 | Model family default (locallinear) 87 | --number-nearest-neighbors KNN 88 | Number of nearest neighbors to use for mapping to 89 | joint space (default:1000) 90 | --vocabulary-file VOCAB_FILE 91 | Common vocabulary file 92 | --threshold THRESHOLD 93 | Threshold for mean shift model for change point 94 | detection (default: 1.75) 95 | --bootstrap-samples BOOTSTRAP 96 | Number of bootstrap samples to draw (default: 1000) 97 | --workers WORKERS Maximum number of workers (default: 1) 98 | -l LOG, --log LOG log verbosity level 99 | 100 | 101 | **Usage: pos_pipeline.py** 102 | 103 | optional arguments: 104 | -h, --help show this help message and exit 105 | --corpus-dir CORPUS_DIR 106 | Corpus directory 107 | --file-extension EXT Corpus file extension 108 | --working-dir WORKING_DIR 109 | Working directory 110 | --output-dir OUTPUT_DIR 111 | Output directory 112 | --start-time-point START 113 | Start time point 114 | --end-time-point END End time point 115 | --step-size STEP Step size for timepoints 116 | --vocabulary-file VOCAB_FILE 117 | Common vocabulary file 118 | --threshold THRESHOLD 119 | Threshold for mean shift model for change point 120 | detection 
121 | --bootstrap-samples BOOTSTRAP 122 | Number of bootstrap samples to draw 123 | --workers WORKERS Maximum number of workers 124 | -l LOG, --log LOG log verbosity level 125 | 126 | 127 | **usage: freq_pipeline.py** 128 | 129 | optional arguments: 130 | -h, --help show this help message and exit 131 | --corpus-dir CORPUS_DIR 132 | Corpus directory 133 | --file-extension EXT Corpus file extension 134 | --working-dir WORKING_DIR 135 | Working directory 136 | --output-dir OUTPUT_DIR 137 | Output directory 138 | --start-time-point START 139 | Start time point 140 | --end-time-point END End time point 141 | --step-size STEP Step size for timepoints 142 | --vocabulary-file VOCAB_FILE 143 | Common vocabulary file 144 | --threshold THRESHOLD 145 | Threshold for mean shift model for change point 146 | detection 147 | --bootstrap-samples BOOTSTRAP 148 | Number of bootstrap samples to draw 149 | --workers WORKERS Maximum number of workers 150 | -l LOG, --log LOG log verbosity level 151 | 152 | 153 | 154 | Requirements 155 | ------------ 156 | * wheel==0.23.0 157 | * argparse>=1.2.1 158 | * numpy>=0.9.1 159 | * scipy>=0.15.1 160 | * more_itertools>=2.2 161 | * joblib>=0.8.3-r1 162 | * gensim==0.10.3 163 | * statsmodels>=0.5.0 164 | * changepoint>=0.1.0 165 | * nltk>=3.0.0 166 | * textblob>=0.9.0 167 | * textblob-aptagger>=0.2.0 168 | * psutil>=2.2.0 169 | * GNU Parallel 170 | * R (good to have) 171 | * rpy2 (good to have) 172 | 173 | 174 | 175 | Installation 176 | ------------ 177 | #. Install GNU Parallel from here: www.gnu.org/software/software.html 178 | #. cd langchangetrack 179 | #. pip install -r requirements.txt 180 | #. python setup.py install 181 | 182 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/langchangetrack.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/langchangetrack.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/langchangetrack" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/langchangetrack" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\langchangetrack.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\langchangetrack.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/syntactic/scripts/pos_displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | import gensim 18 | 19 | from langchangetrack.utils.dummy_regressor import DummyRegressor 20 | from langchangetrack.utils import LocalLinearRegression 21 | from langchangetrack.utils import entropy 22 | from langchangetrack.tsconstruction.displacements import Displacements 23 | 24 | import logging 25 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 26 | logger = logging.getLogger("langchangetrack") 27 | 28 | import psutil 29 | from multiprocessing import cpu_count 30 | 31 | p = psutil.Process(os.getpid()) 32 | p.set_cpu_affinity(list(range(cpu_count()))) 33 | 34 | def get_vectors_pos(model, norm_embedding=True): 35 | return model 36 | 37 | def load_model_pos(model_path): 38 | """ Load the POS model from a file.""" 39 | return pd.read_csv(model_path) 40 | 41 | def load_predictor_pos(predictor_path): 42 | """ Load the predictor model. """ 43 | return DummyRegressor() 44 | 45 | class POSDisplacements(Displacements): 46 | def __init__(self, 47 | data_dir, 48 | pred_dir, 49 | words_file, 50 | timepoints, 51 | num_words, 52 | get_vectors, 53 | load_model, 54 | load_predictor, 55 | method, 56 | win_size, 57 | fixed_point, 58 | embedding_suffix, 59 | predictor_suffix, 60 | workers): 61 | 62 | """ Constructor """ 63 | # Initialize the super class. 64 | super(POSDisplacements, self).__init__() 65 | self.get_vectors = get_vectors 66 | self.load_model = load_model 67 | self.has_predictors = True 68 | self.load_predictor = load_predictor 69 | self.norm_embedding = False 70 | self.words_file = words_file 71 | self.timepoints = timepoints 72 | self.data_dir = data_dir 73 | self.pred_dir = pred_dir 74 | self.num_words = num_words 75 | self.method = method 76 | self.win_size = win_size 77 | self.fixed_point = fixed_point 78 | self.embedding_suffix = embedding_suffix 79 | self.predictor_suffix = predictor_suffix 80 | self.workers = workers 81 | 82 | def number_distance_metrics(self): 83 | return 1 84 | 85 | def calculate_distance(self, vec1, vec2): 86 | """ Calculate distances between vector1 and vector2. """ 87 | if vec1 is None or vec2 is None: 88 | return [np.nan] 89 | d = entropy.jensen_shannon_divergence(np.vstack([vec1, vec2]), unit='digit') 90 | return [d[0]] 91 | 92 | def load_models_and_predictors(self): 93 | """ Load all the models and predictors. 
""" 94 | self.models = {} 95 | self.predictors = {} 96 | model_paths = [path.join(self.data_dir, timepoint + self.embedding_suffix) for timepoint in self.timepoints] 97 | predictor_handles = [timepoint for timepoint in self.timepoints] 98 | loaded_models = Parallel(n_jobs=self.workers)(delayed(self.load_model)(model_path) for model_path in model_paths) 99 | for i, timepoint in enumerate(self.timepoints): 100 | self.models[timepoint] = loaded_models[i] 101 | self.predictors[timepoint] = self.load_predictor(predictor_handles[i]) 102 | print "Done loading predictors" 103 | 104 | def is_present(self, timepoint, word): 105 | """ Check if the word is present in the vocabulary at this timepoint. """ 106 | model = self.get_model(timepoint) 107 | return word in model.word.values 108 | 109 | def get_vector(self, timepoint, word): 110 | """ Get the embedding for this word at the specified timepoint.""" 111 | model = self.get_model(timepoint) 112 | return model[model.word == word].values[0][1:] 113 | 114 | def main(args): 115 | syear = int(args.syear) 116 | eyear = int(args.eyear) 117 | stepsize = int(args.stepsize) 118 | timepoints = np.arange(syear, eyear, stepsize) 119 | timepoints = [str(t) for t in timepoints] 120 | workers = int(args.workers) 121 | # Create the main work horse. 122 | e = POSDisplacements(args.datadir, 123 | args.preddir, 124 | args.filename, 125 | timepoints, 126 | int(args.num_words), 127 | get_vectors_pos, 128 | load_model_pos, 129 | load_predictor_pos, 130 | args.method, 131 | args.win_size, 132 | str(args.fixed_point), 133 | args.embedding_suffix, 134 | args.predictor_suffix, 135 | workers) 136 | 137 | # Load the models and predictors 138 | e.load_models_and_predictors() 139 | 140 | # Calculate the word displacements and dump. 141 | L, H, dfo, dfn = e.calculate_words_displacement(column_names=['word', 's', 'otherword', 't', 'jsd'], n_jobs = workers) 142 | fname = 'timeseries_s_t' + '_' + args.outputsuffix + '.pkl' 143 | pickle.dump((L,H, dfo, dfn), open(path.join(args.outputdir, fname),'wb')) 144 | 145 | if __name__ == "__main__": 146 | parser = ArgumentParser() 147 | parser.add_argument("-f", "--file", dest="filename", help="Input file for words") 148 | parser.add_argument("-d", "--data_dir", dest="datadir", help="data directory") 149 | parser.add_argument("-p", "--pred_dir", dest="preddir", help="data directory") 150 | parser.add_argument("-o", "--output_dir", dest="outputdir", help="Output directory") 151 | parser.add_argument("-os", "--output_suffix", dest="outputsuffix", help="Output suffix") 152 | parser.add_argument("-es", "--emb_suffix", dest="embedding_suffix", help="embedding suffix") 153 | parser.add_argument("-ps", "--pred_suffix", dest="predictor_suffix",help="predictor suffix") 154 | parser.add_argument("-sy", "--start", dest="syear", default = '1800', help="start year") 155 | parser.add_argument("-ey", "--end", dest="eyear", default = '2010', help="end year(not included)") 156 | parser.add_argument("-s", "--window_size", dest="stepsize", default = 5, help="Window size for time series") 157 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default = 'pos', help="Embedding type") 158 | parser.add_argument("-m", "--method", dest="method", default="polar", help="Method to use") 159 | parser.add_argument("-w", "--win_size", dest="win_size", default="-1", help="Window size to use if not polar", type=int) 160 | parser.add_argument("-y", "--fixed_point", dest="fixed_point", default="-1", help="fixed point to use if method is fixed", type=int) 161 
| parser.add_argument("-n", "--num_words", dest="num_words", default = -1, help="Number of words", type=int) 162 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 163 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 164 | args = parser.parse_args() 165 | main(args) 166 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/detect_changepoints_word_ts_r.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import logging 4 | import pandas as pd 5 | import numpy as np 6 | import itertools 7 | import more_itertools 8 | import os 9 | 10 | from functools import partial 11 | 12 | from changepoint.utils.ts_stats import parallelize_func 13 | from changepoint.rchangepoint import estimate_cp_pval, estimate_cp 14 | 15 | import psutil 16 | from multiprocessing import cpu_count 17 | 18 | p = psutil.Process(os.getpid()) 19 | p.set_cpu_affinity(list(range(cpu_count()))) 20 | 21 | __author__ = "Vivek Kulkarni" 22 | __email__ = "viveksck@gmail.com" 23 | 24 | # Global variable specifying which column index the time series 25 | # begins in a dataframe 26 | TS_OFFSET = 2 27 | 28 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 29 | 30 | def normalize_timeseries(df): 31 | """ Centre and scale each time series column. """ 32 | # Normalize a set of time series by subtracting the mean from each column 33 | # and dividing by the standard deviation. 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[TS_OFFSET:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | def get_filtered_df(df, vocab_file): 42 | """ Return a data frame with only the words present in the vocab file. """ 43 | if vocab_file: 44 | vocab = open(vocab_file).readlines() 45 | vocab = [v.strip() for v in vocab] 46 | # Get the set of words. 47 | words = pd.Series(df.word.values.ravel()).unique() 48 | set_words = set(words) 49 | # Find the words common to data frame and vocab 50 | common_set_words = set_words & set(vocab) 51 | # Filter the dataframe 52 | df_filtered = df[df.word.isin(common_set_words)] 53 | return df_filtered 54 | else: 55 | return df 56 | 57 | def get_actual_cp(df, cp_idx): 58 | """ 59 | Return the actual time point corresponding to the change point index. 60 | """ 61 | # If the cpt detection did not find any changepoint it 62 | # returns NAN in which case we return the same 63 | if np.isnan(cp_idx): 64 | return cp_idx 65 | 66 | # Add 1 as the first column is word. 67 | return df.columns[cp_idx + 1] 68 | 69 | def get_pval_word_chunk(chunk, df, threshold = None): 70 | """ 71 | Process each word in a chunk and return pvalue and changepoint. 72 | Here we set R changepoint class = FALSE which return pvalue. 73 | 74 | """ 75 | results = [] 76 | for w in chunk: 77 | # Get the time series. 78 | ts = np.array(df[df.word == w].values[0][TS_OFFSET:]) 79 | # Process that time series. 80 | results.append(estimate_cp_pval(ts)) 81 | return results 82 | 83 | 84 | def get_cp_word_chunk(chunk, df, threshold = None): 85 | """ 86 | Process each word in a chunk and return changepoints. Does not return 87 | pvalue. 88 | """ 89 | results = [] 90 | for w in chunk: 91 | ts = np.array(df[df.word == w].values[0][TS_OFFSET:]) 92 | cp_list = estimate_cp(ts) 93 | if len(cp_list): 94 | # Returns most recent change point if any. 
95 | results.append(cp_list[-1]) 96 | else: 97 | # No change points. 98 | results.append(np.nan) 99 | return results 100 | 101 | 102 | def main(args): 103 | # Read the arguments 104 | df_f = args.filename 105 | common_vocab_file = args.vocab_file 106 | pval_file = args.pval_file 107 | col_to_drop = args.col 108 | should_normalize = not(args.dont_normalize) 109 | n_jobs = int(args.workers) 110 | cp_pval = args.dump_pval 111 | if args.threshold != None: 112 | threshold = float(args.threshold) 113 | else: 114 | threshold = None 115 | 116 | print "CONFIG:" 117 | print "FILENAME:", df_f 118 | print "VOCAB FILE:", common_vocab_file 119 | print "PVAL_FILE:", pval_file 120 | print "COL TO DROP:", col_to_drop 121 | print "NORMALIZE:", should_normalize 122 | print "Threshold", threshold 123 | 124 | # Read the time series data 125 | df = pd.read_csv(df_f) 126 | # Restrict only to the common vocabulary. 127 | df = get_filtered_df(df, common_vocab_file) 128 | 129 | # Normalize the data frame 130 | if should_normalize: 131 | norm_df = normalize_timeseries(df) 132 | else: 133 | norm_df = df 134 | 135 | # Drop a column if needed. 136 | if col_to_drop in norm_df.columns: 137 | cols = df.columns.tolist() 138 | if col_to_drop == norm_df.columns[-1]: 139 | time_points = cols[2:] 140 | new_cols = cols[0:2] + time_points[::-1] 141 | norm_df = norm_df[new_cols] 142 | print norm_df.columns 143 | norm_df.drop(col_to_drop, axis = 1, inplace=True) 144 | 145 | print "Columns of the time series", norm_df.columns 146 | cwords = norm_df.word.values 147 | print "Number of words we are processing", len(cwords) 148 | 149 | chunksz = np.ceil(len(cwords)/float(n_jobs)) 150 | if cp_pval: 151 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=n_jobs, df = norm_df, threshold = threshold) 152 | cps, pvals = zip(*results) 153 | # R returns 1 for a very high stat significance. So we invert it as for 154 | # us low pvalues mean more significance. 
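        # For example, a shift that R reports with significance 0.99 is stored here as pval = 0.01.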
155 | pvals = [(1.0 - pval) for pval in pvals] 156 | actual_cps = [get_actual_cp(norm_df, cp) for cp in cps] 157 | results = zip(cwords, actual_cps, pvals) 158 | header = ['word', 'cp', 'pval'] 159 | pvalue_df = pd.DataFrame().from_records(results, columns=header) 160 | sdf = pvalue_df.sort(columns=['pval']) 161 | sdf.to_csv(pval_file, encoding='utf-8', index = None) 162 | else: 163 | results = parallelize_func(cwords[:], get_cp_word_chunk, chunksz=chunksz, n_jobs=n_jobs, df = norm_df) 164 | cps = results 165 | actual_cps = [get_actual_cp(norm_df, cp) for cp in cps] 166 | results = zip(cwords, actual_cps) 167 | header = ['word', 'cp'] 168 | pvalue_df = pd.DataFrame().from_records(results, columns=header) 169 | sdf = pvalue_df.sort(columns=['cp']) 170 | sdf.to_csv(pval_file, encoding='utf-8', index = None) 171 | 172 | if __name__ == "__main__": 173 | parser = ArgumentParser() 174 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 175 | parser.add_argument("-v", "--vfile", dest="vocab_file", help="Input file") 176 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Input file") 177 | parser.add_argument("-c", "--col", dest="col", help="Input file") 178 | parser.add_argument("-s", "--shuffle", dest="shuffle", action='store_true', default = False, help="Shuffle") 179 | parser.add_argument("-d", "--dont_normalize", dest="dont_normalize", action='store_true', default = False, help="Dont normalize") 180 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 181 | parser.add_argument("-dump_pval", "--dump_pval", dest="dump_pval",default=False, action='store_true', help="Dump pvalue or not") 182 | parser.add_argument("-t", "--threshold", dest="threshold", default=None, type=float, help="threshold") 183 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 184 | args = parser.parse_args() 185 | if args.log == 'DEBUG': 186 | sys.excepthook = debug 187 | numeric_level = getattr(logging, args.log.upper(), None) 188 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 189 | main(args) 190 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/dump_timeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """dump_timeseries.py: Dumps the displacements as a timeseries in a data frame""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | import os 10 | from os import path 11 | from time import time 12 | from glob import glob 13 | import pickle 14 | import pandas as pd 15 | import numpy as np 16 | import more_itertools 17 | 18 | from scipy.interpolate import interp1d 19 | from scipy.interpolate import UnivariateSpline 20 | 21 | from joblib import Parallel, delayed 22 | 23 | import psutil 24 | from multiprocessing import cpu_count 25 | 26 | p = psutil.Process(os.getpid()) 27 | p.set_cpu_affinity(list(range(cpu_count()))) 28 | 29 | __author__ = "Vivek Kulkarni" 30 | __email__ = "viveksck@gmail.com" 31 | 32 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 33 | 34 | 35 | def interpolate(x, xinter, values, finter): 36 | # Find all the points which we need to interpolate 37 | xmissing = [xm for xm in xinter if xm not in x] 38 | # Interpolate the function value at those points 39 | yintervalues = finter(xmissing) 40 | # Original points and values pairs 41 | 
orig_pairs = zip(x, values) 42 | # Interpolated points and values pairs 43 | interp_pairs = zip(xmissing, yintervalues) 44 | # Find the final values 45 | assert(len(orig_pairs) + len(interp_pairs) == len(xinter)) 46 | final_pairs = sorted(orig_pairs + interp_pairs) 47 | return final_pairs 48 | 49 | 50 | def create_word_time_series(old_df, new_df, w, sourcexinter, destxinter, metric_name="", interpolate=False): 51 | """ Create the time series for a word. """ 52 | 53 | sourcex = np.asarray(old_df[old_df.word == w].s.values, dtype=int) 54 | destx = np.asarray(new_df[new_df.word == w].s.values, dtype=int) 55 | 56 | old_values = old_df[old_df.word == w][metric_name].values 57 | new_values = new_df[new_df.word == w][metric_name].values 58 | 59 | try: 60 | fold = interp1d(sourcex, old_values, bounds_error=False) 61 | fnew = interp1d(destx, new_values, bounds_error=False) 62 | except: 63 | print "Failed to interpolate", w 64 | return None, None 65 | 66 | if interpolate: 67 | final_old_pairs = interpolate(sourcex, sourcexinter, old_values, fold) 68 | final_new_pairs = interpolate(destx, destxinter, new_values, fnew) 69 | xinterold, yinterold = zip(*final_old_pairs) 70 | xinternew, yinternew = zip(*final_new_pairs) 71 | else: 72 | yinterold = old_values 73 | yinternew = new_values 74 | 75 | OL = [w] 76 | NL = [w] 77 | OL.extend(yinterold) 78 | NL.extend(yinternew) 79 | return (OL, NL) 80 | 81 | 82 | def process_chunk(chunk, func, olddf, newdf, sourcexinter, destxinter, metric_name, interpolate): 83 | """ Process each chunk. """ 84 | results = [func(olddf, newdf, e, sourcexinter, destxinter, metric_name, interpolate) 85 | for e in chunk] 86 | return results 87 | 88 | 89 | def main(args): 90 | # get the arguments 91 | method = args.method 92 | win_size = args.win_size 93 | step = args.step 94 | metric_name = args.metric_name 95 | n_jobs = args.workers 96 | 97 | # Load the data. 
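    # The input pickle is the (L, H, olddf, newdf) tuple written by the displacement scripts
    # (e.g. embedding_displacements.py or pos_displacements.py, as timeseries_s_t_<suffix>.pkl).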
98 | L, H, olddf, newdf = pickle.load(open(args.filename)) 99 | words = pd.Series(olddf.word.values.ravel()).unique() 100 | oldrows = [] 101 | newrows = [] 102 | sourcexrange = np.arange(args.mint, args.maxt, step) 103 | destxrange = np.arange(args.mint, args.maxt, step) 104 | if method == 'win': 105 | sourcexrange = sourcexrange[win_size:] 106 | destxrange = destxrange[:-win_size] 107 | 108 | if args.interpolate: 109 | sourcexinter = np.arange(sourcexrange[0], sourcexrange[-1] + 1, 1) 110 | destxinter = np.arange(destxrange[0], destxrange[-1] + 1, 1) 111 | else: 112 | sourcexinter = sourcexrange 113 | destxinter = destxrange 114 | 115 | # Construct the series 116 | assert(len(sourcexinter) == len(destxinter)) 117 | chunk_sz = np.ceil(len(words)/float(n_jobs)) 118 | words_chunks = more_itertools.chunked(words, chunk_sz) 119 | timeseries_chunks = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, create_word_time_series, olddf, newdf, 120 | sourcexinter, destxinter, 121 | metric_name=metric_name, 122 | interpolate=args.interpolate) for chunk in words_chunks) 123 | 124 | timeseries = list(more_itertools.flatten(timeseries_chunks)) 125 | 126 | # Dump the data frame 127 | for orow, newrow in timeseries: 128 | if orow and newrow: 129 | oldrows.append(orow) 130 | newrows.append(newrow) 131 | 132 | oldtimeseries = pd.DataFrame() 133 | newtimeseries = pd.DataFrame() 134 | header = ['word'] 135 | header.extend(sourcexinter) 136 | newheader = ['word'] 137 | newheader.extend(destxinter) 138 | oldtimeseries = oldtimeseries.from_records(oldrows, columns=header) 139 | oldtimeseries = oldtimeseries.fillna(method='backfill', axis=1) 140 | newtimeseries = newtimeseries.from_records(newrows, columns=newheader) 141 | newtimeseries = newtimeseries.fillna(method='backfill', axis=1) 142 | oldtimeseries.to_csv(args.sourcetimef, encoding='utf-8') 143 | newtimeseries.to_csv(args.endtimef, encoding='utf-8') 144 | 145 | 146 | def debug(type_, value, tb): 147 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 148 | # we are in interactive mode or we don't have a tty-like device, so we 149 | # call the default hook 150 | sys.__excepthook__(type_, value, tb) 151 | else: 152 | import traceback 153 | import pdb 154 | # we are NOT in interactive mode, print the exception... 155 | traceback.print_exception(type_, value, tb) 156 | print("\n") 157 | # ...then start the debugger in post-mortem mode. 
158 | pdb.pm() 159 | 160 | if __name__ == "__main__": 161 | parser = ArgumentParser() 162 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 163 | parser.add_argument("-i", "--interpolate", dest="interpolate", help="interpolate", action='store_true', default=False) 164 | parser.add_argument("-s", "--sfile", dest="sourcetimef", help="Input file") 165 | parser.add_argument("-e", "--efile", dest="endtimef", help="Input file") 166 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 167 | parser.add_argument("-m", "--min", dest="mint", help="starting time point", default=1900, type=int) 168 | parser.add_argument("-n", "--max", dest="maxt", help="ending timepoint(not included)", default=2010, type=int) 169 | parser.add_argument("-st", "--step", dest="step", help="stepsize", default=5, type=int) 170 | parser.add_argument("-me", "--method", dest="method", default="polar", help="Method to use") 171 | parser.add_argument("-metric", "--metric_name", dest="metric_name", default="cosine", help="Metric name to use") 172 | parser.add_argument("-w", "--win_size", dest="win_size", default=-1, help="Window size to use if not polar", type=int) 173 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 174 | args = parser.parse_args() 175 | if args.log == 'DEBUG': 176 | sys.excepthook = debug 177 | numeric_level = getattr(logging, args.log.upper(), None) 178 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 179 | main(args) 180 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/embedding_displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | import langchangetrack 18 | from langchangetrack.utils.dummy_regressor import DummyRegressor 19 | import gensim 20 | 21 | from langchangetrack.tsconstruction.displacements import Displacements 22 | 23 | import logging 24 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 25 | logger = logging.getLogger("langchangetrack") 26 | 27 | import psutil 28 | from multiprocessing import cpu_count 29 | 30 | p = psutil.Process(os.getpid()) 31 | p.set_cpu_affinity(list(range(cpu_count()))) 32 | 33 | 34 | def uniform(distances): 35 | return np.ones(len(distances)) 36 | 37 | 38 | def get_vectors_sg(model, norm_embedding=True): 39 | """ Return the embeddings of a skipgram model. """ 40 | if norm_embedding: 41 | return model.syn0norm 42 | else: 43 | return model.syn0 44 | 45 | 46 | def load_model_skipgram(model_path): 47 | """ Load the skipgram model from a file in word2vec format. """ 48 | return gensim.models.Word2Vec.load_word2vec_format(model_path) 49 | 50 | 51 | def load_predictor_skipgram(predictor_path): 52 | """ Load the predictor model. 
""" 53 | return pickle.load(open(predictor_path)) 54 | 55 | 56 | class EmbeddingsDisplacements(Displacements): 57 | 58 | def __init__(self, 59 | data_dir, 60 | pred_dir, 61 | words_file, 62 | timepoints, 63 | num_words, 64 | get_vectors, 65 | load_model, 66 | load_predictor, 67 | method, 68 | win_size, 69 | fixed_point, 70 | embedding_suffix, 71 | predictor_suffix, 72 | workers): 73 | """ Constructor """ 74 | # Initialize the super class. 75 | super(EmbeddingsDisplacements, self).__init__() 76 | self.get_vectors = get_vectors 77 | self.load_model = load_model 78 | self.has_predictors = True 79 | self.load_predictor = load_predictor 80 | self.norm_embedding = True 81 | self.words_file = words_file 82 | self.timepoints = timepoints 83 | self.data_dir = data_dir 84 | self.pred_dir = pred_dir 85 | self.num_words = num_words 86 | self.method = method 87 | self.win_size = win_size 88 | self.fixed_point = fixed_point 89 | self.embedding_suffix = embedding_suffix 90 | self.predictor_suffix = predictor_suffix 91 | self.workers = workers 92 | 93 | def number_distance_metrics(self): 94 | return 2 95 | 96 | def calculate_distance(self, vec1, vec2): 97 | """ Calculate distances between vector1 and vector2. """ 98 | return [cosine(vec1, vec2), euclidean(vec1, vec2)] 99 | 100 | def load_models_and_predictors(self): 101 | """ Load all the models and predictors. """ 102 | self.models = {} 103 | self.predictors = {} 104 | model_paths = [path.join(self.data_dir, timepoint + '_embeddings' + self.embedding_suffix) for timepoint in self.timepoints] 105 | predictor_handles = [path.join(self.pred_dir, timepoint + '_embeddings' + self.predictor_suffix) for timepoint in self.timepoints] 106 | loaded_models = Parallel(n_jobs=self.workers)(delayed(self.load_model)(model_path) for model_path in model_paths) 107 | for i, timepoint in enumerate(self.timepoints): 108 | self.models[timepoint] = loaded_models[i] 109 | self.predictors[timepoint] = self.load_predictor(predictor_handles[i]) 110 | if hasattr(self.predictors[timepoint], 'weight_func'): 111 | self.predictors[timepoint].weight_func = uniform 112 | print "Loaded predictor for", timepoint 113 | print "Done loading predictors" 114 | 115 | def is_present(self, timepoint, word): 116 | """ Check if the word is present in the vocabulary at this timepoint. """ 117 | model = self.get_model(timepoint) 118 | return word in model.vocab 119 | 120 | def get_vector(self, timepoint, word): 121 | """ Get the embedding for this word at the specified timepoint.""" 122 | model = self.get_model(timepoint) 123 | return self.get_vectors(model, self.norm_embedding)[model.vocab[word].index] 124 | 125 | 126 | def main(args): 127 | syear = int(args.syear) 128 | eyear = int(args.eyear) 129 | stepsize = int(args.stepsize) 130 | timepoints = np.arange(syear, eyear, stepsize) 131 | timepoints = [str(t) for t in timepoints] 132 | workers = int(args.workers) 133 | # Create the main work horse. 134 | e = EmbeddingsDisplacements(args.datadir, 135 | args.preddir, 136 | args.filename, 137 | timepoints, 138 | int(args.num_words), 139 | get_vectors_sg, 140 | load_model_skipgram, 141 | load_predictor_skipgram, 142 | args.method, 143 | args.win_size, 144 | str(args.fixed_point), 145 | args.embedding_suffix, 146 | args.predictor_suffix, 147 | workers) 148 | 149 | # Load the models and predictors 150 | e.load_models_and_predictors() 151 | 152 | # Calculate the word displacements and dump. 
153 | L, H, dfo, dfn = e.calculate_words_displacement(column_names=['word', 's', 'otherword', 't', 'cosine', 'euclidean'], n_jobs=workers) 154 | fname = 'timeseries_s_t' + '_' + args.outputsuffix + '.pkl' 155 | pickle.dump((L, H, dfo, dfn), open(path.join(args.outputdir, fname), 'wb')) 156 | 157 | if __name__ == "__main__": 158 | parser = ArgumentParser() 159 | parser.add_argument("-f", "--file", dest="filename", help="Input file for words") 160 | parser.add_argument("-d", "--data_dir", dest="datadir", help="data directory") 161 | parser.add_argument("-p", "--pred_dir", dest="preddir", help="data directory") 162 | parser.add_argument("-o", "--output_dir", dest="outputdir", help="Output directory") 163 | parser.add_argument("-os", "--output_suffix", dest="outputsuffix", help="Output suffix") 164 | parser.add_argument("-es", "--emb_suffix", dest="embedding_suffix", help="embedding suffix") 165 | parser.add_argument("-ps", "--pred_suffix", dest="predictor_suffix", help="predictor suffix") 166 | parser.add_argument("-sy", "--start", dest="syear", default='1800', help="start year") 167 | parser.add_argument("-ey", "--end", dest="eyear", default='2010', help="end year(not included)") 168 | parser.add_argument("-s", "--window_size", dest="stepsize", default=5, help="Window size for time series") 169 | parser.add_argument("-e", "--embedding_type", dest="embedding_type", default='skipgram', help="Embedding type") 170 | parser.add_argument("-m", "--method", dest="method", default="polar", help="Method to use") 171 | parser.add_argument("-w", "--win_size", dest="win_size", default="-1", help="Window size to use if not polar", type=int) 172 | parser.add_argument("-y", "--fixed_point", dest="fixed_point", default="-1", help="fixed point to use if method is fixed", type=int) 173 | parser.add_argument("-n", "--num_words", dest="num_words", default=-1, help="Number of words", type=int) 174 | parser.add_argument("-workers", "--workers", dest="workers", default=1, help="Maximum number of workers", type=int) 175 | logging.basicConfig(level=logging.INFO, format=LOGFORMAT) 176 | args = parser.parse_args() 177 | main(args) 178 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/demostrate_cp.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import pandas as pd 4 | import numpy as np 5 | import itertools 6 | import more_itertools 7 | import os 8 | 9 | from functools import partial 10 | from changepoint.mean_shift_model import MeanShiftModel 11 | from changepoint.utils.ts_stats import parallelize_func 12 | 13 | __author__ = "Vivek Kulkarni" 14 | __email__ = "viveksck@gmail.com" 15 | 16 | import psutil 17 | from multiprocessing import cpu_count 18 | 19 | p = psutil.Process(os.getpid()) 20 | p.set_cpu_affinity(list(range(cpu_count()))) 21 | 22 | # Global variable specifying which column index the time series 23 | # begins in a dataframe 24 | TS_OFFSET = 2 25 | 26 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 27 | 28 | 29 | def normalize_timeseries(df): 30 | """ 31 | Normalize each column of the data frame by its mean and standard 32 | deviation. 
33 | """ 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[2:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | 42 | def get_filtered_df(df, vocab_file): 43 | """ Return a data frame with only the words present in the vocab file. """ 44 | if vocab_file: 45 | vocab = open(vocab_file).readlines() 46 | vocab = [v.strip() for v in vocab] 47 | # Get the set of words. 48 | words = pd.Series(df.word.values.ravel()).unique() 49 | set_words = set(words) 50 | # Find the words common to data frame and vocab 51 | common_set_words = set_words & set(vocab) 52 | # Filter the dataframe 53 | df_filtered = df[df.word.isin(common_set_words)] 54 | return df_filtered 55 | else: 56 | return df 57 | 58 | 59 | def get_pval_word(df, word, B): 60 | """ 61 | Get the pvalue of a change point at each time point 't' corresponding to 62 | the word. Also return the number of tail successes during boot strap. 63 | Use a mean shift model for this. 64 | """ 65 | # Remove the first TS_OFFSET columns as it is 'index' and 'word' to get the 66 | # time series for that word. 67 | ts = df[df.word == word].values[0][TS_OFFSET:] 68 | # Create a mean shift model 69 | model = MeanShiftModel() 70 | # Detect the change points using a mean shift model 71 | stats_ts, pvals, nums = model.detect_mean_shift(ts, B=B) 72 | # Return the word and pvals associated with each time point. 73 | L = [word] 74 | L.extend(pvals) 75 | H = [word] 76 | H.extend(nums) 77 | return L, H 78 | 79 | 80 | def get_pval_word_chunk(chunk, df, B): 81 | """ Get the p-values for each time point for a chunk of words. """ 82 | results = [get_pval_word(df, w, B) for w in chunk] 83 | return results 84 | 85 | 86 | def get_minpval_cp(pvalue_df_row): 87 | """ 88 | Get the minimum p-value and the corresponding time point for each word. 89 | """ 90 | # first column is 'word', so ignore it 91 | index_series = pvalue_df_row.index[1:] 92 | row_series = pvalue_df_row.values[1:] 93 | assert(len(index_series) == len(row_series)) 94 | 95 | # Find the minimum pvalue 96 | min_pval = np.min(row_series) 97 | # Find the index where the minimum pvalue occurrs. 98 | min_idx = np.argmin(row_series) 99 | # Get the timepoint corresponding to that index 100 | min_cp = index_series[min_idx] 101 | 102 | return min_pval, min_cp 103 | 104 | 105 | def get_cp_pval(pvalue_df_row, zscore_df, threshold=0.0): 106 | """ 107 | Get the minimum p-value corresponding timepoint which also has 108 | a Z-SCORE > threshold. 109 | 110 | """ 111 | # First column is 'word', so ignore it 112 | row_series = pvalue_df_row.values[1:] 113 | # Corresponding Z-Score series for the exact same set of timepoints. 114 | zscore_series = np.array(zscore_df[zscore_df.word == pvalue_df_row.word][pvalue_df_row.index[1:]])[0] 115 | assert(len(zscore_series) == len(row_series)) 116 | 117 | # Get all the indices where zscore exceeds a threshold 118 | sel_idx = np.where(zscore_series > threshold)[0] 119 | # If there are no such indices return NAN 120 | if not len(sel_idx): 121 | return 1.0, np.nan 122 | 123 | # We have some indices. Select the pvalues for those indices. 124 | pvals_indices = np.take(row_series, sel_idx) 125 | # Find the minimum pvalue among those candidates. 
126 | min_pval = np.min(pvals_indices) 127 | # Find the minimum candidate index corresponding to that pvalue 128 | min_idx = np.argmin(pvals_indices) 129 | # Select the actual index that it corresponds to 130 | cp_idx = sel_idx[min_idx] 131 | # Translate that to the actual timepoint and return it. 132 | cp = pvalue_df_row.index[1:][cp_idx] 133 | return min_pval, cp 134 | 135 | 136 | def main(args): 137 | # Read the arguments 138 | df_f = args.filename 139 | pval_file = args.pval_file 140 | sample_file = args.sample_file 141 | col_to_drop = args.col 142 | threshold = float(args.threshold) 143 | workers = args.workers 144 | print "Config:" 145 | print "Input data frame file name:", df_f 146 | print "Output pvalue file", pval_file 147 | print "Output sample file", sample_file 148 | print "Columns to drop", col_to_drop 149 | print "Threshold", threshold 150 | 151 | # Read the time series data 152 | norm_df = pd.read_csv(df_f) 153 | 154 | # Drop the column if needed. We typically drop the 1st column as it always is 0 by 155 | # default. 156 | if col_to_drop in norm_df.columns: 157 | cols = norm_df.columns.tolist() 158 | if col_to_drop == norm_df.columns[-1]: 159 | time_points = cols[2:] 160 | new_cols = cols[0:2] + time_points[::-1] 161 | norm_df = norm_df[new_cols] 162 | norm_df.drop(col_to_drop, axis=1, inplace=True) 163 | print "Dropped column", col_to_drop 164 | 165 | print "Columns of the data frame are", norm_df.columns 166 | cwords = norm_df.word.values 167 | print "Number of words we are analyzing:", len(cwords) 168 | 169 | chunksz = np.ceil(len(cwords) / float(workers)) 170 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B) 171 | 172 | pvals, num_samples = zip(*results) 173 | 174 | header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1]) 175 | pvalue_df = pd.DataFrame().from_records(list(pvals), columns=header) 176 | 177 | # Append additonal columns to the final df 178 | pvalue_df_final = pvalue_df.copy(deep=True) 179 | 180 | pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(*pvalue_df.apply(get_minpval_cp, axis=1)) 181 | pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(*pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df, threshold=threshold)) 182 | 183 | pvalue_df_final.drop(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1], axis=1, inplace = True) 184 | 185 | # Write the pvalue output. 
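# (num_samples_df below holds the per-word bootstrap tail counts and is written to sample_file;
# the p-value frame, sorted by 'tpval', is what ends up in pval_file.)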
186 | num_samples_df = pd.DataFrame().from_records(list(num_samples), columns=header) 187 | num_samples_df.to_csv(sample_file, encoding='utf-8') 188 | 189 | # Write the sample output 190 | sdf = pvalue_df_final.sort(columns=['tpval']) 191 | sdf.to_csv(pval_file, encoding='utf-8') 192 | 193 | if __name__ == "__main__": 194 | parser = ArgumentParser() 195 | parser.add_argument("-f", "--file", dest="filename", help="Input time series file") 196 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Output pvalue file") 197 | parser.add_argument("-n", "--nfile", dest="sample_file", help="Output sample file") 198 | parser.add_argument("-c", "--col", dest="col", help="column to drop") 199 | parser.add_argument("-t", "--threshold", dest="threshold", default=1.75, type=float, help="Threshold to use for mean shift model.") 200 | parser.add_argument("-b", "--bootstrap", dest="B", default=1000, type=int, help="Number of bootstrapped samples to take(default:1000)") 201 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 202 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 203 | args = parser.parse_args() 204 | if args.log == 'DEBUG': 205 | sys.excepthook = debug 206 | numeric_level = getattr(logging, args.log.upper(), None) 207 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 208 | main(args) 209 | -------------------------------------------------------------------------------- /langchangetrack/cpdetection/detect_changepoints_word_ts.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import logging 3 | import pandas as pd 4 | import numpy as np 5 | import itertools 6 | import more_itertools 7 | import os 8 | 9 | from functools import partial 10 | from changepoint.mean_shift_model import MeanShiftModel 11 | from changepoint.utils.ts_stats import parallelize_func 12 | 13 | __author__ = "Vivek Kulkarni" 14 | __email__ = "viveksck@gmail.com" 15 | 16 | import psutil 17 | from multiprocessing import cpu_count 18 | 19 | p = psutil.Process(os.getpid()) 20 | p.set_cpu_affinity(list(range(cpu_count()))) 21 | 22 | # Global variable specifying which column index the time series 23 | # begins in a dataframe 24 | TS_OFFSET = 2 25 | 26 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 27 | 28 | 29 | def normalize_timeseries(df): 30 | """ 31 | Normalize each column of the data frame by its mean and standard 32 | deviation. 33 | """ 34 | dfm = df.copy(deep=True) 35 | dfmean = df.mean() 36 | dfstd = df.std() 37 | for col in df.columns[2:]: 38 | dfm[col] = (df[col] - dfmean[col]) / dfstd[col] 39 | return dfm 40 | 41 | 42 | def get_filtered_df(df, vocab_file): 43 | """ Return a data frame with only the words present in the vocab file. """ 44 | if vocab_file: 45 | vocab = open(vocab_file).readlines() 46 | vocab = [v.strip() for v in vocab] 47 | # Get the set of words. 48 | words = pd.Series(df.word.values.ravel()).unique() 49 | set_words = set(words) 50 | # Find the words common to data frame and vocab 51 | common_set_words = set_words & set(vocab) 52 | # Filter the dataframe 53 | df_filtered = df[df.word.isin(common_set_words)] 54 | return df_filtered 55 | else: 56 | return df 57 | 58 | 59 | def get_pval_word(df, word, B): 60 | """ 61 | Get the pvalue of a change point at each time point 't' corresponding to 62 | the word. Also return the number of tail successes during boot strap. 
63 | Use a mean shift model for this. 64 | """ 65 | # Remove the first TS_OFFSET columns as it is 'index' and 'word' to get the 66 | # time series for that word. 67 | ts = df[df.word == word].values[0][TS_OFFSET:] 68 | # Create a mean shift model 69 | model = MeanShiftModel() 70 | # Detect the change points using a mean shift model 71 | stats_ts, pvals, nums = model.detect_mean_shift(ts, B=B) 72 | # Return the word and pvals associated with each time point. 73 | L = [word] 74 | L.extend(pvals) 75 | H = [word] 76 | H.extend(nums) 77 | return L, H 78 | 79 | 80 | def get_pval_word_chunk(chunk, df, B): 81 | """ Get the p-values for each time point for a chunk of words. """ 82 | results = [get_pval_word(df, w, B) for w in chunk] 83 | return results 84 | 85 | 86 | def get_minpval_cp(pvalue_df_row): 87 | """ 88 | Get the minimum p-value and the corresponding time point for each word. 89 | """ 90 | # first column is 'word', so ignore it 91 | index_series = pvalue_df_row.index[1:] 92 | row_series = pvalue_df_row.values[1:] 93 | assert(len(index_series) == len(row_series)) 94 | 95 | # Find the minimum pvalue 96 | min_pval = np.min(row_series) 97 | # Find the index where the minimum pvalue occurrs. 98 | min_idx = np.argmin(row_series) 99 | # Get the timepoint corresponding to that index 100 | min_cp = index_series[min_idx] 101 | 102 | return min_pval, min_cp 103 | 104 | 105 | def get_cp_pval(pvalue_df_row, zscore_df, threshold=0.0): 106 | """ 107 | Get the minimum p-value corresponding timepoint which also has 108 | a Z-SCORE > threshold. 109 | 110 | """ 111 | # First column is 'word', so ignore it 112 | row_series = pvalue_df_row.values[1:] 113 | # Corresponding Z-Score series for the exact same set of timepoints. 114 | zscore_series = np.array(zscore_df[zscore_df.word == pvalue_df_row.word][pvalue_df_row.index[1:]])[0] 115 | assert(len(zscore_series) == len(row_series)) 116 | 117 | # Get all the indices where zscore exceeds a threshold 118 | sel_idx = np.where(zscore_series > threshold)[0] 119 | # If there are no such indices return NAN 120 | if not len(sel_idx): 121 | return 1.0, np.nan 122 | 123 | # We have some indices. Select the pvalues for those indices. 124 | pvals_indices = np.take(row_series, sel_idx) 125 | # Find the minimum pvalue among those candidates. 126 | min_pval = np.min(pvals_indices) 127 | # Find the minimum candidate index corresponding to that pvalue 128 | min_idx = np.argmin(pvals_indices) 129 | # Select the actual index that it corresponds to 130 | cp_idx = sel_idx[min_idx] 131 | # Translate that to the actual timepoint and return it. 132 | cp = pvalue_df_row.index[1:][cp_idx] 133 | return min_pval, cp 134 | 135 | 136 | def main(args): 137 | # Read the arguments 138 | df_f = args.filename 139 | common_vocab_file = args.vocab_file 140 | pval_file = args.pval_file 141 | sample_file = args.sample_file 142 | col_to_drop = args.col 143 | should_normalize = not(args.dont_normalize) 144 | threshold = float(args.threshold) 145 | 146 | workers = args.workers 147 | 148 | print "Config:" 149 | print "Input data frame file name:", df_f 150 | print "Vocab file", common_vocab_file 151 | print "Output pvalue file", pval_file 152 | print "Output sample file", sample_file 153 | print "Columns to drop", col_to_drop 154 | print "Normalize Time series:", should_normalize 155 | print "Threshold", threshold 156 | 157 | # Read the time series data 158 | df = pd.read_csv(df_f) 159 | # Consider only words in the common vocabulary. 
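# (get_filtered_df() returns the frame unchanged when no vocabulary file is given, so -v/--vfile is optional.)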
160 | df = get_filtered_df(df, common_vocab_file) 161 | 162 | # Normalize the data frame 163 | if should_normalize: 164 | norm_df = normalize_timeseries(df) 165 | else: 166 | norm_df = df 167 | 168 | # Drop the column if needed. We typically drop the 1st column as it always is 0 by 169 | # default. 170 | if col_to_drop in norm_df.columns: 171 | cols = df.columns.tolist() 172 | if col_to_drop == norm_df.columns[-1]: 173 | time_points = cols[2:] 174 | new_cols = cols[0:2] + time_points[::-1] 175 | norm_df = norm_df[new_cols] 176 | norm_df.drop(col_to_drop, axis=1, inplace=True) 177 | print "Dropped column", col_to_drop 178 | 179 | print "Columns of the data frame are", norm_df.columns 180 | cwords = norm_df.word.values 181 | print "Number of words we are analyzing:", len(cwords) 182 | 183 | chunksz = np.ceil(len(cwords) / float(workers)) 184 | results = parallelize_func(cwords[:], get_pval_word_chunk, chunksz=chunksz, n_jobs=workers, df=norm_df, B=args.B) 185 | 186 | pvals, num_samples = zip(*results) 187 | 188 | header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1]) 189 | pvalue_df = pd.DataFrame().from_records(list(pvals), columns=header) 190 | 191 | # Append additonal columns to the final df 192 | pvalue_df_final = pvalue_df.copy(deep=True) 193 | 194 | pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(*pvalue_df.apply(get_minpval_cp, axis=1)) 195 | pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(*pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df, threshold=threshold)) 196 | 197 | pvalue_df_final.drop(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1], axis=1, inplace = True) 198 | 199 | # Write the pvalue output. 200 | num_samples_df = pd.DataFrame().from_records(list(num_samples), columns=header) 201 | num_samples_df.to_csv(sample_file, encoding='utf-8') 202 | 203 | # Write the sample output 204 | sdf = pvalue_df_final.sort(columns=['tpval']) 205 | sdf.to_csv(pval_file, encoding='utf-8') 206 | 207 | if __name__ == "__main__": 208 | parser = ArgumentParser() 209 | parser.add_argument("-f", "--file", dest="filename", help="Input time series file") 210 | parser.add_argument("-v", "--vfile", dest="vocab_file", help="Common Vocab file") 211 | parser.add_argument("-p", "--pfile", dest="pval_file", help="Output pvalue file") 212 | parser.add_argument("-n", "--nfile", dest="sample_file", help="Output sample file") 213 | parser.add_argument("-c", "--col", dest="col", help="column to drop") 214 | parser.add_argument("-d", "--dont_normalize", dest="dont_normalize", action='store_true', default=False, help="Dont normalize") 215 | parser.add_argument("-t", "--threshold", dest="threshold", default=1.75, type=float, help="Threshold to use for mean shift model.") 216 | parser.add_argument("-b", "--bootstrap", dest="B", default=1000, type=int, help="Number of bootstrapped samples to take(default:1000)") 217 | parser.add_argument("-w", "--workers", dest="workers", default=1, type=int, help="Number of workers to use") 218 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", default="INFO") 219 | args = parser.parse_args() 220 | if args.log == 'DEBUG': 221 | sys.excepthook = debug 222 | numeric_level = getattr(logging, args.log.upper(), None) 223 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 224 | main(args) 225 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/displacements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | import os 7 | from os import path 8 | import cPickle as pickle 9 | import numpy as np 10 | import scipy 11 | import itertools 12 | from scipy.spatial.distance import cosine, euclidean, norm 13 | import pandas as pd 14 | import more_itertools 15 | from joblib import Parallel, delayed 16 | 17 | from langchangetrack.utils.dummy_regressor import DummyRegressor 18 | import gensim 19 | 20 | import logging 21 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 22 | logger = logging.getLogger("langchangetrack") 23 | 24 | import psutil 25 | from multiprocessing import cpu_count 26 | 27 | p = psutil.Process(os.getpid()) 28 | p.set_cpu_affinity(list(range(cpu_count()))) 29 | 30 | def normalize_vector(vec): 31 | """ Normalize a vector by its L2 norm. """ 32 | norm = (vec ** 2).sum() ** 0.5 33 | return (vec / norm) 34 | 35 | 36 | def pairwise(iterable): 37 | """ [a,b,c,d]=>[(a,b), (b,c), (c, d)] """ 38 | a, b = itertools.tee(iterable) 39 | next(b, None) 40 | return itertools.izip(a, b) 41 | 42 | 43 | def process_word_source(w, eobj): 44 | """ Calculate displacements of word for source timepoint tuples. """ 45 | return eobj.process_word(w, 0) 46 | 47 | 48 | def process_word_dest(w, eobj): 49 | """ Calculate displacements of word for destination timepoint tuples.""" 50 | return eobj.process_word(w, 1) 51 | 52 | 53 | def process_chunk(chunk, func, *args): 54 | """ Apply a function on each element of a iterable. """ 55 | L = [] 56 | for i, e in enumerate(chunk): 57 | L.append(func(e, *args)) 58 | if i % 10 == 0: 59 | print "Processing chunk", i 60 | return L 61 | 62 | 63 | class Displacements(object): 64 | 65 | def __init__(self): 66 | """ Constructor """ 67 | self.get_vectors = None 68 | self.load_model = None 69 | self.models = {} 70 | self.has_predictors = False 71 | self.load_predictor = None 72 | self.predictors = {} 73 | self.norm_embedding = False 74 | self.words_file = None 75 | self.timepoints = None 76 | self.data_dir = None 77 | self.pred_dir = None 78 | self.num_words = -1 79 | self.method = None 80 | self.win_size = -1 81 | self.fixed_point = -1 82 | self.embedding_suffix = None 83 | self.predictor_suffix = None 84 | 85 | def get_word_list(self): 86 | """ Returns a list of words for which time series needs to be generated. 87 | """ 88 | 89 | words_list = open(self.words_file, 'r').read().split('\n') 90 | if words_list[-1] == '': 91 | words_list = words_list[:-1] 92 | if self.num_words != -1: 93 | return words_list[:num_words] 94 | else: 95 | return words_list 96 | 97 | def get_tuples(self, word, timepoint1, timepoint2): 98 | """ Return what time point pairs we must consider fot the word. 
""" 99 | return [(word, timepoint1, word, timepoint2)] 100 | 101 | def generate_displacement_word(self, word, timepoints): 102 | L = [] 103 | 104 | for ot, nt in timepoints: 105 | modelo = self.get_predictor(ot) 106 | modeln = self.get_predictor(nt) 107 | tuples = self.get_tuples(word, ot, nt) 108 | 109 | for tup in tuples: 110 | word1 = tup[0] 111 | timepoint1 = tup[1] 112 | word2 = tup[2] 113 | timepoint2 = tup[3] 114 | 115 | if self.is_present(timepoint1, word1) and self.is_present(timepoint2, word2): 116 | vec1 = self.get_vector(timepoint1, word1) 117 | vec2 = self.get_vector(timepoint2, word2) 118 | 119 | if self.norm_embedding: 120 | assert(np.isclose(norm(vec1), 1.0)) 121 | assert(np.isclose(norm(vec2), 1.0)) 122 | 123 | vec1_pred = modelo.predict(vec1) 124 | vec2_pred = modeln.predict(vec2) 125 | 126 | if self.norm_embedding: 127 | vec1_pred = normalize_vector(vec1_pred) 128 | vec2_pred = normalize_vector(vec2_pred) 129 | assert(np.isclose(norm(vec1), 1.0)) 130 | assert(np.isclose(norm(vec2), 1.0)) 131 | 132 | d = self.calculate_distance(vec1_pred, vec2_pred) 133 | assert(len(d) == self.number_distance_metrics()) 134 | L.append([word1, timepoint1, word2, timepoint2] + d) 135 | else: 136 | # Word is not present in both time periods 137 | L.append([word1, timepoint1, word2, timepoint2] + list(itertools.repeat(np.nan, self.number_distance_metrics()))) 138 | return L 139 | 140 | def get_timepoints_word(self, w, timepoints): 141 | """ Get the list of timepoints to be considered for a word. """ 142 | for i, t in enumerate(timepoints): 143 | if self.is_present(t, w): 144 | break 145 | # We have foind the first instance of the word at this time point, 146 | timepoints_considered = timepoints[i:] 147 | 148 | # Create the tuples for calculating displacements based on strategy 149 | # used. 150 | if self.method == "polar": 151 | timepoints1 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[0], len(timepoints_considered)))) 152 | timepoints2 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[-1], len(timepoints_considered)))) 153 | elif self.method == 'win': 154 | timepoints1 = zip(timepoints_considered[win_size:], timepoints_considered[:-win_size]) 155 | timepoints2 = zip(timepoints_considered[:-win_size], timepoints_considered[win_size:]) 156 | elif self.method == 'fixed': 157 | timepoints1 = zip(timepoints_considered, list(itertools.repeat(fixed_point, len(timepoints_considered)))) 158 | timepoints2 = zip(timepoints_considered, list(itertools.repeat(timepoints_considered[-1], len(timepoints_considered)))) 159 | 160 | # Return the list if tuples 161 | return timepoints1, timepoints2 162 | 163 | def process_word(self, w, index): 164 | """ Calculate displacements of the word at each timepoint tuple. 165 | index: Are we using timepoints1 or timepoints2. 166 | """ 167 | t = self.get_timepoints_word(w, self.timepoints) 168 | return self.generate_displacement_word(w, t[index]) 169 | 170 | def calculate_words_displacement(self, column_names, n_jobs = 1): 171 | """ Calculate word displacements for each word in the Pandas data frame. """ 172 | 173 | words = self.get_word_list() 174 | # Create chunks of the words to be processed. 
175 | chunk_sz = np.ceil(len(words)/float(n_jobs)) 176 | chunks = list(more_itertools.chunked(words, chunk_sz)) 177 | 178 | # Calculate the displacements 179 | chunksL = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_source, self) for chunk in chunks) 180 | chunksH = Parallel(n_jobs=n_jobs, verbose=20)(delayed(process_chunk)(chunk, process_word_dest, self) for chunk in chunks) 181 | L = more_itertools.flatten(chunksL) 182 | H = more_itertools.flatten(chunksH) 183 | flattendL = [x for sublist in L for x in sublist] 184 | flattendH = [x for sublist in H for x in sublist] 185 | 186 | # Store the results in a nice pandas data frame 187 | dfo, dfn = self.create_data_frames(flattendL, flattendH, column_names) 188 | return flattendL, flattendH, dfo, dfn 189 | 190 | def create_data_frames(self, L, H, column_names): 191 | """ Store the displacement of each word for the pair of timepoints in a 192 | nice Pandas data frame. """ 193 | dfo = pd.DataFrame() 194 | dfo = dfo.from_records(L, columns=column_names) 195 | dfo_clean = dfo.fillna(method='ffill') 196 | dfn = pd.DataFrame() 197 | dfn = dfn.from_records(H, columns=column_names) 198 | dfn_clean = dfn.fillna(method='bfill') 199 | return dfo_clean, dfn_clean 200 | 201 | def get_model(self, timepoint): 202 | """ Return the model corresponding to this timepoint. """ 203 | return self.models[timepoint] 204 | 205 | def get_predictor(self, timepoint): 206 | """ Return the predictor corresponding to this timepoint. """ 207 | return self.predictors[timepoint] 208 | 209 | def number_distance_metrics(self): 210 | """ The number of distance metrics evaluated by calculate_distance. """ 211 | raise NotImplementedError, "Pure virtual function" 212 | 213 | def calculate_distance(self, vec1, vec2): 214 | """ Calculate distances between vector1 and vector2. """ 215 | raise NotImplementedError, "Pure virtual function" 216 | 217 | def load_models_and_predictors(self): 218 | raise NotImplementedError, "Pure virtual function" 219 | 220 | def is_present(self, timepoint, word): 221 | """ Check if the word is present in the vocabulary at this timepoint. """ 222 | raise NotImplementedError, "Pure virtual function" 223 | 224 | def get_vector(self, timepoint, word): 225 | """ Get the embedding for this word at the specified timepoint.""" 226 | raise NotImplementedError, "Pure virtual function" 227 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # langchangetrack documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another 20 | # directory, add these directories to sys.path here. If the directory is 21 | # relative to the documentation root, use os.path.abspath to make it 22 | # absolute, like shown here. 
23 | #sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # Get the project root dir, which is the parent dir of this 26 | cwd = os.getcwd() 27 | project_root = os.path.dirname(cwd) 28 | 29 | # Insert the project root dir as the first element in the PYTHONPATH. 30 | # This lets us ensure that the source package is imported, and that its 31 | # version is used. 32 | sys.path.insert(0, project_root) 33 | 34 | import langchangetrack 35 | 36 | # -- General configuration --------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | #needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 43 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix of source filenames. 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = u'langchangetrack' 59 | copyright = u'2015, Vivek Kulkarni' 60 | 61 | # The version info for the project you're documenting, acts as replacement 62 | # for |version| and |release|, also used in various other places throughout 63 | # the built documents. 64 | # 65 | # The short X.Y version. 66 | version = langchangetrack.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = langchangetrack.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to 75 | # some non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built 106 | # documents. 107 | #keep_warnings = False 108 | 109 | 110 | # -- Options for HTML output ------------------------------------------- 111 | 112 | # The theme to use for HTML and HTML Help pages. See the documentation for 113 | # a list of builtin themes. 114 | html_theme = 'default' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a 117 | # theme further. 
For a list of options available for each theme, see the 118 | # documentation. 119 | #html_theme_options = {} 120 | 121 | # Add any paths that contain custom themes here, relative to this directory. 122 | #html_theme_path = [] 123 | 124 | # The name for this set of Sphinx documents. If None, it defaults to 125 | # " v documentation". 126 | #html_title = None 127 | 128 | # A shorter title for the navigation bar. Default is the same as 129 | # html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the 133 | # top of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon 137 | # of the docs. This file should be a Windows icon file (.ico) being 138 | # 16x16 or 32x32 pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) 142 | # here, relative to this directory. They are copied after the builtin 143 | # static files, so a file named "default.css" will overwrite the builtin 144 | # "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page 148 | # bottom, using the given strftime format. 149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names 159 | # to template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. 175 | # Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. 179 | # Default is True. 180 | #html_show_copyright = True 181 | 182 | # If true, an OpenSearch description file will be output, and all pages 183 | # will contain a tag referring to it. The value of this option 184 | # must be the base URL from which the finished HTML is served. 185 | #html_use_opensearch = '' 186 | 187 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 188 | #html_file_suffix = None 189 | 190 | # Output file base name for HTML help builder. 191 | htmlhelp_basename = 'langchangetrackdoc' 192 | 193 | 194 | # -- Options for LaTeX output ------------------------------------------ 195 | 196 | latex_elements = { 197 | # The paper size ('letterpaper' or 'a4paper'). 198 | #'papersize': 'letterpaper', 199 | 200 | # The font size ('10pt', '11pt' or '12pt'). 201 | #'pointsize': '10pt', 202 | 203 | # Additional stuff for the LaTeX preamble. 204 | #'preamble': '', 205 | } 206 | 207 | # Grouping the document tree into LaTeX files. List of tuples 208 | # (source start file, target name, title, author, documentclass 209 | # [howto/manual]). 
210 | latex_documents = [ 211 | ('index', 'langchangetrack.tex', 212 | u'langchangetrack Documentation', 213 | u'Vivek Kulkarni', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at 217 | # the top of the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings 221 | # are parts, not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | #latex_domain_indices = True 235 | 236 | 237 | # -- Options for manual page output ------------------------------------ 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'langchangetrack', 243 | u'langchangetrack Documentation', 244 | [u'Vivek Kulkarni'], 1) 245 | ] 246 | 247 | # If true, show URL addresses after external links. 248 | #man_show_urls = False 249 | 250 | 251 | # -- Options for Texinfo output ---------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ('index', 'langchangetrack', 258 | u'langchangetrack Documentation', 259 | u'Vivek Kulkarni', 260 | 'langchangetrack', 261 | 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
275 | #texinfo_no_detailmenu = False 276 | -------------------------------------------------------------------------------- /langchangetrack/tsconstruction/distributional/scripts/learn_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Benchmark for the quality of the joint space""" 5 | 6 | from argparse import ArgumentParser 7 | import logging 8 | import sys 9 | from io import open 10 | import os 11 | from os import path 12 | from time import time 13 | from glob import glob 14 | from collections import defaultdict 15 | from copy import deepcopy 16 | from random import shuffle 17 | import json 18 | import cPickle as pickle 19 | 20 | from sklearn.linear_model import LinearRegression 21 | from sklearn.neighbors import NearestNeighbors 22 | import numpy 23 | from numpy import asarray 24 | from langchangetrack.utils.LocalLinearRegression import LocalLinearRegression 25 | 26 | __author__ = "Rami Al-Rfou" 27 | __email__ = "rmyeid@gmail.com" 28 | 29 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 30 | 31 | reg_model = None 32 | K_NN = 1000 33 | 34 | 35 | class Mapping(object): 36 | 37 | """ Mapping between terms/phrases.""" 38 | 39 | def __init__(self, source=None, target=None): 40 | self.s_lang = source 41 | self.t_lang = target 42 | self.map = None 43 | 44 | 45 | class IdentityTranslations(Mapping): 46 | 47 | def __init__(self, source, target, se, te): 48 | super(IdentityTranslations, self).__init__(source, target) 49 | words = set(se.word_id.keys()) & set(te.word_id.keys()) 50 | D = {} 51 | for word in words: 52 | D[word] = word 53 | self.map = D 54 | 55 | 56 | class Embeddings(object): 57 | 58 | """ A list of words and their vector representatoins. 59 | 60 | We assume that the given words are sorted by their frequency. 
61 | """ 62 | 63 | def __init__(self, lang, filename=None, vectors=None, words=None): 64 | 65 | self.lang = lang 66 | if filename: 67 | self.filename = filename 68 | self.read_file() 69 | 70 | if vectors != None: 71 | self.vectors = asarray(vectors) 72 | if words: 73 | if len(set(words)) == len(words): 74 | self.word_id = {w: i for i, w in enumerate(words)} 75 | else: 76 | logging.debug("We have duplicate words.") 77 | self.word_id = {u'{}_{}'.format(w, i): i for i, w in enumerate(words)} 78 | self.id_word = {i: w for w, i in self.word_id.iteritems()} 79 | self.words = [w for w, i in Embeddings.sorted_words(self.word_id)] 80 | 81 | def read_file(self): 82 | raise NotImplementedError("Implement an embeddings reader.") 83 | 84 | def get_vectors(self, words=None): 85 | if words: 86 | return asarray([self.vectors[self.word_id[w]] for w in words]) 87 | return self.vectors 88 | 89 | def __most_frequent(self, n, start=0): 90 | return [x for x, y in sorted(self.word_id.iteritems(), key=lambda(x, y): y)[start:n]] 91 | 92 | def most_frequent(self, n, start=0): 93 | return Embeddings(lang=self.lang, words=self.words[start:n], 94 | vectors=self.vectors[start:n]) 95 | 96 | def least_frequent_n(self, n): 97 | return [x for x, y in sorted(self.word_id.iteritems(), 98 | key=lambda(x, y): y, reverse=True)[:n]] 99 | 100 | def words_translations(self, other, mapping, segment): 101 | start, end = segment 102 | s_words = self.__most_frequent(n=end, start=start) 103 | 104 | map_ = mapping.map 105 | t_words = [map_[w] for w in s_words] 106 | exact = [(w1, w2) for (w1, w2) in zip(s_words, t_words) if w1.lower() == w2.lower()] 107 | logging.info("{} exact words translations in between {}-{} for " 108 | "{}-{} languages.".format(len(exact), start, end, mapping.s_lang, mapping.t_lang)) 109 | 110 | s_new_vectors = self.vectors[start:end] 111 | t_new_vectors = asarray([other.vectors[other.word_id[w]] for w in t_words]) 112 | 113 | source = Embeddings(vectors=s_new_vectors, words=s_words, lang=self.lang) 114 | target = Embeddings(vectors=t_new_vectors, words=t_words, lang=other.lang) 115 | return (source, target) 116 | 117 | @staticmethod 118 | def sorted_words(word_id): 119 | return sorted(word_id.iteritems(), key=lambda(x, y): y) 120 | 121 | def get_common(self, other, mapping): 122 | """ Limit the two embeddings to the terms that are covered by the mapping.""" 123 | 124 | self_oov = defaultdict(lambda: 0) 125 | other_oov = defaultdict(lambda: 0) 126 | self_word_id = deepcopy(self.word_id) 127 | other_word_id = deepcopy(other.word_id) 128 | new_words = [] 129 | map_ = mapping.map 130 | for i, w in enumerate(self.word_id): 131 | if w not in map_: 132 | self_oov[w] += 1 133 | del self_word_id[w] 134 | continue 135 | 136 | if map_[w] not in other.word_id: 137 | other_oov[map_[w]] += 1 138 | del self_word_id[w] 139 | 140 | for i, w in enumerate(other.word_id): 141 | if w not in map_: 142 | del other_word_id[w] 143 | 144 | logging.info("We could not find {} {} words in our dictionary.".format( 145 | len(self_oov), self.lang)) 146 | logging.info("We could not find {} {} words in our target words.".format( 147 | len(other_oov), other.lang)) 148 | logging.info("Our {} vocabulary has {} valid words.".format( 149 | self.lang, len(self_word_id))) 150 | 151 | sorted_self_word_id = Embeddings.sorted_words(self_word_id) 152 | self_vectors = asarray([self.vectors[i] for w, i in sorted_self_word_id]) 153 | self_words = [w for w, i in sorted_self_word_id] 154 | new_self = Embeddings(lang=self.lang, vectors=self_vectors, 
words=self_words) 155 | 156 | sorted_other_word_id = Embeddings.sorted_words(other_word_id) 157 | other_vectors = asarray([other.vectors[i] for w, i in sorted_other_word_id]) 158 | other_words = [w for w, i in sorted_other_word_id] 159 | new_other = Embeddings(lang=self.lang, vectors=other_vectors, words=other_words) 160 | 161 | return (new_self, new_other) 162 | 163 | def split(self, mapping, ignore_exact=True): 164 | """ Generates two embeddings that cover the mapping terms. 165 | 166 | If we have a1: b1, a2: b2 mappings in an embeddings space where {a1, b1, 167 | a2, b2} exists, we would like to generates two embeddings spaces one for 168 | {a1, a2} and another for {b1, b2}. 169 | 170 | Sometimes it is not desirable to include exact terms a3:a3 in the new 171 | embeddings. Hence, you need to ignore the exact terms. 172 | """ 173 | 174 | source_oov = defaultdict(lambda: 0) 175 | target_oov = defaultdict(lambda: 0) 176 | w_exact = defaultdict(lambda: 0) 177 | 178 | source_words = [] 179 | target_words = [] 180 | map_ = mapping.map 181 | for w, id_ in self.word_id.iteritems(): 182 | if w not in map_: 183 | source_oov[w] += 1 184 | continue 185 | 186 | if map_[w] not in self.word_id: 187 | target_oov[map_[w]] += 1 188 | continue 189 | 190 | if w.lower() == map_[w].lower(): 191 | w_exact[w] += 1 192 | if ignore_exact: 193 | continue 194 | 195 | source_words.append(w) 196 | target_words.append(map_[w]) 197 | 198 | logging.debug("We could not find {} source words in our dictionary.".format( 199 | len(source_oov))) 200 | logging.debug("We could not find {} target words in our target words.".format( 201 | len(target_oov))) 202 | logging.debug("{} words are exact between languages".format(len(w_exact))) 203 | logging.debug("We found {} pairs of words valid for testing.".format(len(source_words))) 204 | 205 | new_s_vectors = asarray([self.vectors[self.word_id[w]] for w in source_words]) 206 | source = Embeddings(vectors=new_s_vectors, words=source_words, 207 | lang=mapping.s_lang) 208 | 209 | new_t_vectors = asarray([self.vectors[self.word_id[w]] for w in target_words]) 210 | target = Embeddings(vectors=new_t_vectors, words=target_words, 211 | lang=mapping.t_lang) 212 | new_mapping = Mapping(source=mapping.s_lang, target=mapping.t_lang) 213 | new_mapping.map = dict(zip(source.words, target.words)) 214 | return (source, target, new_mapping) 215 | 216 | def common(self, other): 217 | """ Find common terms between languages. 218 | 219 | The post condition is that both embeddings vocabulary are in the same 220 | order. 
221 | """ 222 | 223 | common_words = [] 224 | for word in self.word_id: 225 | if word in other.word_id: 226 | common_words.append(word) 227 | 228 | new_self_vectors = [] 229 | new_other_vectors = [] 230 | for word in common_words: 231 | new_self_vectors.append(self.vectors[self.word_id[word]]) 232 | new_other_vectors.append(other.vectors[other.word_id[word]]) 233 | 234 | new_self = Embeddings(vectors=asarray(new_self_vectors), words=common_words, 235 | lang=self.lang) 236 | 237 | new_other = Embeddings(vectors=asarray(new_other_vectors), words=common_words, 238 | lang=self.lang) 239 | 240 | return (new_self, new_other) 241 | 242 | 243 | class Word2VecEmbeddings(Embeddings): 244 | 245 | """ Word2Vec embeddings reader.""" 246 | 247 | def read_file(self, limit=-1): 248 | words = [] 249 | embeddings = [] 250 | with open(self.filename, 'rb') as f: 251 | words_number, size = [int(x) for x in f.readline().strip().split()][:2] 252 | for i, line in enumerate(f): 253 | try: 254 | ws = line.decode('utf-8').strip().split() 255 | words.append(' '.join(ws[:-size])) 256 | embeddings.append([float(x) for x in ws[-size:]]) 257 | if i == limit: 258 | break 259 | except Exception, e: 260 | print "Exception", i 261 | print "Exception", line 262 | self.word_id = {w: i for i, w in enumerate(words)} 263 | self.vectors = asarray(embeddings) 264 | assert len(self.word_id) == self.vectors.shape[0] 265 | 266 | 267 | class Evaluator(object): 268 | 269 | """ Evaluator of the alignment between two languages.""" 270 | 271 | def __init__(self, source_embeddings, target_embeddings, metric='l2', k=5): 272 | self.metric = metric 273 | self.source_embeddings = source_embeddings 274 | self.target_embeddings = target_embeddings 275 | self.k = k 276 | self.row_normalize = True 277 | self.col_normalize = False 278 | 279 | @staticmethod 280 | def cosine_knn(vectors, point, k): 281 | distances = numpy.dot(vectors, point) 282 | indices = list(reversed(distances.argsort()))[:k] 283 | return distances[indices], [indices] 284 | 285 | def norm(self, vectors): 286 | out = vectors 287 | if self.row_normalize: 288 | norms = (vectors ** 2).sum(axis=1) ** 0.5 289 | out = (vectors.T / norms).T 290 | 291 | if self.col_normalize: 292 | norms = (vectors ** 2).sum(axis=0) ** 0.5 293 | norms[norms == 0] = 1 294 | out = vectors / norms 295 | return out 296 | 297 | def precision_at_k(self, test_pairs): 298 | if self.metric == 'cosine': 299 | return self.precision_at_k_cosine(test_pairs) 300 | return self.precision_at_k_l2(test_pairs) 301 | 302 | def precision_at_k_l2(self, test_pairs): 303 | t_knn = NearestNeighbors(n_neighbors=self.k, algorithm='ball_tree', p=2) 304 | t_knn.fit(self.target_embeddings.vectors) 305 | 306 | right = 0 307 | index = 0 308 | for s, t in test_pairs: 309 | assert(s == t) 310 | point = self.source_embeddings.vectors[self.source_embeddings.word_id[s]] 311 | distances, indices = t_knn.kneighbors(point) 312 | 313 | t_words = [self.target_embeddings.id_word[i] for i in indices[0]] 314 | t = t.rsplit('_', 1)[0] 315 | t_words = [x.rsplit('_', 1)[0] for x in t_words] 316 | 317 | line = u"{: <20}{:<20}{:<50}".format(s, t, u' '.join(t_words)) 318 | logging.debug(line.encode('utf-8')) 319 | if t in t_words: 320 | right += 1 321 | index = index + 1 322 | return right / float(len(test_pairs)) 323 | 324 | def precision_at_k_cosine(self, test_pairs): 325 | s_vectors = self.norm(self.source_embeddings.vectors) 326 | t_vectors = self.norm(self.target_embeddings.vectors) 327 | 328 | right = 0 329 | for s, t in test_pairs: 330 | point = 
self.source_embeddings.vectors[self.source_embeddings.word_id[s]] 331 | distances, indices = Evaluator.cosine_knn(t_vectors, point, self.k) 332 | 333 | t_words = [self.target_embeddings.id_word[i] for i in indices[0]] 334 | 335 | t = t.rsplit('_', 1)[0] 336 | t_words = [x.rsplit('_', 1)[0] for x in t_words] 337 | 338 | line = u"{: <20}{:<20}{:<50}".format(s, t, u' '.join(t_words)) 339 | logging.debug(line.encode('utf-8')) 340 | if t in t_words: 341 | right += 1 342 | return right / float(len(test_pairs)) 343 | 344 | def evaluate(self, mapping, operation, training_segment, test_segment): 345 | 346 | (s_train, t_train) = self.source_embeddings.words_translations(self.target_embeddings, mapping, training_segment) 347 | (s_test, t_test) = self.source_embeddings.words_translations(self.target_embeddings, mapping, test_segment) 348 | 349 | s_train.vectors = self.norm(s_train.vectors) 350 | t_train.vectors = self.norm(t_train.vectors) 351 | s_test.vectors = self.norm(s_test.vectors) 352 | t_test.vectors = self.norm(t_test.vectors) 353 | 354 | if set(s_train.words).intersection(set(s_test.words)): 355 | print (u"Train and test words are overlapping") 356 | 357 | s_new, t_new = operation((s_train, t_train), (s_test, t_test)) 358 | 359 | return None 360 | 361 | 362 | def linear_regression(train_embeddings, test_embeddings): 363 | global reg_model 364 | s_embeddings, t_embeddings = train_embeddings 365 | s_test, t_test = test_embeddings 366 | 367 | reg = LinearRegression() 368 | reg.fit(s_embeddings.vectors, t_embeddings.vectors) 369 | pickle.dump(reg, open(reg_model, 'wb')) 370 | s = Embeddings(vectors=reg.predict(s_test.vectors), 371 | words=s_test.words, lang=s_embeddings.lang) 372 | return s, t_test 373 | 374 | 375 | def local_linear_regression(train_embeddings, test_embeddings): 376 | global reg_model 377 | print "Using local linear regression with k = ", K_NN 378 | s_embeddings, t_embeddings = train_embeddings 379 | s_test, t_test = test_embeddings 380 | reg = LocalLinearRegression(k_nn=K_NN) 381 | reg.fit(s_embeddings.vectors, t_embeddings.vectors) 382 | pickle.dump(reg, open(reg_model, 'wb')) 383 | return None, None 384 | 385 | 386 | def identity(train_vectors, all_vectors): 387 | return all_vectors 388 | 389 | 390 | def evaluate_word2vec(sl, tl, source_file, target_file, method): 391 | print "Proceeding to load embeddings" 392 | s_ = Word2VecEmbeddings(lang=sl, filename=source_file) 393 | t_ = Word2VecEmbeddings(lang=tl, filename=target_file) 394 | print "Loaded word embeddings" 395 | mapping = IdentityTranslations(source=sl, target=tl, se=s_, te=t_) 396 | print "Mapping done" 397 | s, t = s_.get_common(t_, mapping) 398 | print "Common vocab done" 399 | evaluator = Evaluator(source_embeddings=s, target_embeddings=t, metric='l2') 400 | print "Evaluator constructed" 401 | assert(s.vectors.shape == t.vectors.shape) 402 | print "Evaluating" 403 | if method == 'linear': 404 | p1 = evaluator.evaluate(mapping, linear_regression, (0, s.vectors.shape[0]), (0, s.vectors.shape[0])) 405 | elif method == 'locallinear': 406 | p1 = evaluator.evaluate(mapping, local_linear_regression, (0, s.vectors.shape[0]), (0, s.vectors.shape[0])) 407 | 408 | 409 | def main(args): 410 | global reg_model 411 | global K_NN 412 | reg_model = args.filename 413 | if args.method == 'linear': 414 | evaluate_word2vec('old', 'new', args.old_model, args.new_model, 'linear') 415 | elif args.method == 'locallinear': 416 | K_NN = int(args.knn_val) 417 | evaluate_word2vec('old', 'new', args.old_model, args.new_model, 'locallinear') 
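# Example invocation (hypothetical file names): learn a locally linear map from an earlier epoch's
# embeddings to a later one and pickle the fitted regressor to the path given with -f:
#
#   python learn_map.py -o 1900_embeddings.w2v -n 2000_embeddings.w2v \
#       -f predictor_1900.pkl -m locallinear -k 1000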
418 | 419 | if __name__ == "__main__": 420 | parser = ArgumentParser() 421 | parser.add_argument("-f", "--file", dest="filename", help="Input file") 422 | parser.add_argument("-o", "--old_model", dest="old_model", help="old model") 423 | parser.add_argument("-n", "--new_model", dest="new_model", help="new model") 424 | parser.add_argument("-k", "--knn", dest="knn_val", default=1000, type=int, help="K in KNN for local linear regression") 425 | parser.add_argument("-m", "--method", dest="method", help="method") 426 | parser.add_argument("-l", "--log", dest="log", help="log verbosity level", 427 | default="INFO") 428 | args = parser.parse_args() 429 | if args.log == 'DEBUG': 430 | pass  # NOTE: no 'debug' post-mortem hook is defined in this module 431 | numeric_level = getattr(logging, args.log.upper(), None) 432 | logging.basicConfig(level=numeric_level, format=LOGFORMAT) 433 | main(args) 434 | --------------------------------------------------------------------------------
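A note on inspecting the change point output: both detect_changepoints_word_ts.py and demostrate_cp.py write a CSV whose rows carry, per word, the minimum bootstrap p-value and its time point ('min_pval', 'cp') together with the threshold-filtered variants ('tpval', 'tcp'), already sorted by 'tpval'. The sketch below shows one way that file might be examined; the file name is hypothetical and only pandas is assumed.

import pandas as pd

# Hypothetical path passed to the script via -p/--pfile.
pvals = pd.read_csv("pvals.csv")

# Rows are already sorted by the threshold-filtered p-value ('tpval');
# show the ten words with the strongest change point evidence.
top = pvals[["word", "tcp", "tpval", "cp", "min_pval"]].head(10)
print(top.to_string(index=False))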