├── .gitignore ├── .project ├── .pydevproject ├── .travis.yml ├── README.rst ├── resource └── unittest │ ├── config │ ├── neighbours_config.cfg │ └── sim_config.cfg │ ├── pipelines_test_resources │ ├── AN_mat.dm │ ├── A_mat.dm │ ├── N_mat.dm │ ├── aan_train_data.txt │ ├── an_train_data.txt │ ├── config1.txt │ ├── mat1.col │ ├── mat1.cols │ ├── mat1.pickle │ ├── mat1.row │ ├── mat1.sm │ ├── mat1.sm.gz │ ├── mat2.dm │ ├── mat2.dm.gz │ ├── mat3.cols │ ├── mat3.dm │ ├── mat3.sm │ ├── na_train_data.txt │ ├── neighbours_input.txt │ ├── pred1.txt │ ├── pred2.txt │ └── sim_input.txt │ └── space_test_resources │ ├── col1.col │ ├── col2.col │ ├── col3.col │ ├── col4.col │ ├── col5.col │ ├── data1.cols │ ├── data1.dense │ ├── data1.sparse │ ├── data10.dense │ ├── data10.sparse │ ├── data2.cols │ ├── data2.dense │ ├── data2.sparse │ ├── data3.cols │ ├── data3.dense │ ├── data3.sparse │ ├── data4.cols │ ├── data4.dense │ ├── data4.sparse │ ├── data5.cols │ ├── data7.cols │ ├── data7.dense │ ├── data7.sparse │ ├── data8.dense │ ├── data8.sparse │ ├── data9.cols │ ├── data9.dense │ ├── data9.sparse │ ├── row1.row │ ├── row2.row │ ├── row3.row │ ├── tmp.col │ ├── tmp.cols │ ├── tmp.dm │ ├── tmp.row │ ├── tmp.rows │ └── tmp.sm ├── setup.py ├── src ├── composes │ ├── __init__.py │ ├── composition │ │ ├── __init__.py │ │ ├── composition_model.py │ │ ├── dilation.py │ │ ├── full_additive.py │ │ ├── lexical_function.py │ │ ├── multiplicative.py │ │ └── weighted_additive.py │ ├── exception │ │ ├── __init__.py │ │ ├── illegal_state_error.py │ │ └── invalid_argument_error.py │ ├── matrix │ │ ├── __init__.py │ │ ├── dense_matrix.py │ │ ├── linalg.py │ │ ├── matrix.py │ │ └── sparse_matrix.py │ ├── semantic_space │ │ ├── __init__.py │ │ ├── operation.py │ │ ├── peripheral_space.py │ │ └── space.py │ ├── similarity │ │ ├── __init__.py │ │ ├── cos.py │ │ ├── dot_prod.py │ │ ├── euclidean.py │ │ ├── lin.py │ │ └── similarity.py │ ├── transformation │ │ ├── __init__.py │ │ ├── dim_reduction │ │ 
│ ├── __init__.py │ │ │ ├── dimensionality_reduction.py │ │ │ ├── nmf.py │ │ │ └── svd.py │ │ ├── feature_selection │ │ │ ├── __init__.py │ │ │ ├── feature_selection.py │ │ │ └── top_feature_selection.py │ │ └── scaling │ │ │ ├── __init__.py │ │ │ ├── epmi_weighting.py │ │ │ ├── normalization.py │ │ │ ├── plmi_weighting.py │ │ │ ├── plog_weighting.py │ │ │ ├── ppmi_weighting.py │ │ │ ├── row_normalization.py │ │ │ └── scaling.py │ └── utils │ │ ├── __init__.py │ │ ├── crossvalidation_utils.py │ │ ├── gen_utils.py │ │ ├── io_utils.py │ │ ├── log_utils.py │ │ ├── matrix_utils.py │ │ ├── mem_utils.py │ │ ├── num_utils.py │ │ ├── py_matrix_utils.py │ │ ├── regression_learner.py │ │ ├── scoring_utils.py │ │ └── space_utils.py ├── examples │ ├── __init__.py │ ├── cmd_ex01.sh │ ├── cmd_ex02.sh │ ├── cmd_ex03.sh │ ├── cmd_ex04.sh │ ├── cmd_ex05.sh │ ├── cmd_ex06.sh │ ├── cmd_ex07.sh │ ├── data │ │ ├── in │ │ │ ├── config1.cfg │ │ │ ├── config2.cfg │ │ │ ├── data_to_comp.txt │ │ │ ├── data_to_comp2.txt │ │ │ ├── ex01.cols │ │ │ ├── ex01.rows │ │ │ ├── ex01.sm │ │ │ ├── ex05.cols │ │ │ ├── ex05.sm │ │ │ ├── ex10.cols │ │ │ ├── ex10.rows │ │ │ ├── ex10.sm │ │ │ ├── ex19-n.cols │ │ │ ├── ex19-n.sm │ │ │ ├── ex19-svo.cols │ │ │ ├── ex19-svo.sm │ │ │ ├── sim_data.txt │ │ │ ├── sim_data2.txt │ │ │ ├── sim_data3.txt │ │ │ ├── train_data.txt │ │ │ ├── word_list.txt │ │ │ ├── word_pairs1.txt │ │ │ ├── word_pairs2.txt │ │ │ └── word_sims.txt │ │ └── out │ │ │ ├── COMPOSED_SS.ex10.pkl │ │ │ ├── PER_SS.ex05.pkl │ │ │ ├── PHRASE_SS.ex10.pkl │ │ │ ├── ex01.cols │ │ │ ├── ex01.dm │ │ │ ├── ex01.pkl │ │ │ ├── ex01.rows │ │ │ ├── ex01.sm │ │ │ ├── ex10.pkl │ │ │ ├── model01.params │ │ │ └── model01.pkl │ ├── ex01.py │ ├── ex02.py │ ├── ex03.py │ ├── ex04.py │ ├── ex05.py │ ├── ex06.py │ ├── ex07.py │ ├── ex08.py │ ├── ex09.py │ ├── ex10.py │ ├── ex11.py │ ├── ex12.py │ ├── ex13.py │ ├── ex14.py │ ├── ex15.py │ ├── ex16.py │ ├── ex17.py │ ├── ex18.py │ ├── ex19.py │ ├── ex20.py │ ├── 
exercise.sh │ └── full_example.py ├── pipelines │ ├── __init__.py │ ├── apply_composition.py │ ├── build_core_space.py │ ├── build_peripheral_space.py │ ├── compute_neighbours.py │ ├── compute_similarities.py │ ├── evaluate_similarities.py │ ├── pipeline_utils.py │ └── train_composition.py └── unitest │ ├── __init__.py │ ├── ac_pipeline_test.py │ ├── bcs_pipeline_test.py │ ├── bps_pipeline_test.py │ ├── conftest.py │ ├── crossvalidation_utils_test.py │ ├── dense_matrix_test.py │ ├── dilation_test.py │ ├── dimensionality_reduction_test.py │ ├── es_pipeline_test.py │ ├── feat_selection_test.py │ ├── full_aditive_test.py │ ├── lexical_function_test.py │ ├── linalg_test.py │ ├── matrix_utils_test.py │ ├── model_export_test.py │ ├── neighbour_pipeline_test.py │ ├── operation_test.py │ ├── peripheral_space_test.py │ ├── regression_learner_utils_test.py │ ├── sim_pipeline_test.py │ ├── similarity_test.py │ ├── space_test.py │ ├── sparse_matrix_test.py │ ├── tc_pipeline_test.py │ ├── utils_test.py │ ├── weighted_additive_test.py │ └── weighting_test.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | resource/unittest/pipelines_test_resources/ 4 | 5 | .Python 6 | .coverage 7 | 8 | .tox/ 9 | 10 | *.egg/ 11 | src/dissect.egg-info/ 12 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | gittoolkit 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | Default 4 | python 2.7 5 | 6 | /gittoolkit/src 7 | /gittoolkit/src 8 | 9 | 10 | -------------------------------------------------------------------------------- 
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | before_install: 5 | - sudo apt-get update -qq 6 | - sudo apt-get install -qq python-numpy python-scipy python-matplotlib 7 | - rm /home/travis/virtualenv/python2.7/lib/python2.7/no-global-site-packages.txt 8 | # command to install dependencies 9 | install: 10 | - pip install cython --use-mirrors 11 | - pip install . --use-mirrors 12 | # command to run tests 13 | script: python setup.py test 14 | after_success: 15 | - sudo apt-get install python-yaml 16 | - pip install coveralls pytest-cov . --use-mirrors 17 | - py.test --cov=composes --cov=pipelines --cov-report=term-missing src/unitest 18 | - coveralls 19 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DIStributional SEmantics Composition Toolkit 2 | ============================================ 3 | 4 | 5 | For documentation, please, refer to http://clic.cimec.unitn.it/composes/toolkit/ 6 | -------------------------------------------------------------------------------- /resource/unittest/config/neighbours_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration file for similarity pipeline 2 | [compute_neighbours] 3 | 4 | #input file 5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/neighbours_input.txt 6 | 7 | # similarity measure 8 | sim_measure=cos 9 | 10 | # output directory 11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ 12 | 13 | # space file(s) 14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle,/home/thenghia.pham/git/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle 15 | 16 | # number of neighbours 17 | no_neighbours=3 18 | 19 | # log file 20 | 
log=/home/georgianadinu/work/localtoolkit/toolkit/log/sim_log.txt -------------------------------------------------------------------------------- /resource/unittest/config/sim_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration file for similarity pipeline 2 | [compute_similarities] 3 | 4 | #input file 5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/sim_input.txt 6 | 7 | # similarity measure 8 | sim_measure=cos,dot_prod,lin 9 | 10 | # output directory 11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ 12 | 13 | # space file(s) 14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl,/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl 15 | # columns 16 | columns=0,1 17 | 18 | # log file 19 | log=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/AN_mat.dm: -------------------------------------------------------------------------------- 1 | big_car 3 4 2 | big_man 5 6 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/A_mat.dm: -------------------------------------------------------------------------------- 1 | big 3 4 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/N_mat.dm: -------------------------------------------------------------------------------- 1 | car 3 4 2 | man 5 6 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/aan_train_data.txt: -------------------------------------------------------------------------------- 1 | big big_car big_big_car 2 | big big_man 
big_big_man -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/an_train_data.txt: -------------------------------------------------------------------------------- 1 | big car big_car 2 | big man big_man 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/config1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/config1.txt -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.col: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.pickle: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'red' 17 | p8 18 | asS'_column2id' 19 | p9 20 | (dp10 21 | S'car' 22 | p11 23 | I0 24 | ssS'_operations' 25 | p12 26 | (lp13 27 | sS'_id2column' 28 | p14 29 | (lp15 30 | g11 31 | asS'_element_shape' 32 | p16 33 | (I1 34 | tp17 35 | sS'_cooccurrence_matrix' 36 | p18 37 | g0 38 | (ccomposes.matrix.sparse_matrix 39 | SparseMatrix 40 | p19 41 | g2 42 | Ntp20 43 | Rp21 44 | (dp22 45 | S'_mat' 46 | p23 47 | g0 48 | (cscipy.sparse.csr 49 | csr_matrix 50 | p24 51 | g2 52 | 
Ntp25 53 | Rp26 54 | (dp27 55 | S'format' 56 | p28 57 | S'csr' 58 | p29 59 | sS'_shape' 60 | p30 61 | (I1 62 | I1 63 | tp31 64 | sS'indptr' 65 | p32 66 | cnumpy.core.multiarray 67 | _reconstruct 68 | p33 69 | (cnumpy 70 | ndarray 71 | p34 72 | (I0 73 | tp35 74 | S'b' 75 | p36 76 | tp37 77 | Rp38 78 | (I1 79 | (I2 80 | tp39 81 | cnumpy 82 | dtype 83 | p40 84 | (S'i4' 85 | p41 86 | I0 87 | I1 88 | tp42 89 | Rp43 90 | (I3 91 | S'<' 92 | p44 93 | NNNI-1 94 | I-1 95 | I0 96 | tp45 97 | bI00 98 | S'\x00\x00\x00\x00\x01\x00\x00\x00' 99 | p46 100 | tp47 101 | bsS'indices' 102 | p48 103 | g33 104 | (g34 105 | (I0 106 | tp49 107 | g36 108 | tp50 109 | Rp51 110 | (I1 111 | (I1 112 | tp52 113 | g43 114 | I00 115 | S'\x00\x00\x00\x00' 116 | p53 117 | tp54 118 | bsS'maxprint' 119 | p55 120 | I50 121 | sS'data' 122 | p56 123 | g33 124 | (g34 125 | (I0 126 | tp57 127 | g36 128 | tp58 129 | Rp59 130 | (I1 131 | (I1 132 | tp60 133 | g40 134 | (S'f8' 135 | p61 136 | I0 137 | I1 138 | tp62 139 | Rp63 140 | (I3 141 | S'<' 142 | p64 143 | NNNI-1 144 | I-1 145 | I0 146 | tp65 147 | bI00 148 | S'\x00\x00\x00\x00\x00\x00\x08@' 149 | p66 150 | tp67 151 | bsbsbsS'_row2id' 152 | p68 153 | (dp69 154 | g8 155 | I0 156 | ssb. 
-------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.row: -------------------------------------------------------------------------------- 1 | red 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.sm: -------------------------------------------------------------------------------- 1 | red car 3.000000 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.sm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat1.sm.gz -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat2.dm: -------------------------------------------------------------------------------- 1 | car 3 4 5 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat2.dm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat2.dm.gz -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.cols: -------------------------------------------------------------------------------- 1 | f1 2 | f2 3 | f3 4 | f4 5 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.dm: -------------------------------------------------------------------------------- 1 | a 1 2 3 1 2 | b 2 4 6 1 3 | c 4 675 43 1 4 | 
-------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.sm: -------------------------------------------------------------------------------- 1 | a f1 1 2 | a f2 2 3 | a f3 3 4 | a f4 1 5 | b f1 2 6 | b f2 4 7 | b f3 6 8 | b f4 1 9 | c f1 4 10 | c f2 675 11 | c f3 43 12 | c f4 1 13 | 14 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/na_train_data.txt: -------------------------------------------------------------------------------- 1 | car big big_car 2 | man big big_man 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/neighbours_input.txt: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | c -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/pred1.txt: -------------------------------------------------------------------------------- 1 | 23 car 23 sdrs 2 | 4 man 4 sdfs 3 | 13 cad 13 sfd 4 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/pred2.txt: -------------------------------------------------------------------------------- 1 | 23 car 23 sdrs 2 | 4 man 4 sdfs 3 | 13 cad 13 sfd 4 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/sim_input.txt: -------------------------------------------------------------------------------- 1 | a b 1 2 | a c 0 3 | a a 1 4 | b c 1 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col1.col: -------------------------------------------------------------------------------- 1 | man 2 | car -------------------------------------------------------------------------------- 
/resource/unittest/space_test_resources/col2.col: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col3.col: -------------------------------------------------------------------------------- 1 | man 2 | car 3 | man 4 | car 5 | car 6 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col4.col: -------------------------------------------------------------------------------- 1 | airplane 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col5.col: -------------------------------------------------------------------------------- 1 | man sdrf 2 | car 3 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.dense: -------------------------------------------------------------------------------- 1 | red 3 5 2 | blue 0 10 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.sparse: -------------------------------------------------------------------------------- 1 | red car 3 2 | red man 5 3 | blue man 10 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data10.dense: -------------------------------------------------------------------------------- 1 | car man 3 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data10.sparse: 
-------------------------------------------------------------------------------- 1 | man car car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.dense: -------------------------------------------------------------------------------- 1 | red 3 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.sparse: -------------------------------------------------------------------------------- 1 | red car 3 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.dense: -------------------------------------------------------------------------------- 1 | red 5 0 2 | red 10 0 3 | blue 0 6 4 | 5 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.sparse: -------------------------------------------------------------------------------- 1 | red car 5 2 | red car 10 3 | blue man 6 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.dense: -------------------------------------------------------------------------------- 1 
| red 5 0 2 | blue 0 6 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.sparse: -------------------------------------------------------------------------------- 1 | red car 5 2 | blue man 6 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data5.cols: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data5.cols -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.dense: -------------------------------------------------------------------------------- 1 | red 0 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.sparse: -------------------------------------------------------------------------------- 1 | red car 0 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data8.dense: -------------------------------------------------------------------------------- 1 | car 3 5 6 2 | man 3 5 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data8.sparse: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data8.sparse 
-------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.dense: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.sparse: -------------------------------------------------------------------------------- 1 | man car 4 5 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row1.row: -------------------------------------------------------------------------------- 1 | red 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row2.row: -------------------------------------------------------------------------------- 1 | blue 2 | red -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row3.row: -------------------------------------------------------------------------------- 1 | blue 2 | red 3 | blue 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.col: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.col -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.cols: -------------------------------------------------------------------------------- 1 | f1 2 | f2 3 | 
-------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.dm: -------------------------------------------------------------------------------- 1 | a 0 0 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.row: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.row -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.rows: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.sm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.sm -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | from setuptools import setup 5 | from setuptools.command.test import test as TestCommand 6 | 7 | 8 | class PyTest(TestCommand): 9 | def finalize_options(self): 10 | TestCommand.finalize_options(self) 11 | self.test_args = 'src/unitest' 12 | self.test_suite = True 13 | 14 | def run_tests(self): 15 | #import here, cause outside the eggs aren't loaded 16 | import pytest 17 | errno = pytest.main(self.test_args) 18 | sys.exit(errno) 19 | 20 | 21 | setup( 22 | name='dissect', 23 | version='0.1.0', 24 | description='COMPOSES DISSECT TOOLKIT', 25 | author='Georgiana Dinu, The Nghia Pham, Marco Baroni', 26 | 
author_email='georgiana.dinu@unitn.it,thenghia.pham@unitn.it', 27 | url='http://http://clic.cimec.unitn.it/composes/toolkit/', 28 | install_requires=['numpy', 'scipy', 'sparsesvd'], 29 | tests_require=['pytest>=2.4.2'], 30 | cmdclass={'test': PyTest}, 31 | package_dir={'': 'src'}, 32 | packages=[ 33 | 'composes', 34 | 'composes.composition', 35 | 'composes.matrix', 36 | 'composes.semantic_space', 37 | 'composes.exception', 38 | 'composes.similarity', 39 | 'composes.transformation', 40 | 'composes.utils', 41 | 'composes.transformation.dim_reduction', 42 | 'composes.transformation.feature_selection', 43 | 'composes.transformation.scaling', 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /src/composes/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class NullHandler(logging.Handler): 4 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" 5 | def emit(self, record): 6 | pass 7 | 8 | logger = logging.getLogger(__name__) 9 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one 10 | logger.addHandler(NullHandler()) 11 | 12 | #logging.basicConfig(filename='composes.log', filemode='w+',level=logging.DEBUG, format = "") 13 | -------------------------------------------------------------------------------- /src/composes/composition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/composition/__init__.py -------------------------------------------------------------------------------- /src/composes/composition/dilation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import numpy as np 7 | from composition_model import 
CompositionModel 8 | from composes.utils.num_utils import is_numeric 9 | from composes.utils.py_matrix_utils import nonzero_invert 10 | 11 | 12 | class Dilation(CompositionModel): 13 | """ 14 | Implements the dilation compositional model: 15 | 16 | :math:`\\vec{p} = (\\vec{u} \\cdot \\vec{u}) \\vec{v} + (\\lambda - 1) (\\vec{u} \\cdot \\vec{v}) \\vec{u}` 17 | 18 | where :math:`\\vec{p}` is the vector of the composed phrase, :math:`\\vec{u}, \\vec{v}` the vectors of the components 19 | and :math:`\\lambda` is a scalar. 20 | 21 | """ 22 | 23 | 24 | _name = "dilation" 25 | 26 | _lambda = 2 27 | 28 | 29 | def __init__(self, lambda_=None): 30 | """ 31 | Constructor. 32 | 33 | Args: 34 | lambda_ : numeric, value of the lambda parameter. Optional. 35 | """ 36 | 37 | if not lambda_ is None: 38 | if not is_numeric(lambda_): 39 | raise ValueError("Parameter not numeric: %s " %(type(lambda_))) 40 | else: 41 | self._lambda = lambda_ 42 | 43 | def _solve(self, arg1_mat, arg2_mat, phrase_mat): 44 | 45 | v1_row_norms = arg1_mat.norm(1) 46 | v1_row_sqr_norms = np.multiply(v1_row_norms, v1_row_norms) 47 | 48 | v2_minus_p = arg2_mat.scale_rows(v1_row_sqr_norms) - phrase_mat 49 | v1_dot_prod_v2_minus_p = arg1_mat.multiply(v2_minus_p).sum(1) 50 | 51 | v1_v2 = arg1_mat.multiply(arg2_mat).sum(1) 52 | v1_v2_sqr = np.multiply(v1_v2, v1_v2) 53 | 54 | nom = np.multiply(v1_v2_sqr, v1_row_sqr_norms).sum() 55 | denom = np.multiply(v1_v2, v1_dot_prod_v2_minus_p).sum() 56 | 57 | if nom != 0: 58 | self._lambda = 1 - denom/nom 59 | else: 60 | self._lambda = 2 61 | 62 | 63 | def _compose(self, arg1_mat, arg2_mat): 64 | # TO DO: this is inefficient here, we do 2 for s instead of one 65 | # we do a for in get_rows in parent.compose() and a for here 66 | # comp = ((self._lambda -1) * v1.multiply(v2).sum()/pow(v1.norm(),2)) * v1 + v2 67 | 68 | v1_row_norms = arg1_mat.norm(1) 69 | scale_factors1 = arg1_mat.multiply(arg2_mat).sum(1) 70 | scale_factors2 = np.multiply(v1_row_norms, v1_row_norms) 71 | 72 | 
arg1_mat_scaled = arg1_mat.scale_rows(scale_factors1) 73 | arg2_mat_scaled = arg2_mat.scale_rows(scale_factors2) 74 | 75 | #print "FACTORS u:", ((self._lambda -1)*scale_factors1).sum()/float(len(scale_factors1)) 76 | #print "FACTORS v:", (scale_factors2).sum()/float(len(scale_factors2)) 77 | 78 | result = (self._lambda - 1) * arg1_mat_scaled + arg2_mat_scaled 79 | 80 | return result 81 | 82 | def get_lambda(self): 83 | return self._lambda 84 | """ 85 | Lambda parameter. Default, set to lambda=2. 86 | """ 87 | 88 | 89 | def _export(self, filename): 90 | with open(filename, "w") as output_stream: 91 | output_stream.write("lambda\t%f" % self._lambda) 92 | -------------------------------------------------------------------------------- /src/composes/composition/full_additive.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from composition_model import CompositionModel 8 | from composes.utils.gen_utils import assert_is_instance 9 | from composes.utils.matrix_utils import is_array_or_matrix 10 | from composes.utils.matrix_utils import padd_matrix 11 | from composes.utils.matrix_utils import to_compatible_matrix_types 12 | from composes.utils.regression_learner import LstsqRegressionLearner 13 | from composes.utils.regression_learner import RegressionLearner 14 | from composes.utils.matrix_utils import resolve_type_conflict 15 | from composes.matrix.dense_matrix import DenseMatrix 16 | from composes.exception.illegal_state_error import IllegalStateError 17 | 18 | 19 | class FullAdditive(CompositionModel): 20 | """ 21 | Implements the full additive compositional model: 22 | 23 | :math:`\\vec{p} = A \\vec{u} + B \\vec{v}` 24 | 25 | where :math:`\\vec{p}` is the vector of the composed phrase, 26 | :math:`\\vec{u}, \\vec{v}`, the vectors of the components 27 | and :math:`A`, :math:`B` are two matrices. 
28 | 29 | """ 30 | _name = "full_additive" 31 | _mat_a_t = None 32 | _mat_b_t = None 33 | 34 | 35 | def __init__(self, A=None, B=None, learner=LstsqRegressionLearner()): 36 | #TODO here; very important, should be able to set the intercept 37 | #when mat a and mat b are given , to true or false. now by default is 38 | #is false 39 | """ 40 | Constructor. 41 | 42 | Args: 43 | A= : matrix A, of matrix-like type (Matrix, ndarray, 44 | numpy matrix, scipy matrix). Optional (parameters can be set 45 | through training.) 46 | 47 | B= : matrix B, matrix-like type. Optional. 48 | 49 | learner= : regression learner object, of type RegressionLearner. 50 | Optional, default LstsqRegressionLearner. 51 | """ 52 | if A is not None and B is not None: 53 | mat_a = A 54 | mat_b = B 55 | if not is_array_or_matrix(mat_a): 56 | raise TypeError("expected matrix type, received: %s" 57 | % type(mat_a)) 58 | 59 | if not is_array_or_matrix(mat_b): 60 | raise TypeError("expected matrix type, received: %s" 61 | % type(mat_b)) 62 | 63 | mat_a, mat_b = to_compatible_matrix_types(mat_a, mat_b) 64 | self._mat_a_t = mat_a.transpose() 65 | self._mat_b_t = mat_b.transpose() 66 | self._has_intercept = False 67 | 68 | else: 69 | self._regression_learner = learner 70 | self._has_intercept = self._regression_learner.has_intercept() 71 | 72 | 73 | def _solve(self, arg1_mat, arg2_mat, phrase_mat): 74 | 75 | self._has_intercept = self._regression_learner.has_intercept() 76 | 77 | result = self._regression_learner.train(arg1_mat.hstack(arg2_mat), phrase_mat) 78 | 79 | self._mat_a_t = result[0:arg1_mat.shape[1], :] 80 | self._mat_b_t = result[arg1_mat.shape[1]:, :] 81 | 82 | 83 | def _compose(self, arg1_mat, arg2_mat): 84 | #NOTE when we get in this compose arg1 mat and arg2 mat have the same type 85 | [mat_a_t, mat_b_t, arg1_mat] = resolve_type_conflict([self._mat_a_t, 86 | self._mat_b_t, 87 | arg1_mat], 88 | type(arg1_mat)) 89 | if self._has_intercept: 90 | return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 
1) * mat_b_t 91 | else: 92 | return arg1_mat * mat_a_t + arg2_mat * mat_b_t 93 | 94 | def set_regression_learner(self, regression_learner): 95 | assert_is_instance(regression_learner, RegressionLearner) 96 | self._regression_learner = regression_learner 97 | 98 | def get_regression_learner(self): 99 | return self._regression_learner 100 | 101 | regression_learner = property(get_regression_learner, set_regression_learner) 102 | """ 103 | Regression method to be used in training, of type RegressionLearner. 104 | Default is LstsqRegressionLearner. 105 | """ 106 | 107 | def _build_id2column(self, arg1_space, arg2_space): 108 | return [] 109 | 110 | def _export(self, filename): 111 | if self._mat_a_t is None or self._mat_b_t is None: 112 | raise IllegalStateError("cannot export an untrained FullAdditive model.") 113 | 114 | with open(filename, "w") as output_stream: 115 | output_stream.write("A\n") 116 | output_stream.write(str(DenseMatrix(self._mat_a_t).mat.T)) 117 | output_stream.write("\nB\n") 118 | 119 | if self._has_intercept: 120 | output_stream.write(str(DenseMatrix(self._mat_b_t[:-1,]).mat.T)) 121 | output_stream.write("\nIntercept\n") 122 | output_stream.write(str(DenseMatrix(self._mat_b_t[-1,]).mat.T)) 123 | else: 124 | output_stream.write(str(DenseMatrix(self._mat_b_t).mat.T)) 125 | 126 | 127 | def get_mat_a_t(self): 128 | return self._mat_a_t 129 | mat_a_t = property(get_mat_a_t) 130 | """ 131 | Transpose of matrix A parameter, of type Matrix. 132 | """ 133 | 134 | def get_mat_b_t(self): 135 | return self._mat_b_t 136 | mat_b_t = property(get_mat_b_t) 137 | """ 138 | Transpose of matrix B parameter, of type Matrix. 
class Multiplicative(CompositionModel):
    """
    Component-wise multiplicative composition:

    :math:`\\vec{p} = \\vec{u} \\cdot \\vec{v}`

    The phrase vector is obtained by multiplying the two constituent
    vectors element by element:

    :math:`\\vec{u} \\cdot \\vec{v} = (u_1v_1,...,u_nv_n)`
    """

    _name = "multiplicative"

    def __init__(self):
        """Constructor. The model is parameter-free; nothing to initialize."""

    def train(self):
        """
        Unsupported: the multiplicative model has no parameters to fit.
        """
        raise IllegalOperationError("Cannot train multiplicative model!")

    def _compose(self, arg1_mat, arg2_mat):
        # element-wise product of the two argument matrices, row by row
        product = arg1_mat.multiply(arg2_mat)
        return product

    def export(self, filename):
        """
        Unsupported: the multiplicative model has no parameters to write out.
        """
        raise IllegalOperationError("cannot export a Multiplicative model.")
        """
        # defaults perform simple (unweighted) vector addition
        self._alpha = 0.5
        self._beta = 0.5
        if not alpha is None:
            if not is_numeric(alpha):
                raise TypeError("Parameter not numeric: %s " % (type(alpha)))
            else:
                self._alpha = alpha

        if not beta is None:
            if not is_numeric(beta):
                raise TypeError("Parameter not numeric: %s " % (type(beta)))
            else:
                self._beta = beta

        # when only alpha is given, choose beta so the two weights sum to 1
        if not alpha is None and beta is None:
            self._beta = 1 - self._alpha


    def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list):
        """Estimate alpha and beta by least squares over the training triples.

        Accumulates the dot products / squared norms needed by the 2x2
        normal equations chunk by chunk (to bound memory), then delegates
        the actual solve to _solve.
        """
        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the train data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] * self.MAX_MEM_OVERHEAD / 3.0) + 1

        arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (0, 0, 0, 0, 0)

        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])
            phrase_mat = phrase_space.get_rows(phrase_list[beg:end])

            # statistics are accumulated on dense matrices
            [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat,
                                                                      arg2_mat,
                                                                      phrase_mat],
                                                                     DenseMatrix)

            res = self._process(arg1_mat, arg2_mat, phrase_mat)
            arg1_arg2_dot += res[0]
            arg1_phrase_dot += res[1]
            arg2_phrase_dot += res[2]
            arg1_norm_sqr += res[3]
            arg2_norm_sqr += res[4]


        self._solve(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr)


    def _process(self, arg1_mat, arg2_mat, phrase_mat):
        """Return the five sufficient statistics for one chunk of rows."""
        # debug here
        # remove when done
        # print "Using %s MB " % (get_mem_usage())

        arg1_arg2_dot = arg1_mat.multiply(arg2_mat).sum()
        arg1_phrase_dot = arg1_mat.multiply(phrase_mat).sum()
        arg2_phrase_dot = arg2_mat.multiply(phrase_mat).sum()

        arg1_norm_sqr = pow(arg1_mat.norm(), 2)
        arg2_norm_sqr = pow(arg2_mat.norm(), 2)

        return arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr

    def _solve(self, arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr):
        """Solve the 2x2 least-squares system for (alpha, beta).

        Uses the pseudo-inverse, so a singular Gram matrix (e.g. identical
        argument vectors) still yields a minimum-norm solution.
        """
        a = np.linalg.pinv(np.mat([[arg1_norm_sqr,arg1_arg2_dot],
                                   [arg1_arg2_dot,arg2_norm_sqr]]))
        a = a * np.mat([[arg1_phrase_dot],[arg2_phrase_dot]])
        self._alpha = a[0, 0]
        self._beta = a[1, 0]


    def _compose(self, arg1_mat, arg2_mat):
        # row-wise: p = alpha * u + beta * v
        return self._alpha * arg1_mat + self._beta * arg2_mat

    def _export(self, filename):
        # two tab-separated "name<TAB>value" lines
        with open(filename, "w") as output_stream:
            output_stream.write("alpha\t%f\n" % self._alpha)
            output_stream.write("beta\t%f" % self._beta)

    def get_alpha(self):
        return self._alpha
    alpha = property(get_alpha)
    """
    Alpha parameter, default 0.5.
    """

    def get_beta(self):
        return self._beta
    beta = property(get_beta)
    """
    Beta parameter, default 0.5.
class IllegalStateError(Exception):
    """Raised when an operation is attempted on an object whose current
    state does not allow it (e.g. exporting an untrained model)."""

    def __init__(self, msg):
        # forward the message to Exception so that str(e), repr(e) and
        # traceback output actually show it; the original stored it only in
        # a private attribute, making the raised error message invisible
        super(IllegalStateError, self).__init__(msg)
        self.__msg = msg


class IllegalOperationError(Exception):
    """Raised when an operation is not supported at all by an object
    (e.g. training a parameter-free composition model)."""

    def __init__(self, msg):
        super(IllegalOperationError, self).__init__(msg)
        self.__msg = msg


class InvalidArgumentError(Exception):
    """Raised when an argument value is of an acceptable type but is not
    valid in the current context."""

    def __init__(self, msg):
        super(InvalidArgumentError, self).__init__(msg)
        self.__msg = msg
for matrix implementations.

    Provides a common interface for different matrix implementations
    (sparse/dense). In vector space models, a matrix is used to encode
    a set of entities such as words or phrases (rows) described in terms
    of contextual features (columns).
    """

    def __init__(self, *args, **kwargs):
        # abstract base class: instantiate DenseMatrix or SparseMatrix instead
        raise NotImplementedError()


    def __add__(self, matrix_):
        ''' + operation'''
        # element-wise sum; operand must be the same concrete matrix type
        self._assert_same_type(matrix_)
        return type(self)(self.mat + matrix_.mat)

    def __sub__(self, matrix_):
        ''' - operation'''
        # element-wise difference; operand must be the same concrete type
        self._assert_same_type(matrix_)
        return type(self)(self.mat - matrix_.mat)

    def __neg__(self):
        ''' - operation'''
        # unary minus: negate every entry
        return type(self)(-self.mat)

    def __mul__(self, factor):
        ''' * operation'''
        # scalar multiplication when factor is a number, otherwise
        # matrix multiplication with a matrix of the same concrete type
        if is_numeric(factor):
            return type(self)(self.mat * factor)
        else:
            self._assert_same_type(factor)
            return type(self)(self.mat * factor.mat)

    def __div__(self, factor):
        ''' / operation'''
        # division is defined for non-zero scalars only
        if is_numeric(factor):
            if factor == 0:
                raise ZeroDivisionError("Division by zero")
        else:
            raise TypeError("expected numeric type, received %s" % (type(factor)))
        # float() avoids integer truncation under Python 2 division
        return type(self)(self.mat / float(factor))

    def __rmul__(self, factor):
        ''' * operation'''
        # right scalar multiplication (e.g. 2 * matrix) delegates to __mul__
        if is_numeric(factor):
            return self.__mul__(factor)
        raise TypeError("expected numeric type, received %s" % (type(factor)))


    #TODO move all these asserts somewhere else
    def _assert_same_type(self, operand):
        if type(self) != type(operand):
            raise TypeError("expected matrix of type %s, received %s" %
                            (type(self), type(operand)))

    def assert_same_shape(self, matrix_):
        """
        Asserts that the matrix has the same shape as a second matrix.

        Args:
            matrix_: A second matrix of type Matrix.

        Raises:
            ValueError: If the current matrix and the argument matrix
                do not have the same shape.
        """

        if self.mat.shape != matrix_.mat.shape:
            raise ValueError("inconsistent shapes: %s %s"
                             % (str(self.mat.shape), str(matrix_.mat.shape) ))

    #TODO move all these asserts somewhere else
    def _assert_array(self, operand):
        if not is_array(operand):
            raise TypeError("expected array, received %s" % (type(operand)))


    def sum(self, axis=None):
        #return type is dense matrix of shape (1, dimy) or (dimx,1)
        #or a number if **kwargs is None
        return self.mat.sum(axis)

    def sorted_permutation(self, norm_function, axis_):
        """
        Computes the permutation resulted when sorting the matrix
        on an axis, according to a function, in descending order.

        Sorts the rows or the columns (as given by axis)
        of a matrix according to a norm_function and returns
        the permutation of this as a np.array

        Args:
            norm_function: One of sum/length. A function that
                takes an axis as an argument (i.e. 0 or 1) and
                returns an array of values (i.e. sum of all rows
                if axis = 0 and norm_function = sum).

            axis_: axis value, one of 0/1

        Returns:
            perm_srtd: np.array containing the permutation of the
                sorting
        """

        #norms = norm_function(axis=axis_)

        # flatten the (1, n) / (n, 1) matrix result into a plain 1-d array
        norms = norm_function(axis_).getA().flatten()
        # argsort descending: indices ordered from largest norm down
        perm_srtd = sorted(range(len(norms)), key = norms.__getitem__,
                           reverse=True)

        return perm_srtd

    def get_mat(self):
        return self._mat

    def set_mat(self, mat_):
        self._mat = mat_

    mat = property(get_mat, set_mat)
    """
    Stores the actual matrix structure of the Matrix object.
    Of type numpy.matrix for DenseMatrix, and scipy.sparse.csr_matrix
    for SparseMatrix.
    """

    def get_shape(self):
        # delegates to the underlying numpy/scipy structure
        return self.mat.shape

    shape = property(get_shape)
    """
    Shape of the matrix, tuple with two elements.
    """

    def copy(self):
        # deep copy of the underlying data, preserving the concrete type
        return type(self)(self.mat.copy())


'''
Created on Sep 26, 2012

@author: Georgiana Dinu, Pham The Nghia
'''

from space import Space
from numpy import array
from composes.utils.space_utils import list2dict
from composes.utils.space_utils import assert_dict_match_list
from composes.utils.space_utils import assert_shape_consistent
from composes.utils.space_utils import add_items_to_dict
from composes.semantic_space.operation import FeatureSelectionOperation
from composes.semantic_space.operation import DimensionalityReductionOperation
from composes.utils.gen_utils import assert_is_instance
from composes.matrix.matrix import Matrix

class PeripheralSpace(Space):
    '''
    A semantic space built relative to a core space: every operation already
    applied to the core space is re-applied ("projected") onto the peripheral
    data, and the column indexing structures are taken over from the core.
    '''


    def __init__(self, core_space, matrix_, id2row, row2id=None):
        """
        Constructor.

        Args:
            core_space: Space type, the core space that this is peripheral to.
            matrix_: Matrix type, the data matrix of the space
            id2row: list, the row elements
            row2id: dictionary, maps row strings to ids. Optional, built from
                id2row by default.

        Returns:
            A peripheral semantic space (type PeripheralSpace) on which the
            core space operations have been projected. Column indexing structures
            and operations are taken over from the core space.

        Raises:
            TypeError: if matrix_ or core_space are not of the correct type
            ValueError: if element shape is not consistent with
                the size of matrix rows
                if the matrix and the provided row and column
                indexing structures are not of consistent shapes.
        """
        assert_is_instance(matrix_, Matrix)
        assert_is_instance(core_space, Space)
        assert_is_instance(id2row, list)
        # TODO: assert it is not a peripheral space here!

        if row2id is None:
            row2id = list2dict(id2row)
        else:
            assert_dict_match_list(row2id, id2row)

        # column structures are inherited from the core space
        column2id = core_space.column2id
        id2column = core_space.id2column

        # copy the operation list so later changes to the core do not leak in
        self._operations = list(core_space.operations)
        self._row2id = row2id
        self._id2row = id2row
        self._column2id = column2id
        self._id2column = id2column

        # replay every core-space operation on the new data
        self._cooccurrence_matrix = self._project_core_operations(matrix_)
        assert_shape_consistent(self.cooccurrence_matrix, self._id2row,
                                self._id2column, self._row2id, self._column2id)

        self._element_shape = (self._cooccurrence_matrix.shape[1],)


    def _project_core_operations(self, matrix_):
        # apply each recorded operation in order, keeping the column
        # indexing structures in sync when an operation changes the columns
        for operation in self._operations:
            if isinstance(operation, DimensionalityReductionOperation):
                # reduced dimensions carry no column labels
                self._id2column, self._column2id = [], {}

            if isinstance(operation, FeatureSelectionOperation):
                if operation.original_columns:
                    # keep only the labels of the selected columns
                    self._id2column = list(array(operation.original_columns)[operation.selected_columns])
                    self._column2id = list2dict(self._id2column)
                else:
                    self._id2column, self._column2id = [],{}

            matrix_ = operation.project(matrix_)
        return matrix_


    def add_rows(self, matrix_, id2row):
        """
        Adds rows to a peripheral space.

        Args:
            matrix_: Matrix type, the matrix of the elements to be added.
            id2row: list, string identifiers of the rows to be added.

        Modifies the current space by appending the new rows.
        All operations of the core space are projected to the new rows.

        Raises:
            ValueError: if attempting to add row strings which are already
                in the space.
                matrix of the new data is not consistent in shape
                with the current data matrix.
        """

        try:
            self._row2id = add_items_to_dict(self.row2id, id2row)
        except ValueError:
            raise ValueError("Found duplicate keys when appending rows to\
                            peripheral space.")

        if matrix_.mat.shape[0] != len(id2row):
            raise ValueError("Matrix shape inconsistent with no. of rows:%s %s"
                             % (matrix_.mat.shape, len(id2row)))

        self._id2row = self.id2row + id2row
        # new rows must undergo the same transformations as the existing data
        matrix_ = self._project_core_operations(matrix_)

        self._cooccurrence_matrix = self._cooccurrence_matrix.vstack(matrix_)
        assert_shape_consistent(self.cooccurrence_matrix, self.id2row,
                                self.id2column, self.row2id, self.column2id)

    @classmethod
    def build(cls, core_space, **kwargs):
        """
        Reads in data files and extracts the data to construct a semantic space.

        If the data is read in dense format and no columns are provided,
        the column indexing structures are set to empty.

        Args:
            data: file containing the counts
            format: format on the input data file: one of sm/dm
            rows: file containing the row elements. Optional, if not provided,
                extracted from the data file.
            cols: file containing the column elements

        Returns:
            A semantic space build from the input data files.

        Raises:
            ValueError: if one of data/format arguments is missing.
            if cols is missing and format is "sm"
            if the input columns provided are not consistent with
            the shape of the matrix (for "dm" format)

        """

        # build a regular Space from the input files, then wrap its data as
        # a peripheral space of core_space
        sp = Space.build(**kwargs)

        mat = sp._cooccurrence_matrix
        id2row = sp.id2row
        row2id = sp.row2id
        return PeripheralSpace(core_space, mat, id2row, row2id)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.utils.py_matrix_utils import nonzero_invert

from composes.similarity.similarity import Similarity
from composes.similarity.dot_prod import DotProdSimilarity


class CosSimilarity(Similarity):
    """
    Computes the cosine similarity of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{<\\vec{u},\\vec{v}>}{\\sqrt{||\\vec{u}||||\\vec{v}||}}`

    """

    def _sim(self, v1, v2):
        # the cosine of a zero vector is undefined; define it as 0 here
        if v1.norm() == 0 or v2.norm() == 0:
            return 0.0
        s = DotProdSimilarity()._sim(v1, v2) / np.double(v1.norm() * v2.norm())
        return s

    def _sims_to_matrix(self, vector, matrix_):
        # dot products of vector with every row, then per-row normalization
        sims = DotProdSimilarity()._sims_to_matrix(vector, matrix_)

        vector_norm = vector.norm()
        row_norms = vector_norm * matrix_.norm(1)
        # invert only the non-zero norms, leaving zero-norm rows at 0
        row_norms = nonzero_invert(row_norms)

        return sims.scale_rows(row_norms)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
from composes.similarity.similarity import Similarity


class DotProdSimilarity(Similarity):
    """
    Computes the scalar product (dot product) of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = <\\vec{u},\\vec{v}> = \\sum_iu_iv_i`

    """
    def _sim(self, v1, v2):
        # element-wise product summed over all components
        return v1.multiply(v2).sum()

    def _sims_to_matrix(self, vector, matrix_):
        # one matrix-vector product yields the dot product with every row
        return matrix_ * vector.transpose()
class EuclideanSimilarity(Similarity):
    """
    Computes the euclidean similarity of two vectors as the inverse of their
    euclidean distance.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{1}{||\\vec{u}-\\vec{v}|| + 1}`
    """

    def _sim(self, v1, v2):
        # the +1 keeps the result finite (and equal to 1) for identical vectors
        return 1 / (1 + (v1 - v2).norm())


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.similarity.similarity import Similarity


class LinSimilarity(Similarity):
    """
    Computes the Lin similarity of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{\\sum_{i \\in I}(u_i+v_i)}{\\sum_iu_i + \\sum_iv_i}`

    Where :math:`I=\\{i | u_i > 0 \\text{ and } v_i > 0\\}`, the set of components
    on which both vectors are strictly positive.

    """

    def _sim(self, v1, v2):

        # NOTE(review): to_ones presumably maps the non-zero entries of the
        # element-wise product to 1, giving an indicator of shared components
        # - verify against the Matrix implementation
        common = v1.multiply(v2)
        common.to_ones()
        denom = v1.sum() + v2.sum()

        if denom == 0:
            # both vectors sum to zero: similarity is defined as 0
            return 0
        else:
            return common.multiply(v1 + v2).sum() / np.double(denom)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.utils.matrix_utils import (
    assert_is_array_or_matrix,
    to_compatible_matrix_types,
)


class Similarity(object):
    """Base class for vector similarity measures.

    Subclasses implement _sim (pairwise similarity) and may override
    _sims_to_matrix (similarity of one vector to every row of a matrix).
    """

    def get_sim(self, v1, v2):
        # validate and normalize argument types before delegating
        assert_is_array_or_matrix(v1)
        assert_is_array_or_matrix(v2)

        # TODO: figure out where these asserts belong!!
        v1, v2 = to_compatible_matrix_types(v1, v2)
        v1.assert_same_shape(v2)

        return self._sim(v1, v2)

    def get_sims_to_matrix(self, vector, matrix_):
        # returns a column of similarities, one per row of matrix_
        assert_is_array_or_matrix(vector)
        assert_is_array_or_matrix(matrix_)

        vector, matrix_ = to_compatible_matrix_types(vector, matrix_)

        # vector must be a single row with the same number of columns
        if vector.shape[1] != matrix_.shape[1] or vector.shape[0] != 1:
            raise ValueError(
                'Inconsistent shapes {0} and {1}'.format(vector.shape, matrix_.shape)
            )

        return self._sims_to_matrix(vector, matrix_)

    def _sims_to_matrix(self, vector, matrix_):
        # generic fallback: one _sim call per row of matrix_
        result = np.zeros(shape=(matrix_.shape[0], 1))
        for i in range(matrix_.shape[0]):
            result[i] = self._sim(vector, matrix_[i, :])
        return type(matrix_)(result)
class DimensionalityReduction(object):
    """
    Base class for dimensionality reduction methods (e.g. Svd, Nmf).

    Stores the target (reduced) dimension; concrete subclasses implement
    the actual reduction and set a meaningful _name.
    """

    # default identifier; concrete subclasses override it ("svd", "nmf").
    # Replaces the original joke placeholder, which leaked into get_name()
    # and __str__() output for any direct use of the base class.
    _name = "dimensionality_reduction"

    def __init__(self, reduced_dimension):
        """
        Constructor.

        Args:
            reduced_dimension: int, strictly positive target dimension.

        Raises:
            ValueError: if reduced_dimension is not strictly positive.
        """
        if reduced_dimension <= 0:
            raise ValueError("Cannot reduce to non-positive dimensionality: %d"
                             % reduced_dimension)
        self._reduced_dimension = reduced_dimension

    def create_operation(self):
        """Wrap this reduction in a DimensionalityReductionOperation."""
        return DimensionalityReductionOperation(self)

    def get_reduced_dimension(self):
        return self._reduced_dimension

    def get_name(self):
        return self._name

    def __str__(self):
        return self._name

    name = property(get_name)
    reduced_dimension = property(get_reduced_dimension)
class Nmf(DimensionalityReduction):
    """
    Performs Non-negative Matrix Factorization to reduced dimension :math:`k`.

    Given an input non-negative matrix :math:`X`, it computes the decomposition:

    :math:`X \\approx WH` where W and H are non-negative matrices which minimize
    :math:`||X-WH||_{2}`

    It returns the matrix W.
    """

    _name = "nmf"

    def __init__(self, reduced_dimension):
        """
        Constructor.

        Args:
            reduced_dimension: int, strictly positive target dimension.
        """
        super(Nmf, self).__init__(reduced_dimension)

    def apply(self, matrix_):
        """Factor matrix_ ~ W * H and return (W, pinv(H)).

        The pseudo-inverse of H is returned so that new (peripheral) data
        can be projected into the same reduced space.
        """
        matrix_.assert_positive()
        # v_col initialization is the active strategy; the alternatives are
        # kept for experimentation
        #w_init, h_init = self.nndsvd_init(matrix_)
        w_init, h_init = self.v_col_init(matrix_)
        #w_init, h_init = self.random_init(matrix_)
        w, h = Linalg.nmf(matrix_, w_init, h_init)
        return w, Linalg.pinv(h)

    def random_init(self, matrix_):
        """Initialize W and H from randomly sampled columns/rows of the input."""
        # np.random.randint(low, high) excludes high, matching the former
        # random_integers(0, n - 1); random_integers was deprecated and has
        # been removed from recent NumPy releases
        rndcol = np.random.randint(0, matrix_.shape[1],
                                   self._reduced_dimension)

        rndrow = np.random.randint(0, matrix_.shape[0],
                                   self._reduced_dimension)

        #otherwise we would have had to convert to DenseMatrix/SparseMatrix
        #type(matrix_)(result)
        w = matrix_[:, rndcol]
        h = matrix_[rndrow, :]

        return w, h

    def v_col_init(self, matrix_):
        """Initialize each column of W (row of H) as the mean of ~1/5 of the
        input's columns (rows), sampled at random."""
        w = np.zeros((matrix_.shape[0], self._reduced_dimension))
        h = np.zeros((self._reduced_dimension, matrix_.shape[1]))

        #in case there are less than 5 rows or columns
        p_col = matrix_.shape[1]//5 + 1
        p_row = matrix_.shape[0]//5 + 1
        for i in range(self._reduced_dimension):

            rndcol = np.random.randint(0, matrix_.shape[1], p_col)
            rndrow = np.random.randint(0, matrix_.shape[0], p_row)

            w[:, i] = (matrix_[:, rndcol].sum(1)/float(p_col)).flatten()
            h[i, :] = (matrix_[rndrow, :].sum(0)/float(p_row)).flatten()

        # convert back to the concrete matrix type of the input
        w = type(matrix_)(w)
        h = type(matrix_)(h)

        return w, h

    def nndsvd_init(self, matrix_):
        """NNDSVD-style initialization: build W and H from the non-negative
        parts of the leading singular vectors of the input."""
        def matrix_abs(mat_):
            # |M| computed as M+ + (M+ - M), using only Matrix primitives
            mat_p = mat_.get_non_negative()
            mat_n_abs = mat_p - mat_
            return mat_p + mat_n_abs

        def padd_zeros(matrix_, axis, thickness):
            # extend with zero rows (axis=0) or zero columns (axis=1)
            matrix_type = type(matrix_)
            if axis == 0:
                append_mat = matrix_type(np.zeros((thickness, matrix_.shape[1])))
                return matrix_.vstack(append_mat)
            elif axis == 1:
                append_mat = matrix_type(np.zeros((matrix_.shape[0], thickness)))
                return matrix_.hstack(append_mat)

        u, s, v = Linalg.svd(matrix_, self._reduced_dimension)

        rank = u.shape[1]
        w = [[]] * rank
        h = [[]] * rank

        vt = v.transpose()

        # the leading singular pair is non-negative up to a sign flip
        w[0] = sqrt(s[0]) * matrix_abs(u[:, 0])
        h[0] = sqrt(s[0]) * matrix_abs(vt[0, :])

        for i in range(1, rank):
            uu = u[:, i]
            vv = vt[i, :]
            uup = uu.get_non_negative()
            uun = uup - uu
            vvp = vv.get_non_negative()
            vvn = vvp - vv

            n_uup = uup.norm()
            n_uun = uun.norm()
            n_vvp = vvp.norm()
            n_vvn = vvn.norm()

            # keep whichever sign pattern (positive or negative parts)
            # carries more of the singular pair's energy
            termp = n_uup * n_vvp
            termn = n_uun * n_vvn
            if termp >= termn:
                w[i] = sqrt(s[i] * termp) * uup / n_uup
                h[i] = sqrt(s[i] * termp) * vvp / n_vvp
            else:
                w[i] = sqrt(s[i] * termn) * uun / n_uun
                h[i] = sqrt(s[i] * termn) * vvn / n_vvn

        w = matrix_.nary_hstack(w)
        h = matrix_.nary_vstack(h)

        # clip numerical noise so the factors stay strictly non-negative
        w.remove_small_values(0.0000000001)
        h.remove_small_values(0.0000000001)

        if rank < self._reduced_dimension:
            # SVD returned fewer than k factors: pad with zeros
            w = padd_zeros(w, 1, self._reduced_dimension - rank)
            h = padd_zeros(h, 0, self._reduced_dimension - rank)
        return w, h
13 | 14 | Given an input matrix :math:`X`, it computes the decomposition: 15 | 16 | :math:`X = U \\Sigma V^{T}` 17 | 18 | It returns :math:`U \\Sigma` truncated to dimension :math:`min(k,rank(X))` 19 | """ 20 | 21 | _name = "svd" 22 | 23 | def __init__(self, reduced_dimension): 24 | ''' 25 | Constructor 26 | ''' 27 | super(Svd, self).__init__(reduced_dimension) 28 | 29 | def apply(self, matrix_): 30 | 31 | u, s, v = Linalg.svd(matrix_, self._reduced_dimension) 32 | return u.scale_columns(s), v 33 | 34 | -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/feature_selection/__init__.py -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from composes.semantic_space.operation import FeatureSelectionOperation 7 | 8 | class FeatureSelection(object): 9 | ''' 10 | classdocs 11 | ''' 12 | 13 | 14 | def __init__(self, reduced_dimension): 15 | 16 | if reduced_dimension <= 0: 17 | raise ValueError("Cannot reduce to non-positive dimensionality: %d" 18 | % reduced_dimension) 19 | self._reduced_dimension = reduced_dimension 20 | 21 | def create_operation(self): 22 | return FeatureSelectionOperation(self) 23 | 24 | def get_reduced_dimension(self): 25 | return self._reduced_dimension 26 | 27 | reduced_dimension = property(get_reduced_dimension) -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/top_feature_selection.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from warnings import warn 7 | from feature_selection import FeatureSelection 8 | 9 | class TopFeatureSelection(FeatureSelection): 10 | """ 11 | Sorts the columns of a space according to some criterion and returns a space 12 | containing only the top :math:`k` ones. 13 | 14 | Available criteria: 15 | 16 | sum: Default. Ranks columns according to the sum on their elements. 17 | 18 | length: Ranks columns according to their vector length. 19 | 20 | """ 21 | 22 | _name = "top_feature_selection" 23 | _valid_criteria = {"sum", "length"} 24 | 25 | def __init__(self, reduced_dimension, criterion='sum'): 26 | ''' 27 | Constructor 28 | ''' 29 | super(TopFeatureSelection, self).__init__(reduced_dimension) 30 | 31 | if criterion: 32 | if criterion not in self._valid_criteria: 33 | raise ValueError("Unrecognized criterion: %s" % criterion) 34 | self.criterion = criterion 35 | 36 | def apply(self, matrix_): 37 | 38 | if self.criterion == "sum": 39 | norm_function = matrix_.sum 40 | else: 41 | norm_function = matrix_.norm 42 | 43 | if self._reduced_dimension >= matrix_.shape[1]: 44 | warn("Reduced dimension larger than number of columns!") 45 | 46 | no_columns = min(self._reduced_dimension, matrix_.shape[1]) 47 | sorted_perm = matrix_.sorted_permutation(norm_function, 0) 48 | 49 | sorted_perm = sorted_perm[0:no_columns] 50 | matrix_ = matrix_[:, sorted_perm] 51 | 52 | return matrix_, sorted_perm 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/scaling/__init__.py 
class Normalization(Scaling):
    """
    Normalizes a space according to some criterion.

    Available criteria:

    sum: Default. The result matrix :math:`X` will satisfy: :math:`\\sum_{i,j} X_{ij}=1`

    length: The result matrix :math:`X` will satisfy: :math:`\\sqrt{\\sum_{i,j} X_{ij}^2}=1`

    """
    # BUGFIX: this was "row_normalization" — a copy-paste duplicate of
    # RowNormalization's name, which made the two transformations
    # indistinguishable in str()/logging output.
    _name = "normalization"
    _valid_criteria = ["sum", "length"]
    _uses_column_stats = True

    def __init__(self, criterion='sum'):
        '''
        Constructor.

        Args:
            criterion: "sum" (default) or "length".

        Raises:
            ValueError: on an unrecognized criterion.
        '''
        # BUGFIX: validate unconditionally; the old `if criterion:` guard let
        # falsy values ("" / None) through with self.criterion never assigned,
        # deferring the failure to an AttributeError inside apply().
        if criterion not in self._valid_criteria:
            raise ValueError("Unrecognized criterion: %s" % criterion)
        self.criterion = criterion

    def apply(self, matrix_, total=None):
        """
        Scale matrix_ so that its global sum (or length) becomes 1.

        Args:
            matrix_ (Matrix): input matrix.
            total: precomputed sum/length of the core matrix, supplied when
                matrix_ is a peripheral matrix; computed from matrix_ if None.

        Returns:
            Matrix: the normalized matrix (returned unchanged, with a
            warning, when the normalizer is 0).
        """
        if total is None:
            if self.criterion == "length":
                total = matrix_.norm()
            else:
                total = matrix_.sum()

        if total == 0:
            warn("Could not normalize: sum/length of matrix is 0.")
            return matrix_

        matrix_ = (1 / double(total)) * matrix_
        return matrix_

    def get_column_stats(self, matrix_):
        """Return the statistic peripheral spaces reuse: the global sum/length."""
        if self.criterion == "length":
            return matrix_.norm()
        else:
            return matrix_.sum()
8 | 9 | :math:`plmi(r,c)=ppmi(r,c)count(r,c)` 10 | 11 | """ 12 | 13 | _name = "plmi" 14 | _uses_column_stats = True 15 | 16 | def apply(self, matrix_, column_marginal=None): 17 | return matrix_.multiply(PpmiWeighting().apply(matrix_, 18 | column_marginal)) 19 | 20 | 21 | def get_column_stats(self, matrix_): 22 | return matrix_.sum(0) -------------------------------------------------------------------------------- /src/composes/transformation/scaling/plog_weighting.py: -------------------------------------------------------------------------------- 1 | 2 | from scaling import Scaling 3 | 4 | class PlogWeighting(Scaling): 5 | """ 6 | Positive Log Weighting 7 | 8 | :math:`plog(r,c)= log(r,c) \\text{ if } log(r,c) \\geq 0 \\text{ else } 0` 9 | """ 10 | 11 | _name = "plog" 12 | 13 | def apply(self, matrix_): 14 | ''' 15 | Performs positive log weighting. 16 | 17 | Args: 18 | matrix_ (Matrix): Input matrix 19 | column_marginal (array): column marginals of the core matrix if the matrix is a peripheral matrix 20 | 21 | Returns: 22 | Matrix: the matrix after applying plog 23 | 24 | ''' 25 | matrix_ = matrix_.copy() 26 | matrix_.plog() 27 | return matrix_ 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/ppmi_weighting.py: -------------------------------------------------------------------------------- 1 | 2 | from scaling import Scaling 3 | from epmi_weighting import EpmiWeighting 4 | 5 | class PpmiWeighting(Scaling): 6 | """ 7 | Positive Point-wise Mutual Information. 
8 | 9 | 10 | :math:`pmi(r,c) = log\\frac{P(r,c)}{P(r)P(c)}` 11 | 12 | :math:`ppmi(r,c)= pmi(r,c) \\text{ if } pmi(r,c)\\geq 0 \\text{ else } 0` 13 | """ 14 | 15 | _name = "ppmi" 16 | _uses_column_stats = True 17 | 18 | def apply(self, matrix_, column_marginal=None): 19 | 20 | matrix_ = EpmiWeighting().apply(matrix_, column_marginal) 21 | matrix_.plog() 22 | return matrix_ 23 | 24 | def get_column_stats(self, matrix_): 25 | return matrix_.sum(0) 26 | 27 | """ 28 | :math:`ppmi(r,c)=\\begin{cases}pmi(rc) & \\text{if }pmi(r,c)\\geq0 29 | 0 & \\text{otherwise}\\end{cases}` 30 | """ -------------------------------------------------------------------------------- /src/composes/transformation/scaling/row_normalization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 4, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from scaling import Scaling 8 | from composes.utils.py_matrix_utils import nonzero_invert 9 | 10 | class RowNormalization(Scaling): 11 | """ 12 | Normalizes the rows of a space according to a some criterion. 13 | 14 | Available criteria: 15 | 16 | length: Default. 
Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sqrt{\\sum_j X_{ij}^2}=1` 17 | 18 | 19 | sum: Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sum_j X_{ij}=1` 20 | 21 | """ 22 | _name = "row_normalization" 23 | _valid_criteria = ["sum", "length"] 24 | 25 | def __init__(self, criterion='length'): 26 | ''' 27 | Constructor 28 | ''' 29 | if criterion: 30 | if criterion not in self._valid_criteria: 31 | raise ValueError("Unrecognized criterion: %s" % criterion) 32 | self.criterion = criterion 33 | 34 | 35 | def apply(self, matrix_): 36 | 37 | if self.criterion == "length": 38 | row_norms = matrix_.norm(axis=1) 39 | else: 40 | row_norms = matrix_.sum(axis=1) 41 | 42 | inv_row_norm = nonzero_invert(row_norms) 43 | matrix_ = matrix_.scale_rows(inv_row_norm) 44 | return matrix_ 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/scaling.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from composes.semantic_space.operation import ScalingOperation 8 | 9 | class Scaling(object): 10 | ''' 11 | classdocs 12 | ''' 13 | _name = "we are NOT stupid" 14 | _uses_column_stats = False 15 | 16 | def get_name(self): 17 | return self._name 18 | 19 | def get_uses_column_stats(self): 20 | return self._uses_column_stats 21 | 22 | def create_operation(self): 23 | return ScalingOperation(self) 24 | 25 | def __str__(self): 26 | return self._name 27 | 28 | name = property(get_name) 29 | uses_column_stats = property(get_uses_column_stats) -------------------------------------------------------------------------------- /src/composes/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/utils/__init__.py -------------------------------------------------------------------------------- /src/composes/utils/crossvalidation_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 9, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from random import shuffle 8 | 9 | def get_split_indices(range_len, fold): 10 | 11 | if fold <= 0: 12 | raise ValueError("Strictly positive number of folds required, received %s:" 13 | % fold) 14 | 15 | indices_list = [] 16 | if range_len < fold: 17 | return get_split_indices(range_len, range_len) 18 | 19 | range_ = range(range_len) 20 | shuffle(range_) 21 | current_index = 0 22 | for i in range(fold): 23 | if i < len(range_)%fold: 24 | slice_length = range_len // fold + 1 25 | else: 26 | slice_length = range_len // fold 27 | 28 | indices_list.append(range_[current_index:current_index + slice_length]) 29 | current_index += slice_length 30 | 31 | return indices_list 32 | 33 | def get_submatrix_list(matrix_, indices_list): 34 | return [matrix_[indices, :] for indices in indices_list] 35 | 36 | -------------------------------------------------------------------------------- /src/composes/utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 21, 2013 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from composes.exception.invalid_argument_error import InvalidArgumentError 7 | 8 | 9 | def assert_is_instance(object_, class_): 10 | if not isinstance(object_, class_): 11 | raise TypeError("expected %s, received %s" % (class_, type(object_))) 12 | 13 | 14 | def get_partitions(sorted_list, min_samples): 15 | prev_idx = 0 16 | range_list = [] 17 | for i in range(1, len(sorted_list)): 18 | if sorted_list[i] != sorted_list[i - 1]: 19 | if i - prev_idx >= 
min_samples: 20 | range_list.append((prev_idx, i)) 21 | 22 | prev_idx = i 23 | 24 | if len(sorted_list) - prev_idx >= min_samples: 25 | range_list.append((prev_idx, len(sorted_list))) 26 | 27 | keys = [sorted_list[range_list[i][0]] for i in xrange(len(range_list))] 28 | 29 | return keys, range_list -------------------------------------------------------------------------------- /src/composes/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from numpy import double 8 | import logging 9 | from composes.utils.io_utils import create_parent_directories 10 | 11 | def config_logging(file_name, level = logging.INFO, format_ =""): 12 | if not file_name is None: 13 | create_parent_directories(file_name) 14 | logging.basicConfig(filename=file_name, level=level, format=format_) 15 | logging.debug("start logging") 16 | 17 | 18 | def get_ident(delim, ident_level): 19 | return delim * ident_level 20 | 21 | def print_matrix_info(logger_, matrix_, ident_level, intro_string): 22 | delim = " " 23 | ident = get_ident(delim, ident_level) 24 | logger_string = ident + intro_string 25 | ident = ident + delim 26 | 27 | logger_string += ("\n%sMatrix type:%s" % (ident, type(matrix_).__name__)) 28 | logger_string += ("\n%sMatrix shape:%sx%s" % (ident, matrix_.shape[0], 29 | matrix_.shape[1])) 30 | 31 | if type(matrix_).__name__ == "SparseMatrix": 32 | perc_nnz = 100 * matrix_.mat.nnz/double(matrix_.shape[0]*matrix_.shape[1]) 33 | logger_string += ("\n%sPerc. 
non-zero entries:%d" % (ident, perc_nnz)) 34 | 35 | logger_.info(logger_string) 36 | 37 | 38 | def get_learner_info(learner, ident): 39 | logger_string = "" 40 | 41 | if hasattr(learner, '_intercept'): 42 | logger_string += ("\n%sUsing intercept:%s" % (ident, learner._intercept)) 43 | 44 | if hasattr(learner, '_crossvalidation'): 45 | logger_string += ("\n%sUsing crossvalidation:%s" % (ident, learner._crossvalidation)) 46 | 47 | if learner._crossvalidation and hasattr(learner, '_folds'): 48 | logger_string += ("\n%sUsing number of folds:%s" % (ident, learner._folds)) 49 | 50 | return logger_string 51 | 52 | def print_composition_model_info(logger_, model, ident_level, intro_string): 53 | 54 | delim = " " 55 | ident = get_ident(delim, ident_level) 56 | logger_string = ident + intro_string 57 | ident = ident + delim 58 | 59 | logger_.info(logger_string) 60 | 61 | print_name(logger_, model, ident_level, "Composition model type:") 62 | 63 | logger_string = "" 64 | if hasattr(model, '_regression_learner'): 65 | logger_string += ("\n%sUsing regression:%s" % (ident, 66 | type(model.regression_learner).__name__)) 67 | logger_string += get_learner_info(model.regression_learner, ident + delim) 68 | 69 | logger_.info(logger_string) 70 | 71 | def print_transformation_info(logger_, trans, ident_level, intro_string): 72 | delim = " " 73 | ident = get_ident(delim, ident_level) 74 | logger_string = ident + intro_string 75 | ident = ident + delim 76 | 77 | logger_string += ("\n%sTransformation type:%s" % (ident, type(trans).__name__)) 78 | 79 | if hasattr(trans, '_reduced_dimension'): 80 | logger_string += ("\n%sReduced dimension:%s" % (ident, trans.reduced_dimension)) 81 | 82 | 83 | logger_.info(logger_string) 84 | 85 | def print_info(logger_, ident_level, text): 86 | delim = " " 87 | ident = get_ident(delim, ident_level) 88 | logger_string = ident + "" 89 | 90 | logger_string += "\n%s%s" % (ident, text) 91 | logger_.info(logger_string) 92 | 93 | def print_name(logger_, object_, 
ident_level, intro_string): 94 | delim = " " 95 | ident = get_ident(delim, ident_level) 96 | logger_string = ident + intro_string 97 | ident = ident + delim 98 | 99 | logger_string += ("\n%s%s" % (ident, type(object_).__name__)) 100 | 101 | logger_.info(logger_string) 102 | 103 | def print_time_info(logger_, end, beg, ident_level): 104 | delim = " " 105 | ident = get_ident(delim, ident_level) 106 | logger_string = ident 107 | logger_string += ("\n%sTiming:%s seconds" % (ident, end - beg)) 108 | 109 | logger_.info(logger_string) 110 | 111 | -------------------------------------------------------------------------------- /src/composes/utils/matrix_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from composes.matrix.sparse_matrix import SparseMatrix 4 | from composes.matrix.dense_matrix import DenseMatrix 5 | from composes.matrix.matrix import Matrix 6 | from scipy.sparse import issparse 7 | from py_matrix_utils import is_array 8 | from warnings import warn 9 | 10 | def to_matrix(matrix_): 11 | """ 12 | Converts an array-like structure to a DenseMatrix/SparseMatrix 13 | """ 14 | if issparse(matrix_): 15 | return SparseMatrix(matrix_) 16 | else: 17 | return DenseMatrix(matrix_) 18 | 19 | def is_array_or_matrix(data): 20 | return is_array(data) or isinstance(data, Matrix) 21 | 22 | 23 | def assert_is_array_or_matrix(data): 24 | if not is_array_or_matrix(data): 25 | raise TypeError("expected array-like or matrix, received %s" 26 | % (type(data))) 27 | 28 | def padd_matrix(matrix_, axis, value=1): 29 | matrix_type = type(matrix_) 30 | if axis == 0: 31 | append_mat = matrix_type(np.ones((1, matrix_.shape[1]))*value) 32 | return matrix_.vstack(append_mat) 33 | elif axis == 1: 34 | append_mat = matrix_type(np.ones((matrix_.shape[0], 1))*value) 35 | return matrix_.hstack(append_mat) 36 | else: 37 | raise ValueError("Invalid axis value:%s" % axis) 38 | 39 | 40 | def assert_same_shape(matrix1, matrix2, 
axis=None): 41 | 42 | if axis is None: 43 | if matrix1.shape != matrix2.shape: 44 | raise ValueError("Inconsistent shapes") 45 | else: 46 | if not axis in [0, 1]: 47 | raise ValueError("Invalid axis value: %s, expected 0 or 1." % axis) 48 | if matrix1.shape[axis] != matrix2.shape[axis]: 49 | raise ValueError("Inconsistent shapes") 50 | 51 | 52 | def to_compatible_matrix_types(v1, v2): 53 | 54 | if isinstance(v1, Matrix) and isinstance(v2, Matrix): 55 | v2 = type(v1)(v2) 56 | elif not isinstance(v1, Matrix) and isinstance(v2, Matrix): 57 | v1 = type(v2)(v1) 58 | elif not isinstance(v2, Matrix) and isinstance(v1, Matrix): 59 | v2 = type(v1)(v2) 60 | else: 61 | v1 = to_matrix(v1) 62 | v2 = type(v1)(v2) 63 | 64 | return v1, v2 65 | 66 | 67 | 68 | def get_type_of_largest(matrix_list): 69 | max_dim = 0 70 | max_type = None 71 | for matrix_ in matrix_list: 72 | if matrix_.shape[0] * matrix_.shape[1] > max_dim: 73 | max_type = type(matrix_) 74 | max_dim = matrix_.shape[0] * matrix_.shape[1] 75 | 76 | return max_type 77 | 78 | def resolve_type_conflict(matrix_list, matrix_type): 79 | new_matrix_list = [] 80 | 81 | if matrix_type_conflict(matrix_list): 82 | warn("Efficiency warning: matrices should have the same dense/sparse type!") 83 | for matrix_ in matrix_list: 84 | new_matrix_list.append(matrix_type(matrix_)) 85 | return new_matrix_list 86 | 87 | return list(matrix_list) 88 | 89 | 90 | def matrix_type_conflict(matrix_list): 91 | 92 | if not matrix_list: 93 | return False 94 | 95 | matrix_type = type(matrix_list[0]) 96 | for matrix_ in matrix_list: 97 | if not isinstance(matrix_, matrix_type): 98 | return True 99 | 100 | return False 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/composes/utils/mem_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 21, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | """ 8 | Wrappers around 
psutil functions that display memory usage information. 9 | """ 10 | import numpy as np 11 | from os import getpid 12 | import psutil 13 | 14 | def get_mem_usage(): 15 | p = psutil.Process(getpid()) 16 | return p.get_memory_info()[0]/np.double(1024*1024) -------------------------------------------------------------------------------- /src/composes/utils/num_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from numbers import Number 8 | from numbers import Integral 9 | import numpy as np 10 | 11 | def is_numeric(operand): 12 | return isinstance(operand, (Number, np.number)) 13 | 14 | def is_integer(operand): 15 | return isinstance(operand, Integral) 16 | -------------------------------------------------------------------------------- /src/composes/utils/py_matrix_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 19, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import numpy as np 7 | from scipy.sparse import spdiags 8 | 9 | 10 | def array_to_csr_diagonal(array_): 11 | #array_ can't be a sparse matrix, if it is dense, it has to be a row matrix 12 | #(i.e. shape = (1, x)) 13 | 14 | flat_array = array_.flatten() 15 | array_size = flat_array.size 16 | csr_diag = spdiags(flat_array, [0], array_size, array_size, format = 'csr') 17 | return csr_diag 18 | 19 | def is_array(operand): 20 | return hasattr(operand, 'dtype') and hasattr(operand, 'shape') 21 | 22 | 23 | def nonzero_invert(matrix_): 24 | ''' 25 | Performs 1/x for all x, non-zero elements of the matrix. 
26 | 27 | Params: 28 | matrix_: np.matrix 29 | ''' 30 | 31 | matrix_ = matrix_.astype(np.double) 32 | matrix_[matrix_ != 0] = np.array(1.0/matrix_[matrix_ != 0]).flatten() 33 | return matrix_ 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/composes/utils/regression_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from composes.matrix.linalg import Linalg 3 | 4 | 5 | class RegressionLearner(object): 6 | """ 7 | Implements a set of regression methods. 8 | 9 | Supported regression methods are least squares regression and 10 | ridge regression. Ridge regression can be used with generalized 11 | cross validation. (Hastie, Tibshirani and Friedman, Second edition, 12 | page 244) 13 | """ 14 | 15 | 16 | def __init__(self): 17 | ''' 18 | Constructor 19 | ''' 20 | 21 | def has_intercept(self): 22 | return self._intercept 23 | 24 | 25 | class LstsqRegressionLearner(RegressionLearner): 26 | """ 27 | This class performs Least Squares Regression. 28 | 29 | It finds the matrix X which solves: 30 | 31 | :math:`X = argmin(||AX - B||_2)` 32 | 33 | It can be used with intercept or without (by default intercept=True). 34 | 35 | """ 36 | 37 | def __init__(self, intercept=True): 38 | self._intercept = intercept 39 | 40 | def train(self, matrix_a, matrix_b): 41 | return Linalg.lstsq_regression(matrix_a, matrix_b, self._intercept) 42 | 43 | 44 | class RidgeRegressionLearner(RegressionLearner): 45 | """ 46 | This class performs Ridge Regression. 47 | 48 | It finds the matrix X which solves: 49 | 50 | :math:`X = argmin(||AX - B||_2 + \\lambda||X||_2)` 51 | 52 | It can be used with intercept or without (by default intercept=True). 53 | Cross validation can be used with default :math:`\\lambda` range of 54 | :math:`linspace(0, 5, 11)`. By default Generalized cross validation is performed. 
55 | If cross validation is set False it requires the input of a :math:`\\lambda` value. 56 | 57 | """ 58 | 59 | def __init__(self, intercept=True, param_range=None, crossvalidation=True, param=None): 60 | self._intercept = intercept 61 | self._param_range = param_range if param_range is not None else np.linspace(0.0, 5, 11) 62 | 63 | self._param = param 64 | self._crossvalidation = crossvalidation 65 | 66 | if param: 67 | self._crossvalidation = False 68 | self._param = param 69 | 70 | if not self._crossvalidation and self._param is None: 71 | raise ValueError("Cannot run (no-crossvalidation) RidgeRegression with no lambda value!") 72 | 73 | 74 | def train(self, matrix_a, matrix_b): 75 | """ 76 | If cross validation is set to True, it performs generalized 77 | cross validation. (Hastie, Tibshirani and Friedman, Second edition, 78 | page 244). 79 | """ 80 | 81 | if not self._crossvalidation: 82 | return Linalg.ridge_regression(matrix_a, matrix_b, self._param, 83 | self._intercept)[0] 84 | 85 | else: 86 | min_err_param = 0 87 | min_err = np.Inf 88 | gcv_err = np.Inf 89 | 90 | N = matrix_a.shape[0] 91 | for param in self._param_range: 92 | 93 | mat_x, S_trace, err1 = Linalg.ridge_regression(matrix_a, matrix_b, param, 94 | self._intercept) 95 | 96 | nom = pow(1 - S_trace / N, 2) * N 97 | if nom != 0: 98 | gcv_err = (err1 * err1) / nom 99 | 100 | if gcv_err < min_err: 101 | min_err = gcv_err 102 | min_err_param = param 103 | 104 | #print "lambda:", min_err_param 105 | return Linalg.ridge_regression(matrix_a, matrix_b, min_err_param, 106 | self._intercept)[0] 107 | -------------------------------------------------------------------------------- /src/composes/utils/scoring_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | import numpy as np 8 | from scipy import stats 9 | 10 | 11 | def score(gold, prediction, method): 12 | if 
len(gold) != len(prediction): 13 | raise ValueError("The two arrays must have the same length!") 14 | 15 | gold = np.array(gold, dtype=np.double) 16 | prediction = np.array(prediction, dtype=np.double) 17 | 18 | if method == "pearson": 19 | return pearson(gold, prediction)[0] 20 | elif method == "spearman": 21 | return spearman(gold, prediction)[0] 22 | elif method == "auc": 23 | return auc(gold, prediction) 24 | else: 25 | raise NotImplementedError("Unknown scoring measure:%s" % method) 26 | 27 | def pearson(gold, prediction): 28 | return stats.pearsonr(gold, prediction) 29 | 30 | def spearman(gold, prediction): 31 | return stats.spearmanr(gold, prediction, None) 32 | 33 | def auc(gold, prediction): 34 | 35 | positive = float(gold[gold == 1].size) 36 | negative = float(gold.size - positive) 37 | 38 | total_count = gold.size 39 | point_set = np.empty(total_count, dtype = [('gold',float),('score',float)]) 40 | for i in range(total_count): 41 | if not gold[i] in (0,1): 42 | raise ValueError("For evaluating AUC, gold scores are required to be 0 or 1.") 43 | point_set[i]=(gold[i], prediction[i]) 44 | 45 | point_set.sort(order = 'score') 46 | 47 | xi = 1.0 48 | yi = 1.0 49 | xi_old = 1.0 50 | true_positive = positive 51 | false_positive = negative 52 | auc = 0 53 | 54 | for i in range(total_count): 55 | if (point_set[i][0] == 1): 56 | true_positive -= 1 57 | yi = true_positive / positive 58 | else: 59 | false_positive -= 1 60 | xi = false_positive / negative 61 | auc += (xi_old - xi) * yi 62 | xi_old = xi 63 | 64 | return auc 65 | -------------------------------------------------------------------------------- /src/composes/utils/space_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | 8 | def list2dict(list_): 9 | return_dict = {} 10 | 11 | for idx, word in enumerate(list_): 12 | if word in return_dict: 13 | raise 
ValueError("duplicate string found in list: %s" % (word)) 14 | return_dict[word] = idx 15 | 16 | return return_dict 17 | 18 | def add_items_to_dict(dict_, list_): 19 | 20 | no_els = len(dict_) 21 | for idx, el in enumerate(list_): 22 | if el in dict_: 23 | raise ValueError("Found duplicate keys when appending elements to\ 24 | dictionary.") 25 | dict_[el] = no_els + idx 26 | return dict_ 27 | 28 | def assert_dict_match_list(dict_, list_): 29 | 30 | match_err = ValueError("expected matching dictionary and list structures.") 31 | 32 | if not len(list_) == len(dict_): 33 | raise match_err 34 | for (k, v) in dict_.iteritems(): 35 | if not list_[v] == k: 36 | raise match_err 37 | 38 | 39 | def assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id): 40 | 41 | no_rows = matrix_.mat.shape[0] 42 | no_cols = matrix_.mat.shape[1] 43 | 44 | has_column_maps = column2id or id2column 45 | 46 | if not no_rows == len(id2row) or not no_rows == len(row2id): 47 | raise ValueError("expected consistent shapes: %d %d %d" 48 | % (no_rows, len(id2row), len(row2id))) 49 | 50 | if (has_column_maps and 51 | (not no_cols == len(id2column) or not no_cols == len(column2id))): 52 | raise ValueError("expected consistent shapes: %d %d %d" 53 | % (no_cols, len(id2column), len(column2id))) 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/__init__.py -------------------------------------------------------------------------------- /src/examples/cmd_ex01.sh: -------------------------------------------------------------------------------- 1 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm -o ../examples/data/out/ 2 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm 
--output_format dm -w ppmi,plog -r svd_2 -n none,row -o ../examples/data/out/ -l ../examples/data/out/ex01.log 3 | #or 4 | python2.7 build_core_space.py ../examples/data/in/config1.cfg 5 | python2.7 build_core_space.py ../examples/data/in/config2.cfg 6 | -------------------------------------------------------------------------------- /src/examples/cmd_ex02.sh: -------------------------------------------------------------------------------- 1 | python2.7 build_peripheral_space.py -i ../examples/data/in/ex05 --input_format sm -o ../examples/data/out/ -c ../examples/data/out/CORE_SS.ex01.ppmi.svd_2.pkl 2 | -------------------------------------------------------------------------------- /src/examples/cmd_ex03.sh: -------------------------------------------------------------------------------- 1 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs1.txt -c 1,2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos,euclidean 2 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs2.txt -c 1,2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos,euclidean 3 | -------------------------------------------------------------------------------- /src/examples/cmd_ex04.sh: -------------------------------------------------------------------------------- 1 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos 2 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos -------------------------------------------------------------------------------- /src/examples/cmd_ex05.sh: -------------------------------------------------------------------------------- 1 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m dilation --lambda 2 -a ../examples/data/out/ex01.pkl -o 
../examples/data/out/ --output_format dm 2 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m mult -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm 3 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm 4 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp2.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ --output_format dm 5 | -------------------------------------------------------------------------------- /src/examples/cmd_ex06.sh: -------------------------------------------------------------------------------- 1 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True 2 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -r ridge --lambda 0.0 -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True -------------------------------------------------------------------------------- /src/examples/cmd_ex07.sh: -------------------------------------------------------------------------------- 1 | python2.7 evaluate_similarities.py -i ../examples/data/in/sim_data.txt -c 3,5 -m pearson,spearman 2 | python2.7 evaluate_similarities.py --in_dir ../examples/data/in/ --filter sim_data -c 3,5 -m pearson,spearman 3 | -------------------------------------------------------------------------------- /src/examples/data/in/config1.cfg: -------------------------------------------------------------------------------- 1 | [build_core_space] 2 | 3 | #input file 4 | input=../examples/data/in/ex01 5 | 6 | # output directory 7 | 
output=../examples/data/out/ 8 | 9 | # input format 10 | input_format=sm 11 | 12 | -------------------------------------------------------------------------------- /src/examples/data/in/config2.cfg: -------------------------------------------------------------------------------- 1 | [build_core_space] 2 | 3 | #input file 4 | input=../examples/data/in/ex01 5 | 6 | # output directory 7 | output=../examples/out/ 8 | 9 | # input format 10 | input_format=sm 11 | 12 | # weighing schemes 13 | weighting=ppmi,plog 14 | 15 | # reductions 16 | reduction=svd_2 17 | 18 | # normalizations 19 | normalization=none,row 20 | 21 | # additional output format 22 | output_format=dm 23 | 24 | # log file 25 | log=../examples/data/out/ex01.log -------------------------------------------------------------------------------- /src/examples/data/in/data_to_comp.txt: -------------------------------------------------------------------------------- 1 | book book book__book 2 | car book car__book 3 | car car car__car 4 | -------------------------------------------------------------------------------- /src/examples/data/in/data_to_comp2.txt: -------------------------------------------------------------------------------- 1 | book history_book book__history_book 2 | car sports_car car__sports_car 3 | book sports_car book__sports_book 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.rows: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.sm: -------------------------------------------------------------------------------- 1 | car red 5 2 | book 
readable 6 3 | car blue 1 4 | book red 3 5 | -------------------------------------------------------------------------------- /src/examples/data/in/ex05.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex05.sm: -------------------------------------------------------------------------------- 1 | sports_car red 5 2 | history_book readable 1 3 | history_book red 1 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex10.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex10.rows: -------------------------------------------------------------------------------- 1 | book 2 | car 3 | bike 4 | good -------------------------------------------------------------------------------- /src/examples/data/in/ex10.sm: -------------------------------------------------------------------------------- 1 | car red 5 2 | book readable 6 3 | car blue 1 4 | book red 3 5 | bike blue 4 6 | bike red 4 7 | good readable 3 8 | good blue 2 9 | good red 6 -------------------------------------------------------------------------------- /src/examples/data/in/ex19-n.cols: -------------------------------------------------------------------------------- 1 | book 2 | car -------------------------------------------------------------------------------- /src/examples/data/in/ex19-n.sm: -------------------------------------------------------------------------------- 1 | man book 5 2 | man car 2 3 | boy book 7 4 | boy car 1 5 | woman book 5 6 | woman car 2 7 | -------------------------------------------------------------------------------- /src/examples/data/in/ex19-svo.cols: 
-------------------------------------------------------------------------------- 1 | book 2 | car -------------------------------------------------------------------------------- /src/examples/data/in/ex19-svo.sm: -------------------------------------------------------------------------------- 1 | man_hate_boy car 4 2 | man_hate_boy book 3 3 | man_hate_man car 10 4 | boy_hate_boy book 2 5 | boy_hate_man car 6 6 | boy_hate_boy car 11 7 | -------------------------------------------------------------------------------- /src/examples/data/in/sim_data.txt: -------------------------------------------------------------------------------- 1 | book history_book 0.894427191 other_field 4 2 | car sports_car 0.980580675691 other_field 4 3 | book sports_car 0.4472135955 other_field 6 -------------------------------------------------------------------------------- /src/examples/data/in/sim_data2.txt: -------------------------------------------------------------------------------- 1 | book history_book 0.894427191 other_field 6 2 | car sports_car 0.980580675691 other_field 4 3 | book sports_car 0.4472135955 other_field 5 -------------------------------------------------------------------------------- /src/examples/data/in/sim_data3.txt: -------------------------------------------------------------------------------- 1 | book book 0.894427191 other_field 4 2 | car car 0.980580675691 other_field 4 3 | book car 0.4472135955 other_field 6 -------------------------------------------------------------------------------- /src/examples/data/in/train_data.txt: -------------------------------------------------------------------------------- 1 | book_function car my_car_book 2 | book_function book 2x_book 3 | -------------------------------------------------------------------------------- /src/examples/data/in/word_list.txt: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | 
-------------------------------------------------------------------------------- /src/examples/data/in/word_pairs1.txt: -------------------------------------------------------------------------------- 1 | book book 2 | car book 3 | car car 4 | -------------------------------------------------------------------------------- /src/examples/data/in/word_pairs2.txt: -------------------------------------------------------------------------------- 1 | book history_book 2 | car sports_car 3 | book sports_car 4 | 5 | -------------------------------------------------------------------------------- /src/examples/data/in/word_sims.txt: -------------------------------------------------------------------------------- 1 | book book 7 2 | car car 7 3 | book car 2 -------------------------------------------------------------------------------- /src/examples/data/out/COMPOSED_SS.ex10.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'my_car_book' 17 | p8 18 | aS'my_special_book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | sS'_id2column' 36 | p17 37 | (lp18 38 | g14 39 | ag12 40 | ag13 41 | asS'_element_shape' 42 | p19 43 | (I3 44 | tp20 45 | sS'_cooccurrence_matrix' 46 | p21 47 | g0 48 | (ccomposes.matrix.sparse_matrix 49 | SparseMatrix 50 | p22 51 | g2 52 | Ntp23 53 | Rp24 54 | (dp25 55 | S'_mat' 56 | p26 57 | g0 58 | (cscipy.sparse.csr 59 | csr_matrix 60 | p27 61 | g2 62 | Ntp28 63 | Rp29 64 | (dp30 65 | S'format' 66 | p31 67 | S'csr' 68 | p32 69 | sS'_shape' 70 | p33 71 | (I2 72 | I3 73 | tp34 74 | sS'indptr' 75 | p35 76 | cnumpy.core.multiarray 77 | _reconstruct 78 | p36 79 | (cnumpy 80 | ndarray 81 | p37 
82 | (I0 83 | tp38 84 | S'b' 85 | p39 86 | tp40 87 | Rp41 88 | (I1 89 | (I3 90 | tp42 91 | cnumpy 92 | dtype 93 | p43 94 | (S'i4' 95 | p44 96 | I0 97 | I1 98 | tp45 99 | Rp46 100 | (I3 101 | S'<' 102 | p47 103 | NNNI-1 104 | I-1 105 | I0 106 | tp48 107 | bI00 108 | S'\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00' 109 | p49 110 | tp50 111 | bsS'indices' 112 | p51 113 | g36 114 | (g37 115 | (I0 116 | tp52 117 | g39 118 | tp53 119 | Rp54 120 | (I1 121 | (I5 122 | tp55 123 | g46 124 | I00 125 | S'\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00' 126 | p56 127 | tp57 128 | bsS'maxprint' 129 | p58 130 | I50 131 | sS'data' 132 | p59 133 | g36 134 | (g37 135 | (I0 136 | tp60 137 | g39 138 | tp61 139 | Rp62 140 | (I1 141 | (I5 142 | tp63 143 | g43 144 | (S'f8' 145 | p64 146 | I0 147 | I1 148 | tp65 149 | Rp66 150 | (I3 151 | S'<' 152 | p67 153 | NNNI-1 154 | I-1 155 | I0 156 | tp68 157 | bI00 158 | S'\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00(@' 159 | p69 160 | tp70 161 | bsbsbsS'_row2id' 162 | p71 163 | (dp72 164 | g8 165 | I0 166 | sg9 167 | I1 168 | ssb. 
-------------------------------------------------------------------------------- /src/examples/data/out/PER_SS.ex05.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.peripheral_space 5 | PeripheralSpace 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'sports_car' 17 | p8 18 | aS'history_book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | g0 36 | (ccomposes.semantic_space.operation 37 | ScalingOperation 38 | p17 39 | g2 40 | Ntp18 41 | Rp19 42 | (dp20 43 | S'_ScalingOperation__scaling' 44 | p21 45 | g0 46 | (ccomposes.transformation.scaling.ppmi_weighting 47 | PpmiWeighting 48 | p22 49 | g2 50 | Ntp23 51 | Rp24 52 | sS'_ScalingOperation__column_stats' 53 | p25 54 | cnumpy.core.multiarray 55 | _reconstruct 56 | p26 57 | (cnumpy.matrixlib.defmatrix 58 | matrix 59 | p27 60 | (I0 61 | tp28 62 | S'b' 63 | p29 64 | tp30 65 | Rp31 66 | (I1 67 | (I1 68 | I3 69 | tp32 70 | cnumpy 71 | dtype 72 | p33 73 | (S'f8' 74 | p34 75 | I0 76 | I1 77 | tp35 78 | Rp36 79 | (I3 80 | S'<' 81 | p37 82 | NNNI-1 83 | I-1 84 | I0 85 | tp38 86 | bI01 87 | S'\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@' 88 | p39 89 | tp40 90 | bsbasS'_id2column' 91 | p41 92 | (lp42 93 | g14 94 | ag12 95 | ag13 96 | asS'_element_shape' 97 | p43 98 | (I3 99 | tp44 100 | sS'_cooccurrence_matrix' 101 | p45 102 | g0 103 | (ccomposes.matrix.sparse_matrix 104 | SparseMatrix 105 | p46 106 | g2 107 | Ntp47 108 | Rp48 109 | (dp49 110 | S'_mat' 111 | p50 112 | g0 113 | (cscipy.sparse.csr 114 | csr_matrix 115 | p51 116 | g2 117 | Ntp52 118 | Rp53 119 | (dp54 120 | S'format' 121 | p55 122 | S'csr' 123 | p56 124 | sS'_shape' 125 | p57 126 | (I2 127 | I3 128 | tp58 129 | 
sS'indptr' 130 | p59 131 | g26 132 | (cnumpy 133 | ndarray 134 | p60 135 | (I0 136 | tp61 137 | g29 138 | tp62 139 | Rp63 140 | (I1 141 | (I3 142 | tp64 143 | g33 144 | (S'i4' 145 | p65 146 | I0 147 | I1 148 | tp66 149 | Rp67 150 | (I3 151 | S'<' 152 | p68 153 | NNNI-1 154 | I-1 155 | I0 156 | tp69 157 | bI00 158 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00' 159 | p70 160 | tp71 161 | bsS'indices' 162 | p72 163 | g26 164 | (g60 165 | (I0 166 | tp73 167 | g29 168 | tp74 169 | Rp75 170 | (I1 171 | (I2 172 | tp76 173 | g67 174 | I00 175 | S'\x00\x00\x00\x00\x02\x00\x00\x00' 176 | p77 177 | tp78 178 | bsS'maxprint' 179 | p79 180 | I50 181 | sS'data' 182 | p80 183 | g26 184 | (g60 185 | (I0 186 | tp81 187 | g29 188 | tp82 189 | Rp83 190 | (I1 191 | (I2 192 | tp84 193 | g36 194 | I00 195 | S'\xaerF\xe8\x8f\x1d\xe4?"\x9a\x9a\xc7\xf7\x8f\xcc?' 196 | p85 197 | tp86 198 | bsbsbsS'_row2id' 199 | p87 200 | (dp88 201 | g9 202 | I1 203 | sg8 204 | I0 205 | ssb. -------------------------------------------------------------------------------- /src/examples/data/out/PHRASE_SS.ex10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/PHRASE_SS.ex10.pkl -------------------------------------------------------------------------------- /src/examples/data/out/ex01.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.dm: -------------------------------------------------------------------------------- 1 | car 5.0 1.0 0.0 2 | book 3.0 0.0 6.0 3 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | 
_reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'car' 17 | p8 18 | aS'book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | sS'_id2column' 36 | p17 37 | (lp18 38 | g14 39 | ag12 40 | ag13 41 | asS'_element_shape' 42 | p19 43 | (I3 44 | tp20 45 | sS'_cooccurrence_matrix' 46 | p21 47 | g0 48 | (ccomposes.matrix.sparse_matrix 49 | SparseMatrix 50 | p22 51 | g2 52 | Ntp23 53 | Rp24 54 | (dp25 55 | S'_mat' 56 | p26 57 | g0 58 | (cscipy.sparse.csr 59 | csr_matrix 60 | p27 61 | g2 62 | Ntp28 63 | Rp29 64 | (dp30 65 | S'format' 66 | p31 67 | S'csr' 68 | p32 69 | sS'_shape' 70 | p33 71 | (I2 72 | I3 73 | tp34 74 | sS'indptr' 75 | p35 76 | cnumpy.core.multiarray 77 | _reconstruct 78 | p36 79 | (cnumpy 80 | ndarray 81 | p37 82 | (I0 83 | tp38 84 | S'b' 85 | p39 86 | tp40 87 | Rp41 88 | (I1 89 | (I3 90 | tp42 91 | cnumpy 92 | dtype 93 | p43 94 | (S'i4' 95 | p44 96 | I0 97 | I1 98 | tp45 99 | Rp46 100 | (I3 101 | S'<' 102 | p47 103 | NNNI-1 104 | I-1 105 | I0 106 | tp48 107 | bI00 108 | S'\x00\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00' 109 | p49 110 | tp50 111 | bsS'indices' 112 | p51 113 | g36 114 | (g37 115 | (I0 116 | tp52 117 | g39 118 | tp53 119 | Rp54 120 | (I1 121 | (I4 122 | tp55 123 | g46 124 | I00 125 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00' 126 | p56 127 | tp57 128 | bsS'maxprint' 129 | p58 130 | I50 131 | sS'data' 132 | p59 133 | g36 134 | (g37 135 | (I0 136 | tp60 137 | g39 138 | tp61 139 | Rp62 140 | (I1 141 | (I4 142 | tp63 143 | g43 144 | (S'f8' 145 | p64 146 | I0 147 | I1 148 | tp65 149 | Rp66 150 | (I3 151 | S'<' 152 | p67 153 | NNNI-1 154 | I-1 155 | I0 156 | tp68 157 | bI00 158 | 
S'\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x18@' 159 | p69 160 | tp70 161 | bsbsbsS'_row2id' 162 | p71 163 | (dp72 164 | g8 165 | I0 166 | sg9 167 | I1 168 | ssb. -------------------------------------------------------------------------------- /src/examples/data/out/ex01.rows: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.sm: -------------------------------------------------------------------------------- 1 | car red 5.000000 2 | car blue 1.000000 3 | book red 3.000000 4 | book readable 6.000000 5 | -------------------------------------------------------------------------------- /src/examples/data/out/ex10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/ex10.pkl -------------------------------------------------------------------------------- /src/examples/data/out/model01.params: -------------------------------------------------------------------------------- 1 | alpha 1.000000 2 | beta 1.000000 -------------------------------------------------------------------------------- /src/examples/data/out/model01.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/model01.pkl -------------------------------------------------------------------------------- /src/examples/ex01.py: -------------------------------------------------------------------------------- 1 | #ex01.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | 5 | #create a space from co-occurrence counts in sparse format 6 | my_space = Space.build(data 
= "./data/in/ex01.sm", 7 | rows = "./data/in/ex01.rows", 8 | cols = "./data/in/ex01.cols", 9 | format = "sm") 10 | 11 | #export the space in sparse format 12 | my_space.export("./data/out/ex01", format = "sm") 13 | 14 | #export the space in dense format 15 | my_space.export("./data/out/ex01", format = "dm") 16 | -------------------------------------------------------------------------------- /src/examples/ex02.py: -------------------------------------------------------------------------------- 1 | #ex02.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | from composes.utils import io_utils 5 | 6 | #create a space from co-occurrence counts in sparse format 7 | my_space = Space.build(data = "./data/in/ex01.sm", 8 | rows = "./data/in/ex01.rows", 9 | cols = "./data/in/ex01.cols", 10 | format = "sm") 11 | 12 | #print the co-occurrence matrix of the space 13 | print my_space.cooccurrence_matrix 14 | 15 | #save the Space object in pickle format 16 | io_utils.save(my_space, "./data/out/ex01.pkl") 17 | 18 | #load the saved object 19 | my_space2 = io_utils.load("./data/out/ex01.pkl") 20 | 21 | #print the co-occurrence matrix of the loaded space 22 | print my_space2.cooccurrence_matrix 23 | 24 | -------------------------------------------------------------------------------- /src/examples/ex03.py: -------------------------------------------------------------------------------- 1 | #ex03.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 5 | 6 | #create a space from co-occurrence counts in sparse format 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #print the co-occurrence matrix of the space 10 | print my_space.cooccurrence_matrix 11 | 12 | #apply ppmi weighting 13 | my_space = my_space.apply(PpmiWeighting()) 14 | 15 | #print the co-occurrence matrix of the transformed space 16 | print my_space.cooccurrence_matrix 17 | 18 | 
-------------------------------------------------------------------------------- /src/examples/ex04.py: -------------------------------------------------------------------------------- 1 | #ex04.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.transformation.dim_reduction.svd import Svd 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #print the co-occurrence matrix and the columns of the space 10 | print my_space.cooccurrence_matrix 11 | print my_space.id2column 12 | 13 | #apply svd reduction 14 | my_space = my_space.apply(Svd(2)) 15 | 16 | #print the transformed space 17 | print my_space.cooccurrence_matrix 18 | print my_space.id2column 19 | -------------------------------------------------------------------------------- /src/examples/ex05.py: -------------------------------------------------------------------------------- 1 | #ex05.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.semantic_space.peripheral_space import PeripheralSpace 5 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 6 | 7 | 8 | #load a space and apply ppmi on it 9 | my_space = io_utils.load("./data/out/ex01.pkl") 10 | my_space = my_space.apply(PpmiWeighting()) 11 | 12 | print my_space.cooccurrence_matrix 13 | print my_space.id2row 14 | 15 | #create a peripheral space 16 | my_per_space = PeripheralSpace.build(my_space, 17 | data="./data/in/ex05.sm", 18 | cols="./data/in/ex05.cols", 19 | format="sm") 20 | 21 | print my_per_space.cooccurrence_matrix 22 | print my_per_space.id2row 23 | 24 | #save the space 25 | io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl") 26 | 27 | -------------------------------------------------------------------------------- /src/examples/ex06.py: -------------------------------------------------------------------------------- 1 | #ex06.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load 
a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | print my_space.cooccurrence_matrix 10 | print my_space.id2row 11 | 12 | #compute similarity between two words in the space 13 | print my_space.get_sim("car", "car", CosSimilarity()) 14 | print my_space.get_sim("car", "book", CosSimilarity()) 15 | -------------------------------------------------------------------------------- /src/examples/ex07.py: -------------------------------------------------------------------------------- 1 | #ex07.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load two spaces 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 9 | 10 | print my_space.id2row 11 | print my_per_space.id2row 12 | 13 | #compute similarity between a word and a phrase in the two spaces 14 | print my_space.get_sim("car", "sports_car", CosSimilarity(), 15 | space2 = my_per_space) 16 | -------------------------------------------------------------------------------- /src/examples/ex08.py: -------------------------------------------------------------------------------- 1 | #ex08.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #get the top 2 neighbours of "car" 10 | print my_space.get_neighbours("car", 2, CosSimilarity()) 11 | -------------------------------------------------------------------------------- /src/examples/ex09.py: -------------------------------------------------------------------------------- 1 | #ex09.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load two spaces 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 9 | 10 | print my_space.id2row 11 | print 
my_space.cooccurrence_matrix 12 | print my_per_space.id2row 13 | print my_per_space.cooccurrence_matrix 14 | 15 | #get the top two neighbours of "car" in a peripheral space 16 | print my_space.get_neighbours("car", 2, CosSimilarity(), 17 | space2 = my_per_space) 18 | 19 | -------------------------------------------------------------------------------- /src/examples/ex10.py: -------------------------------------------------------------------------------- 1 | #ex10.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex10.pkl") 8 | 9 | print my_space.id2row 10 | print my_space.cooccurrence_matrix 11 | 12 | # instantiate a weighted additive model 13 | my_comp = WeightedAdditive(alpha = 1, beta = 1) 14 | 15 | # use the model to compose words in my_space 16 | composed_space = my_comp.compose([("good", "book", "good_book"), 17 | ("good", "car", "good_car")], 18 | my_space) 19 | 20 | print composed_space.id2row 21 | print composed_space.cooccurrence_matrix 22 | 23 | #save the composed space 24 | io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl") 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/examples/ex11.py: -------------------------------------------------------------------------------- 1 | #ex11.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | # instantiate a weighted additive model 7 | my_comp = WeightedAdditive(alpha = 1, beta = 1) 8 | 9 | #save it to pickle 10 | io_utils.save(my_comp, "./data/out/model01.pkl") 11 | 12 | #print its parameters 13 | my_comp.export("./data/out/model01.params") 14 | 15 | -------------------------------------------------------------------------------- /src/examples/ex12.py: -------------------------------------------------------------------------------- 1 
| #ex12.py 2 | #------- 3 | from composes.utils import io_utils 4 | 5 | #load a previously saved weighted additive model 6 | my_comp = io_utils.load("./data/out/model01.pkl") 7 | 8 | #print its parameters 9 | print "alpha:", my_comp.alpha 10 | print "beta:", my_comp.beta 11 | 12 | #load two spaces 13 | my_space = io_utils.load("./data/out/ex10.pkl") 14 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 15 | 16 | #apply the composition model to them 17 | composed_space = my_comp.compose([("good", "history_book", "good_history_book")], 18 | (my_space, my_per_space)) 19 | 20 | print composed_space.id2row 21 | print composed_space.cooccurrence_matrix 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/examples/ex13.py: -------------------------------------------------------------------------------- 1 | #ex13.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | 7 | #training data 8 | train_data = [("good", "car", "good_car"), 9 | ("good", "book", "good_book") 10 | ] 11 | 12 | #load an argument space 13 | arg_space = io_utils.load("./data/out/ex10.pkl") 14 | print arg_space.id2row 15 | print arg_space.cooccurrence_matrix 16 | 17 | #load a phrase space 18 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 19 | print phrase_space.id2row 20 | print phrase_space.cooccurrence_matrix 21 | 22 | #train a weighted additive model on the data 23 | my_comp = WeightedAdditive() 24 | my_comp.train(train_data, arg_space, phrase_space) 25 | 26 | #print its parameters 27 | print "alpha:", my_comp.alpha 28 | print "beta:", my_comp.beta 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/examples/ex14.py: -------------------------------------------------------------------------------- 1 | #ex14.py 2 | #------- 3 | from composes.utils import io_utils 4 | from 
composes.composition.dilation import Dilation 5 | 6 | #training data 7 | train_data = [("good", "car", "good_car"), 8 | ("good", "book", "good_book") 9 | ] 10 | 11 | #load an argument space 12 | arg_space = io_utils.load("./data/out/ex10.pkl") 13 | 14 | #load a phrase space 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | print "Training phrase space" 17 | print phrase_space.id2row 18 | print phrase_space.cooccurrence_matrix 19 | 20 | #train a Dilation model on the data 21 | my_comp = Dilation() 22 | my_comp.train(train_data, arg_space, phrase_space) 23 | 24 | #print its parameters 25 | print "\nlambda:", my_comp._lambda 26 | 27 | #use the model to compose the train data 28 | composed_space = my_comp.compose([("good", "bike", "good_bike")], 29 | arg_space) 30 | print "\nComposed space:" 31 | print composed_space.id2row 32 | print composed_space.cooccurrence_matrix -------------------------------------------------------------------------------- /src/examples/ex15.py: -------------------------------------------------------------------------------- 1 | #ex15.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.full_additive import FullAdditive 5 | 6 | #training data 7 | train_data = [("good", "car", "good_car"), 8 | ("good", "book", "good_book") 9 | ] 10 | 11 | #load an argument space 12 | arg_space = io_utils.load("./data/out/ex10.pkl") 13 | 14 | #load a phrase space 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | print "Training phrase space" 17 | print phrase_space.id2row 18 | print phrase_space.cooccurrence_matrix 19 | 20 | #train a FullAdditive model on the data 21 | my_comp = FullAdditive() 22 | my_comp.train(train_data, arg_space, phrase_space) 23 | 24 | #print its parameters 25 | print "\nA:", my_comp._mat_a_t.transpose() 26 | print "B:", my_comp._mat_b_t.transpose() 27 | 28 | #use the model to compose the train data 29 | composed_space = my_comp.compose([("good", "bike", 
"good_bike")], 30 | arg_space) 31 | print "\nComposed space:" 32 | print composed_space.id2row 33 | print composed_space.cooccurrence_matrix 34 | -------------------------------------------------------------------------------- /src/examples/ex16.py: -------------------------------------------------------------------------------- 1 | #ex16.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.similarity.cos import CosSimilarity 6 | 7 | #training data 8 | #trying to learn a "good" function 9 | train_data = [("good_function", "car", "good_car"), 10 | ("good_function", "book", "good_book") 11 | ] 12 | 13 | #load argument and phrase space 14 | arg_space = io_utils.load("./data/out/ex10.pkl") 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | 17 | #train a lexical function model on the data 18 | my_comp = LexicalFunction() 19 | my_comp.train(train_data, arg_space, phrase_space) 20 | 21 | #print its parameters 22 | print "\nLexical function space:" 23 | print my_comp.function_space.id2row 24 | cooc_mat = my_comp.function_space.cooccurrence_matrix 25 | cooc_mat.reshape(my_comp.function_space.element_shape) 26 | print cooc_mat 27 | 28 | #similarity within the learned functional space 29 | print "\nSimilarity between good and good in the function space:" 30 | print my_comp.function_space.get_sim("good_function", "good_function", 31 | CosSimilarity()) -------------------------------------------------------------------------------- /src/examples/ex17.py: -------------------------------------------------------------------------------- 1 | #ex17.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.utils.regression_learner import RidgeRegressionLearner 6 | 7 | #training data 8 | #trying to learn a "good" function 9 | train_data = [("good_function", "car", "good_car"), 10 | 
("good_function", "book", "good_book") 11 | ] 12 | 13 | #load argument and phrase space 14 | arg_space = io_utils.load("./data/out/ex10.pkl") 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | 17 | print "\nDefault regression:" 18 | my_comp = LexicalFunction() 19 | print type(my_comp.regression_learner).__name__ 20 | my_comp.train(train_data, arg_space, phrase_space) 21 | 22 | #print its parameters 23 | print "Lexical function space:" 24 | print my_comp.function_space.id2row 25 | cooc_mat = my_comp.function_space.cooccurrence_matrix 26 | cooc_mat.reshape(my_comp.function_space.element_shape) 27 | print cooc_mat 28 | 29 | print "\nRidge Regression with lambda = 2" 30 | rr_learner=RidgeRegressionLearner(param = 2, 31 | intercept = False, 32 | crossvalidation=False) 33 | my_comp = LexicalFunction(learner = rr_learner) 34 | my_comp.train(train_data, arg_space, phrase_space) 35 | 36 | #print its parameters 37 | print "Lexical function space:" 38 | print my_comp.function_space.id2row 39 | cooc_mat = my_comp.function_space.cooccurrence_matrix 40 | cooc_mat.reshape(my_comp.function_space.element_shape) 41 | print cooc_mat 42 | -------------------------------------------------------------------------------- /src/examples/ex18.py: -------------------------------------------------------------------------------- 1 | #ex18.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | 6 | #training data 7 | #trying to learn a "book" function 8 | train_data = [("good_function", "car", "good_car"), 9 | ("good_function", "book", "good_book") 10 | ] 11 | 12 | #load argument and phrase space 13 | arg_space = io_utils.load("./data/out/ex10.pkl") 14 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 15 | 16 | #train a lexical function model on the data 17 | my_comp = LexicalFunction() 18 | my_comp.train(train_data, arg_space, phrase_space) 19 | 20 | #apply the trained model 21 | comp_sp1 = 
my_comp.compose([("good_function", "car", 22 | "good_car")], 23 | arg_space) 24 | 25 | #apply the trained model a second time 26 | comp_sp2 = my_comp.compose([("good_function", "good_car", 27 | "good_good_car")], 28 | comp_sp1) 29 | 30 | 31 | #print the composed spaces: 32 | print "\nComposed space 1:" 33 | print comp_sp1.id2row 34 | print comp_sp1.cooccurrence_matrix 35 | 36 | print "\nComposed space 2:" 37 | print comp_sp2.id2row 38 | print comp_sp2.cooccurrence_matrix 39 | -------------------------------------------------------------------------------- /src/examples/ex19.py: -------------------------------------------------------------------------------- 1 | #ex19.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.utils.regression_learner import LstsqRegressionLearner 6 | 7 | #training data1: VO N -> SVO 8 | train_vo_data = [("hate_boy", "man", "man_hate_boy"), 9 | ("hate_man", "man", "man_hate_man"), 10 | ("hate_boy", "boy", "boy_hate_boy"), 11 | ("hate_man", "boy", "boy_hate_man") 12 | ] 13 | 14 | #training data2: V N -> VO 15 | train_v_data = [("hate", "man", "hate_man"), 16 | ("hate", "boy", "hate_boy") 17 | ] 18 | 19 | #load N and SVO spaces 20 | n_space = Space.build(data = "./data/in/ex19-n.sm", 21 | cols = "./data/in/ex19-n.cols", 22 | format = "sm") 23 | 24 | svo_space = Space.build(data = "./data/in/ex19-svo.sm", 25 | cols = "./data/in/ex19-svo.cols", 26 | format = "sm") 27 | 28 | print "\nInput SVO training space:" 29 | print svo_space.id2row 30 | print svo_space.cooccurrence_matrix 31 | 32 | #1. train a model to learn VO functions on train data: VO N -> SVO 33 | print "\nStep 1 training" 34 | vo_model = LexicalFunction(learner=LstsqRegressionLearner()) 35 | vo_model.train(train_vo_data, n_space, svo_space) 36 | 37 | #2. 
train a model to learn V functions on train data: V N -> VO 38 | # where VO space: function space learned in step 1 39 | print "\nStep 2 training" 40 | vo_space = vo_model.function_space 41 | v_model = LexicalFunction(learner=LstsqRegressionLearner()) 42 | v_model.train(train_v_data, n_space, vo_space) 43 | 44 | #print the learned model 45 | print "\n3D Verb space" 46 | print v_model.function_space.id2row 47 | print v_model.function_space.cooccurrence_matrix 48 | 49 | 50 | #3. use the trained models to compose new SVO sentences 51 | 52 | #3.1 use the V model to create new VO combinations 53 | vo_composed_space = v_model.compose([("hate", "woman", "hate_woman"), 54 | ("hate", "man", "hate_man")], 55 | n_space) 56 | 57 | #3.2 the new VO combinations will be used as functions: 58 | # load the new VO combinations obtained through composition into 59 | # a new composition model 60 | expanded_vo_model = LexicalFunction(function_space=vo_composed_space, 61 | intercept=v_model._has_intercept) 62 | 63 | #3.3 use the new VO combinations by composing them with subject nouns 64 | # in order to obtain new SVO sentences 65 | svo_composed_space = expanded_vo_model.compose([("hate_woman", "woman", "woman_hates_woman"), 66 | ("hate_man", "man", "man_hates_man")], 67 | n_space) 68 | 69 | #print the composed spaces: 70 | print "\nVO composed space:" 71 | print vo_composed_space.id2row 72 | print vo_composed_space.cooccurrence_matrix 73 | 74 | #print the composed spaces: 75 | print "\nSVO composed space:" 76 | print svo_composed_space.id2row 77 | print svo_composed_space.cooccurrence_matrix 78 | 79 | -------------------------------------------------------------------------------- /src/examples/ex20.py: -------------------------------------------------------------------------------- 1 | #ex20.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.utils import scoring_utils 5 | from composes.similarity.cos import CosSimilarity 6 | 7 | #read in a space 8 | my_space = 
io_utils.load("data/out/ex01.pkl") 9 | 10 | #compute similarities of a list of word pairs 11 | fname = "data/in/word_sims.txt" 12 | word_pairs = io_utils.read_tuple_list(fname, fields=[0,1]) 13 | predicted = my_space.get_sims(word_pairs, CosSimilarity()) 14 | 15 | #compute correlations 16 | gold = io_utils.read_list(fname, field=2) 17 | print "Spearman" 18 | print scoring_utils.score(gold, predicted, "spearman") 19 | print "Pearson" 20 | print scoring_utils.score(gold, predicted, "pearson") -------------------------------------------------------------------------------- /src/examples/exercise.sh: -------------------------------------------------------------------------------- 1 | # set pythonpath 2 | export PYTHONPATH=/home/thenghia.pham/git/toolkit/src:$PYTHONPATH 3 | export TOOLKIT_DIR=/home/thenghia.pham/git/toolkit 4 | export OUT_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/data/tutorial 5 | export DATA_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial 6 | export LOG_FILE=$OUT_DIR/log/exercises.log 7 | 8 | #************************************************************************************** 9 | echo Step1 10 | echo STARTING BUILDING CORE 11 | export CORE_IN_FILE_PREFIX=CORE_SS.verbnoun.core 12 | export CORE_OUT_DIR=$OUT_DIR/core 13 | 14 | # run build core space pipeline 15 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_core_space.py -i $DATA_DIR/$CORE_IN_FILE_PREFIX --input_format=pkl -o $CORE_OUT_DIR -w ppmi -s top_sum_2000 -r svd_100 --output_format=dm -l $LOG_FILE 16 | 17 | echo FINISHED BUILDING CORE 18 | 19 | #************************************************************************************** 20 | echo Step2 21 | echo STARTING PERIPHERAL PIPELINE 22 | export CORE_SPC=CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl 23 | 24 | export PER_RAW_FILE=$DATA_DIR/per.raw.SV 25 | export PER_OUT_DIR=$OUT_DIR/per 26 | 27 | # run build peripheral space pipeline 28 | /opt/python/bin/python2.7 
# src/examples/exercise.sh
# End-to-end DISSECT tutorial pipeline: build core space, build peripheral
# space, train a composition model, compose phrases, compute and evaluate
# similarities.

# set pythonpath
export PYTHONPATH=/home/thenghia.pham/git/toolkit/src:$PYTHONPATH
export TOOLKIT_DIR=/home/thenghia.pham/git/toolkit
export OUT_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/data/tutorial
export DATA_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial
export LOG_FILE=$OUT_DIR/log/exercises.log

#**************************************************************************************
echo Step1
echo STARTING BUILDING CORE
export CORE_IN_FILE_PREFIX=CORE_SS.verbnoun.core
export CORE_OUT_DIR=$OUT_DIR/core

# run build core space pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_core_space.py -i $DATA_DIR/$CORE_IN_FILE_PREFIX --input_format=pkl -o $CORE_OUT_DIR -w ppmi -s top_sum_2000 -r svd_100 --output_format=dm -l $LOG_FILE

echo FINISHED BUILDING CORE

#**************************************************************************************
echo Step2
echo STARTING PERIPHERAL PIPELINE
export CORE_SPC=CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl

export PER_RAW_FILE=$DATA_DIR/per.raw.SV
export PER_OUT_DIR=$OUT_DIR/per

# run build peripheral space pipeline
# BUGFIX: "dm" was passed as a stray positional argument; it is the value
# of --output_format (cf. the build_core_space invocation above).
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_peripheral_space.py -i $PER_RAW_FILE --input_format sm -c $CORE_OUT_DIR/$CORE_SPC -o $PER_OUT_DIR --output_format dm -l $LOG_FILE

echo FINISHED PERIPHERAL PIPELINE

#**************************************************************************************
echo step3
echo STARTING TRAINING

export MODEL_DIR=$OUT_DIR/trained
export TRAIN_FILE=$DATA_DIR/ML08_SV_train.txt
export PER_SPC=PER_SS.per.raw.SV.CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl
export MODEL=lexical_func

# run training pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/train_composition.py -i $TRAIN_FILE -m $MODEL -o $MODEL_DIR -a $CORE_OUT_DIR/$CORE_SPC -p $PER_OUT_DIR/$PER_SPC --regression ridge --intercept True --crossvalidation False --lambda 2.0 -l $LOG_FILE

echo FINISHED TRAINING
#**************************************************************************************
echo step 4
echo STARTING COMPOSING SPACE

export TRNED_MODEL=TRAINED_COMP_MODEL.lexical_func.ML08_SV_train.txt.pkl
export COMP_DIR=$OUT_DIR/composed
export COMP_FILE=$DATA_DIR/ML08nvs_test.txt

# run apply composition pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/apply_composition.py -i $COMP_FILE --load_model $MODEL_DIR/$TRNED_MODEL -o $COMP_DIR -a $CORE_OUT_DIR/$CORE_SPC -l $LOG_FILE

echo FINISHED COMPOSING SPACE
#**************************************************************************************
echo step 5
echo STARTING COMPUTING SIMS

export COMP_SPC=COMPOSED_SS.LexicalFunction.ML08nvs_test.txt.pkl
export SIM_DIR=$OUT_DIR/similarity
export TEST_FILE=$DATA_DIR/ML08data_new.txt

# create output directory for similarity if the directory doesn't exist
if [ ! -d "$SIM_DIR" ]; then
    mkdir $SIM_DIR
fi

# run sim pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/compute_similarities.py -i $TEST_FILE -s $COMP_DIR/$COMP_SPC -o $SIM_DIR -m cos,lin,dot_prod,euclidean -c 1,2 -l $LOG_FILE

echo FINISH COMPUTE SIMS
#**************************************************************************************
echo step 6
echo STARTING EVAL SIMS

# run evaluation pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/evaluate_similarities.py --in_dir $SIM_DIR -m spearman,pearson -c 3,4 -l $LOG_FILE
echo FINISH EVAL SIMS
-d "$SIM_DIR" ]; then 67 | mkdir $SIM_DIR 68 | fi 69 | 70 | # run sim pipeline 71 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/compute_similarities.py -i $TEST_FILE -s $COMP_DIR/$COMP_SPC -o $SIM_DIR -m cos,lin,dot_prod,euclidean -c 1,2 -l $LOG_FILE 72 | 73 | echo FINISH COMPUTE SIMS 74 | #************************************************************************************** 75 | echo step 6 76 | echo STARTING EVAL SIMS 77 | 78 | # run evaluation pipeline 79 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/evaluate_similarities.py --in_dir $SIM_DIR -m spearman,pearson -c 3,4 -l $LOG_FILE 80 | echo FINISH EVAL SIMS 81 | -------------------------------------------------------------------------------- /src/examples/full_example.py: -------------------------------------------------------------------------------- 1 | from composes.similarity.cos import CosSimilarity 2 | from composes.semantic_space.peripheral_space import PeripheralSpace 3 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 4 | from composes.transformation.dim_reduction.svd import Svd 5 | from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection 6 | from composes.composition.lexical_function import LexicalFunction 7 | from composes.composition.full_additive import FullAdditive 8 | from composes.composition.weighted_additive import WeightedAdditive 9 | from composes.composition.multiplicative import Multiplicative 10 | from composes.composition.dilation import Dilation 11 | from composes.utils.regression_learner import RidgeRegressionLearner 12 | 13 | import composes.utils.io_utils as io_utils 14 | import composes.utils.scoring_utils as scoring_utils 15 | 16 | #load a core space 17 | print "Loading the data..." 18 | data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/" 19 | 20 | space_file = data_path + "CORE_SS.verbnoun.core.pkl" 21 | space = io_utils.load(space_file) 22 | 23 | print "Applying PPMI..." 
24 | space = space.apply(PpmiWeighting()) 25 | 26 | print "Applying feature selection..." 27 | space = space.apply(TopFeatureSelection(2000)) 28 | 29 | print "Applying SVD..." 30 | space = space.apply(Svd(100)) 31 | 32 | print "Creating peripheral space.." 33 | per_space = PeripheralSpace.build(space, 34 | data = data_path + "per.raw.SV.sm", 35 | cols = data_path + "per.raw.SV.cols", 36 | format = "sm" 37 | ) 38 | 39 | #reading in train data 40 | train_data_file = data_path + "ML08_SV_train.txt" 41 | train_data = io_utils.read_tuple_list(train_data_file, fields=[0,1,2]) 42 | 43 | print "Training Lexical Function composition model..." 44 | comp_model = LexicalFunction(learner = RidgeRegressionLearner(param=2)) 45 | comp_model.train(train_data, space, per_space) 46 | 47 | print "Composing phrases..." 48 | test_phrases_file = data_path + "ML08nvs_test.txt" 49 | test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2]) 50 | composed_space = comp_model.compose(test_phrases, space) 51 | 52 | print "Reading similarity test data..." 53 | test_similarity_file = data_path + "ML08data_new.txt" 54 | test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1]) 55 | gold = io_utils.read_list(test_similarity_file, field=2) 56 | 57 | print "Computing similarity with lexical function..." 58 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 59 | 60 | #use this composed space to assign similarities 61 | print "Scoring lexical function..." 62 | print scoring_utils.score(gold, pred, "spearman") 63 | 64 | 65 | print "Training Full Additive composition model..." 66 | comp_model = FullAdditive(learner = RidgeRegressionLearner(param=2)) 67 | comp_model.train(train_data, space, per_space) 68 | composed_space = comp_model.compose(test_phrases, space) 69 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 70 | print scoring_utils.score(gold, pred, "spearman") 71 | 72 | print "Training Weighted Additive composition model..." 
73 | comp_model = WeightedAdditive() 74 | comp_model.train(train_data, space, per_space) 75 | print "alpha, beta:", comp_model.alpha, comp_model.beta 76 | composed_space = comp_model.compose(test_phrases, space) 77 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 78 | print scoring_utils.score(gold, pred, "spearman") 79 | 80 | print "Training Dilation composition model..." 81 | comp_model = Dilation() 82 | comp_model.train(train_data, space, per_space) 83 | print "lambda:", comp_model._lambda 84 | composed_space = comp_model.compose(test_phrases, space) 85 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 86 | print scoring_utils.score(gold, pred, "spearman") 87 | 88 | print "Multiplicative composition model..." 89 | comp_model = Multiplicative() 90 | composed_space = comp_model.compose(test_phrases, space) 91 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 92 | print scoring_utils.score(gold, pred, "spearman") 93 | 94 | print "Simple additive composition model..." 95 | comp_model = WeightedAdditive(1,1) 96 | composed_space = comp_model.compose(test_phrases, space) 97 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 98 | print scoring_utils.score(gold, pred, "spearman") 99 | 100 | print "Simple dilation composition model..." 
101 | comp_model = Dilation() 102 | composed_space = comp_model.compose(test_phrases, space) 103 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 104 | print scoring_utils.score(gold, pred, "spearman") 105 | -------------------------------------------------------------------------------- /src/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/pipelines/__init__.py -------------------------------------------------------------------------------- /src/pipelines/compute_neighbours.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | ''' 7 | Created on Oct 17, 2012 8 | 9 | @author: Georgiana Dinu, Pham The Nghia 10 | ''' 11 | 12 | ''' 13 | Created on Jun 12, 2012 14 | 15 | @author: thenghia.pham 16 | ''' 17 | 18 | 19 | import sys 20 | import getopt 21 | from ConfigParser import ConfigParser 22 | from composes.semantic_space.space import Space 23 | from composes.similarity.cos import CosSimilarity 24 | from composes.similarity.lin import LinSimilarity 25 | from composes.similarity.dot_prod import DotProdSimilarity 26 | from composes.similarity.euclidean import EuclideanSimilarity 27 | from composes.utils import io_utils 28 | from composes.utils import log_utils 29 | import pipeline_utils as utils 30 | import logging 31 | logger = logging.getLogger("test vector space construction pipeline") 32 | 33 | 34 | 35 | def usage(errno=0): 36 | print >>sys.stderr,\ 37 | """Usage: 38 | python compute_similarities.py [options] [config_file] 39 | 40 | Options: 41 | -i --input : input file. 42 | -o --output : output directory. 43 | -s --space : file of semantic space. The second 44 | word of a word pair is interpreted in the second space argument, 45 | if provided. 
46 | -m --sim_measure : similarity measure 47 | -n --no_neighbours : number of neighbours to be returned 48 | -l --log : log file. Optional. 49 | -h --help : help 50 | 51 | Arguments: 52 | config_file: , used as default values for configuration options above. 53 | If you don't specify these options in [options] the value from the 54 | config_file will be used. 55 | 56 | Example: 57 | """ 58 | sys.exit(errno) 59 | 60 | 61 | def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files): 62 | sim_dict = {"cos": CosSimilarity(), 63 | "lin": LinSimilarity(), 64 | "dot_prod": DotProdSimilarity(), 65 | "euclidean": EuclideanSimilarity()} 66 | 67 | if not sim_measure in sim_dict: 68 | raise ValueError("Similarity measure:%s not defined" % sim_measure) 69 | 70 | space = io_utils.load(space_files[0], Space) 71 | space2 = None 72 | space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1]) 73 | if len(space_files) == 2: 74 | space2 = io_utils.load(space_files[1], Space) 75 | space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1]) 76 | 77 | sim = sim_dict[sim_measure] 78 | 79 | descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr]) 80 | out_file = '%s/%s.%s' % (out_dir, descr, sim_measure) 81 | io_utils.create_parent_directories(out_file) 82 | 83 | data = io_utils.read_list(in_file) 84 | 85 | print "Computing neighbours: %s" % sim_measure 86 | with open(out_file,"w") as out_stream: 87 | for word in data: 88 | out_stream.write("%s\n" % word) 89 | result = space.get_neighbours(word, no_neighbours, sim, space2) 90 | for neighbour, neighbour_sim in result: 91 | out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim)) 92 | 93 | def main(sys_argv): 94 | try: 95 | opts, argv = getopt.getopt(sys_argv[1:], "hi:o:s:m:n:l:", 96 | ["help", "input=", "output=", "sim_measures=", 97 | "space=", "log=", "no_neighbours="]) 98 | except getopt.GetoptError, err: 99 | print str(err) 100 | usage() 101 | sys.exit(1) 102 
| 103 | section = "compute_neighbours" 104 | 105 | out_dir = None 106 | in_file = None 107 | sim_measure = None 108 | spaces = None 109 | log_file = None 110 | no_neighbours = "20" 111 | 112 | 113 | if (len(argv) == 1): 114 | config_file = argv[0] 115 | with open(config_file) as f: 116 | pass 117 | config = ConfigParser() 118 | config.read(config_file) 119 | out_dir = utils.config_get(section, config, "output", None) 120 | in_file = utils.config_get(section, config, "input", None) 121 | sim_measure = utils.config_get(section, config, "sim_measure", None) 122 | spaces = utils.config_get(section, config, "space", None) 123 | if not spaces is None: 124 | spaces = spaces.split(",") 125 | no_neighbours = utils.config_get(section, config, "no_neighbours", no_neighbours) 126 | log_file = utils.config_get(section, config, "log", None) 127 | 128 | for opt, val in opts: 129 | if opt in ("-i", "--input"): 130 | in_file = val 131 | elif opt in ("-o", "--output"): 132 | out_dir = val 133 | elif opt in ("-m", "--sim_measure"): 134 | sim_measure = val 135 | elif opt in ("-s", "--space"): 136 | spaces = val.split(",") 137 | elif opt in ("-n", "--no_neighbours"): 138 | no_neighbours = val 139 | elif opt in ("-l", "--log"): 140 | log_file = val 141 | elif opt in ("-h", "--help"): 142 | usage() 143 | sys.exit(0) 144 | else: 145 | usage(1) 146 | 147 | log_utils.config_logging(log_file) 148 | 149 | no_neighbours = int(no_neighbours) 150 | 151 | utils.assert_option_not_none(in_file, "Input file required", usage) 152 | utils.assert_option_not_none(out_dir, "Output directory required", usage) 153 | utils.assert_option_not_none(sim_measure, "Similarity measure required", usage) 154 | utils.assert_option_not_none(spaces, "Semantic space file required", usage) 155 | 156 | compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, spaces) 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | main(sys.argv) 
-------------------------------------------------------------------------------- /src/pipelines/evaluate_similarities.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | ''' 7 | Created on Oct 17, 2012 8 | 9 | @author: Georgiana Dinu, Pham The Nghia 10 | ''' 11 | 12 | ''' 13 | Created on Jun 12, 2012 14 | 15 | @author: thenghia.pham 16 | ''' 17 | 18 | 19 | import sys 20 | import getopt 21 | import os 22 | from ConfigParser import ConfigParser 23 | from composes.utils import scoring_utils 24 | from composes.utils import log_utils 25 | import pipeline_utils as utils 26 | 27 | import logging 28 | logger = logging.getLogger("test vector space construction pipeline") 29 | 30 | 31 | 32 | def usage(errno=0): 33 | print >>sys.stderr,\ 34 | """Usage: 35 | python compute_similarities.py [options] [config_file] 36 | 37 | Options: 38 | -i --input : input file. 39 | --in_dir: : input directory, all files that pass the --filter are tested. 40 | -i value is ignored. Optional. 41 | --filter: : when --in_dir, it acts as a filter on the files to be tested: 42 | only files containing this substring are tested. Optional, 43 | default all files in in_dir are tested. 44 | -m --correlation_measure : comma-separated correlation measures 45 | -c --columns <(int,int)>: pair of columns, indicating which columns contain 46 | the words to be compared 47 | -l --log : log file. Optional, default ./build_core_space.log 48 | -h --help : help 49 | 50 | Arguments: 51 | config_file: , used as default values for configuration options above. 52 | If you don't specify these options in [options] the value from the 53 | config_file will be used. 
54 | 55 | Example: 56 | """ 57 | sys.exit(errno) 58 | 59 | def evaluate_sim(in_file, columns, corr_measures): 60 | 61 | if not len(columns) == 2: 62 | raise ValueError("Column description unrecognized!") 63 | col0 = int(columns[0]) - 1 64 | col1 = int(columns[1]) - 1 65 | 66 | gold = [] 67 | prediction = [] 68 | with open(in_file) as in_stream: 69 | for line in in_stream: 70 | if not line.strip() == "": 71 | elems = line.strip().split() 72 | gold.append(float(elems[col0])) 73 | prediction.append(float(elems[col1])) 74 | 75 | for corr_measure in corr_measures: 76 | print "CORRELATION:%s" % corr_measure 77 | corr = scoring_utils.score(gold, prediction, corr_measure) 78 | print "\t%f" % corr 79 | 80 | 81 | def evaluate_sim_batch(in_dir, columns, corr_measures, filter_=""): 82 | 83 | if not os.path.exists(in_dir): 84 | raise ValueError("Input directory not found: %s" % in_dir) 85 | 86 | if not in_dir.endswith("/"): 87 | in_dir = in_dir + "/" 88 | 89 | for file_ in os.listdir(in_dir): 90 | if file_.find(filter_) != -1: 91 | print file_ 92 | evaluate_sim(in_dir + file_, columns, corr_measures) 93 | 94 | 95 | def main(sys_argv): 96 | try: 97 | opts, argv = getopt.getopt(sys_argv[1:], "hi:m:c:l:", 98 | ["help", "input=", "correlation_measure=", 99 | "columns=", "log=", "in_dir=", "filter="]) 100 | 101 | except getopt.GetoptError, err: 102 | print str(err) 103 | usage() 104 | sys.exit(1) 105 | 106 | in_file = None 107 | in_dir = None 108 | filter_ = "" 109 | corr_measures = None 110 | columns = None 111 | log_file = None 112 | 113 | section = "evaluate_similarities" 114 | 115 | if (len(argv) == 1): 116 | config_file = argv[0] 117 | config = ConfigParser() 118 | config.read(config_file) 119 | in_file = utils.config_get(section, config, "input", None) 120 | in_dir = utils.config_get(section, config, "in_dir", None) 121 | filter_ = utils.config_get(section, config, "filter", filter_) 122 | corr_measures = utils.config_get(section, config, "correlation_measure", None) 123 | if 
not corr_measures is None: 124 | corr_measures = corr_measures.split(",") 125 | columns = utils.config_get(section, config, "columns", None) 126 | if not columns is None: 127 | columns = columns.split(",") 128 | log_file = utils.config_get(section, config, "log", None) 129 | 130 | for opt, val in opts: 131 | if opt in ("-i", "--input"): 132 | in_file = val 133 | elif opt in ("-m", "--correlation_measure"): 134 | corr_measures = val.split(",") 135 | elif opt in ("-c", "--columns"): 136 | columns = val.split(",") 137 | elif opt == "--in_dir": 138 | in_dir = val 139 | elif opt == "--filter": 140 | filter_ = val 141 | elif opt in ("-l", "--log"): 142 | log_file = val 143 | elif opt in ("-h", "--help"): 144 | usage() 145 | sys.exit(0) 146 | else: 147 | usage(1) 148 | 149 | log_utils.config_logging(log_file) 150 | 151 | utils.assert_option_not_none(corr_measures, "Correlation measures required", usage) 152 | utils.assert_option_not_none(columns, "Columns to be read from input file required", usage) 153 | 154 | if len(columns) != 2: 155 | raise ValueError("Columns (-c) field should contain two comma-separated integers (e.g. 
-c 3,4)") 156 | 157 | if not in_dir is None: 158 | evaluate_sim_batch(in_dir, columns, corr_measures, filter_) 159 | else: 160 | utils.assert_option_not_none(in_file, "Input file required", usage) 161 | evaluate_sim(in_file, columns, corr_measures) 162 | 163 | if __name__ == '__main__': 164 | main(sys.argv) 165 | -------------------------------------------------------------------------------- /src/pipelines/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | def assert_bool(option, message, usage): 8 | if option not in (True, False): 9 | print message 10 | usage(1) 11 | 12 | def assert_option_not_none(option, message, usage): 13 | if option is None: 14 | print message 15 | usage(1) 16 | 17 | def assert_xor_options(option1, option2, message, usage): 18 | if not ((option1 is None) ^ (option2 is None)): 19 | print message 20 | usage(1) 21 | 22 | def config_get(section, config, option, default): 23 | return config.get(section, option) if config.has_option(section, option) else default 24 | -------------------------------------------------------------------------------- /src/unitest/__init__.py: -------------------------------------------------------------------------------- 1 | current_file = __file__ 2 | toolkit_dir = "/".join(current_file.split("/")[0:-3]) 3 | data_dir = toolkit_dir + "/resource/unittest/" -------------------------------------------------------------------------------- /src/unitest/bps_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | 8 | import numpy as np 9 | 10 | from pipelines import build_peripheral_space as bps 11 | from pipelines import build_core_space as bcs 12 | from composes.semantic_space.space import Space 13 | 14 | from unitest 
import data_dir 15 | import pytest 16 | 17 | 18 | class Test(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.dir_ = data_dir + "pipelines_test_resources/" 22 | 23 | def _test_equal_spaces_structs(self, sp, new_sp): 24 | self.assertListEqual(sp.id2row, new_sp.id2row) 25 | self.assertListEqual(sp.id2column, new_sp.id2column) 26 | self.assertDictEqual(sp.row2id, new_sp.row2id) 27 | self.assertDictEqual(sp.column2id, new_sp.column2id) 28 | 29 | def _test_equal_spaces_dense(self, sp, new_sp): 30 | 31 | self._test_equal_spaces_structs(sp, new_sp) 32 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat, new_sp.cooccurrence_matrix.mat, 6) 33 | 34 | def _test_equal_spaces_sparse(self, sp, new_sp): 35 | 36 | self._test_equal_spaces_structs(sp, new_sp) 37 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat.todense(), new_sp.cooccurrence_matrix.mat.todense(), 6) 38 | 39 | def test_raises(self): 40 | with pytest.raises(SystemExit): 41 | bps.main(["build_peripheral_space.py", "-h"]) 42 | 43 | with pytest.raises(SystemExit): 44 | bps.main([ 45 | "build_peripheral_space.py", 46 | "-l", '/tmp/test_build_peripheral_space.log', 47 | "-h", 48 | ]) 49 | 50 | def tttest_simple_sparse_batch(self): 51 | 52 | bps.main(["build_peripheral_space.py", 53 | "-l", self.dir_ + "log1.txt", 54 | "-i", self.dir_ + "mat1", 55 | "-o", self.dir_, 56 | "--core_in_dir", self.dir_, 57 | "--core_filter", "CORE_SS.mat1.pkl", 58 | "--input_format", "sm", 59 | "--output_format", "sm" 60 | ]) 61 | 62 | s1 = Space.build(data=self.dir_ + "mat1.sm", 63 | cols=self.dir_ + "mat1.cols", 64 | format="sm") 65 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm", 66 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols", 67 | format="sm") 68 | s3 = Space.build(data=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.sm", 69 | cols=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.cols", 70 | format="sm") 71 | 72 | self._test_equal_spaces_sparse(s1, s2) 73 | 
self._test_equal_spaces_sparse(s1, s3) 74 | 75 | def test_simple_sparse(self): 76 | 77 | bps.main(["build_peripheral_space.py", 78 | "-l", self.dir_ + "log1.txt", 79 | "-i", self.dir_ + "mat1", 80 | "-o", self.dir_, 81 | "-c", self.dir_ + "CORE_SS.mat1.pkl", 82 | "--input_format", "sm", 83 | "--output_format", "sm" 84 | ]) 85 | 86 | s1 = Space.build(data=self.dir_ + "mat1.sm", 87 | cols=self.dir_ + "mat1.cols", 88 | format="sm") 89 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm", 90 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols", 91 | format="sm") 92 | 93 | self._test_equal_spaces_sparse(s1, s2) 94 | 95 | def test_simple_dense(self): 96 | bps.main(["build_peripheral_space.py", 97 | "-l", self.dir_ + "log1.txt", 98 | "-i", self.dir_ + "mat2", 99 | "-o", self.dir_, 100 | "-c", self.dir_ + "CORE_SS.mat2.pkl", 101 | "--input_format", "dm", 102 | "--output_format", "dm" 103 | ]) 104 | s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm") 105 | s2 = Space.build(data=self.dir_ + "PER_SS.mat2.CORE_SS.mat2.dm", format="dm") 106 | 107 | self._test_equal_spaces_dense(s1, s2) 108 | 109 | def test_simple_ops(self): 110 | 111 | bcs.main(["build_core_space.py", 112 | "-l", self.dir_ + "log1.txt", 113 | "-i", self.dir_ + "mat3", 114 | "-w", "raw", 115 | "-s", "top_sum_3,top_length_3,top_sum_4", 116 | "-r", "svd_2,svd_1", 117 | "-o", self.dir_, 118 | "--input_format", "dm", 119 | "--output_format", "dm" 120 | ]) 121 | 122 | core_mats = ["CORE_SS.mat3.raw.top_sum_3.svd_2", 123 | "CORE_SS.mat3.raw.top_sum_3.svd_1", 124 | "CORE_SS.mat3.raw.top_length_3.svd_2", 125 | "CORE_SS.mat3.raw.top_length_3.svd_1", 126 | "CORE_SS.mat3.raw.top_sum_4.svd_2", 127 | "CORE_SS.mat3.raw.top_sum_4.svd_1" 128 | ] 129 | 130 | core_spaces = [Space.build(data=self.dir_ + suffix + ".dm", format="dm") for suffix in core_mats] 131 | 132 | for i, core_mat in enumerate(core_mats): 133 | bps.main(["build_peripheral_space.py", 134 | "-l", self.dir_ + "log1.txt", 135 | "-i", self.dir_ + 
import py
import pytest


@pytest.fixture
def toolkit_dir():
    # Repository root: two levels up from src/unitest/conftest.py.
    return py.path.local(__file__).dirpath().join('..', '..')


@pytest.fixture
def data_dir(toolkit_dir):
    # Read-only unit-test resources shipped with the repository.
    return toolkit_dir.join('resource', 'unittest')


@pytest.fixture
def config_dir(tmpdir):
    # Fresh per-test directory for generated configuration files.
    return tmpdir.mkdir('config')


@pytest.fixture
def pipelines_test_resources(data_dir):
    # Input fixtures used by the pipeline tests.
    return data_dir.join('pipelines_test_resources')


@pytest.fixture
def sim_input(pipelines_test_resources):
    # Path (as str) to the similarity-pipeline input file.
    return str(pipelines_test_resources.join('sim_input.txt'))
class Test(unittest.TestCase):
    """Tests for composes.utils.crossvalidation_utils.get_split_indices."""

    # FIX: method was misspelled "test_get_split_indicec"; renamed so the
    # intent is clear (still discovered by unittest via the test_ prefix).
    def test_get_split_indices(self):
        """Fold count and fold sizes respect the requested split."""
        # (range_, fold, max_len): max_len is the largest allowed fold size.
        test_cases = [(10, 3, 4), (9, 10, 1), (10, 10, 1), (109, 10, 11), (1, 1, 1)]

        for range_, fold, max_len in test_cases:

            indices = get_split_indices(range_, fold)
            # Never more folds than requested.
            self.assertGreaterEqual(fold, len(indices))

            for chunk in indices:
                # Each fold holds at least range_ // fold items, unless
                # there are fewer items than requested folds.
                self.assertTrue(len(chunk) >= range_ // fold or fold >= range_)
                self.assertGreaterEqual(max_len, len(chunk))

        # Remainder items go to the earliest folds: 10 over 3 folds -> 4,3,3.
        indices = get_split_indices(10, 3)
        self.assertEqual(len(indices[0]), 4)
        self.assertEqual(len(indices[1]), 3)
        self.assertEqual(len(indices[2]), 3)

if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.test_get_split_indices']
    unittest.main()
39 | csr_matrix(nparr), 40 | csc_matrix(nparr), 41 | SparseMatrix(nparr)] 42 | 43 | for inmat in test_cases: 44 | outmat = DenseMatrix(inmat) 45 | self.assertIsInstance(outmat.mat, np.matrix) 46 | numpy.testing.assert_array_equal(nparr, np.array(outmat.mat)) 47 | 48 | 49 | def test_add(self): 50 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 51 | (self.matrix_a, self.matrix_b, self.matrix_a.mat) 52 | ] 53 | 54 | 55 | for (term1, term2, expected) in test_cases: 56 | sum_ = term1 + term2 57 | numpy.testing.assert_array_equal(sum_.mat, expected) 58 | self.assertIsInstance(sum_, type(term1)) 59 | 60 | def test_add_raises(self): 61 | test_cases = [(self.matrix_a, self.a), 62 | (self.matrix_a, SparseMatrix(self.a))] 63 | 64 | for (term1, term2) in test_cases: 65 | self.assertRaises(TypeError, term1.__add__, term2) 66 | 67 | def test_div(self): 68 | test_cases = [(self.matrix_a, 2, np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])), 69 | (self.matrix_c, 2, np.mat(self.c)) 70 | ] 71 | 72 | for (term1, term2, expected) in test_cases: 73 | sum_ = term1 / term2 74 | numpy.testing.assert_array_equal(sum_.mat, expected) 75 | self.assertIsInstance(sum_, DenseMatrix) 76 | 77 | def test_div_raises(self): 78 | test_cases = [(self.matrix_a, self.a, TypeError), 79 | (self.matrix_a, SparseMatrix(self.a), TypeError), 80 | (self.matrix_a, "3", TypeError), 81 | (self.matrix_a, 0, ZeroDivisionError) 82 | ] 83 | 84 | for (term1, term2, error_type) in test_cases: 85 | self.assertRaises(error_type, term1.__div__, term2) 86 | 87 | 88 | def test_mul(self): 89 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])), 90 | (self.matrix_d, self.matrix_a, self.matrix_a.mat), 91 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])), 92 | (2, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 93 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]])), 94 | (np.int64(2), self.matrix_a, np.mat([[2,4,6],[8,0,10]])) 95 | ] 96 | 97 | for (term1, term2, expected) in test_cases: 98 | 
sum_ = term1 * term2 99 | numpy.testing.assert_array_equal(sum_.mat, expected) 100 | self.assertIsInstance(sum_, DenseMatrix) 101 | 102 | def test_mul_raises(self): 103 | test_cases = [(self.matrix_a, self.a), 104 | (self.matrix_a, SparseMatrix(self.a)), 105 | (self.matrix_a, "3"), 106 | ("3", self.matrix_a)] 107 | 108 | for (term1, term2) in test_cases: 109 | self.assertRaises(TypeError, term1.__mul__, term2) 110 | 111 | def test_multiply(self): 112 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[1,4,9],[16,0,25]])), 113 | (self.matrix_a, self.matrix_b, np.mat(self.b)) 114 | ] 115 | 116 | for (term1, term2, expected) in test_cases: 117 | mult1 = term1.multiply(term2) 118 | mult2 = term2.multiply(term1) 119 | 120 | numpy.testing.assert_array_equal(mult1.mat, expected) 121 | numpy.testing.assert_array_equal(mult2.mat, expected) 122 | 123 | self.assertIsInstance(mult1, DenseMatrix) 124 | self.assertIsInstance(mult2, DenseMatrix) 125 | 126 | def test_multiply_raises(self): 127 | 128 | test_cases = [(self.matrix_a, self.matrix_d, ValueError), 129 | (self.matrix_a, self.a, TypeError), 130 | (self.matrix_a, SparseMatrix(self.a), TypeError), 131 | ] 132 | 133 | for (term1, term2, error_type) in test_cases: 134 | self.assertRaises(error_type, term1.multiply, term2) 135 | 136 | def test_scale_rows(self): 137 | outcome = np.mat([[1,2,3],[40,0,50]]) 138 | test_cases = [(self.matrix_a, self.e, outcome), 139 | (self.matrix_a, np.mat(self.e).T, outcome), 140 | ] 141 | 142 | for (term1, term2, expected) in test_cases: 143 | term1 = term1.scale_rows(term2) 144 | numpy.testing.assert_array_equal(term1.mat, expected) 145 | 146 | def test_scale_columns(self): 147 | test_cases = [(self.matrix_a, self.f, np.mat([[1,20,300],[4,0,500]]))] 148 | 149 | for (term1, term2, expected) in test_cases: 150 | term1 = term1.scale_columns(term2) 151 | numpy.testing.assert_array_equal(term1.mat, expected) 152 | 153 | 154 | def test_scale_raises(self): 155 | test_cases = [(self.matrix_a, 
self.f, ValueError, self.matrix_a.scale_rows), 156 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns), 157 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows), 158 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns), 159 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows), 160 | ] 161 | for (term1, term2, error_type, function) in test_cases: 162 | self.assertRaises(error_type, function, term2) 163 | 164 | 165 | def test_plog(self): 166 | m = DenseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])) 167 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]]) 168 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]]) 169 | test_cases = [(self.matrix_a.copy(), a_expected), 170 | (m, m_expected) 171 | ] 172 | 173 | for (term, expected) in test_cases: 174 | term.plog() 175 | numpy.testing.assert_array_almost_equal(term.mat, expected, 3) 176 | 177 | if __name__ == "__main__": 178 | #import sys;sys.argv = ['', 'Test.testName'] 179 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/dilation_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.matrix.dense_matrix import DenseMatrix 9 | #from composes.composition.dilation_1 import DilationModel 10 | from composes.composition.dilation import Dilation 11 | 12 | class Test(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.m11 = DenseMatrix(np.mat([[4],[2]])) 16 | self.m21 = DenseMatrix(np.mat([[3],[6]])) 17 | #self.ph1 = DenseMatrix(np.mat([[5],[10]])) 18 | self.ph1 = DenseMatrix(np.mat([[80],[40]])) 19 | 20 | self.m12 = DenseMatrix(np.mat([[2,0],[3,0]])) 21 | self.m22 = DenseMatrix(np.mat([[3,3],[6,4]])) 22 | #self.ph2 = DenseMatrix(np.mat([[5,2],[10,5]])) 23 | self.ph2 = DenseMatrix(np.mat([[20,8],[90,45]])) 24 | 25 | self.m13 = 
DenseMatrix(np.mat([[4,3],[3,4]])) 26 | self.m23 = DenseMatrix(np.mat([[0,5],[0,5]])) 27 | #self.ph3 = DenseMatrix(np.mat([[12,14],[12,21]])) 28 | self.ph3 = DenseMatrix(np.mat([[300,350],[300,525]])) 29 | 30 | self.m14 = DenseMatrix(np.mat([[4,3],[3,4],[0,0]])) 31 | self.m24 = DenseMatrix(np.mat([[0,5],[0,5],[0,0]])) 32 | #self.ph4 = DenseMatrix(np.mat([[12,14],[12,21],[0,0]])) 33 | self.ph4 = DenseMatrix(np.mat([[300,350],[300,525],[0,0]])) 34 | 35 | self.m15 = DenseMatrix(np.mat([[2,0],[0,0],[3,0]])) 36 | self.m25 = DenseMatrix(np.mat([[3,3],[0,0],[6,4]])) 37 | #self.ph5 = DenseMatrix(np.mat([[5,2],[0,0],[10,5]])) 38 | self.ph5 = DenseMatrix(np.mat([[20,8],[0,0],[90,45]])) 39 | 40 | self.m16 = DenseMatrix(np.mat([[0,0],[0,0]])) 41 | self.m26 = DenseMatrix(np.mat([[0,0],[0,0]])) 42 | self.ph6 = DenseMatrix(np.mat([[0,0],[0,0]])) 43 | 44 | self.m17 = DenseMatrix(np.mat([[2,0],[3,0]])) 45 | self.m27 = DenseMatrix(np.mat([[0,1],[0,2]])) 46 | #self.ph7 = DenseMatrix(np.mat([[4,5],[5,4]])) 47 | self.ph7 = DenseMatrix(np.mat([[16,20],[45,36]])) 48 | 49 | def test_train_exact(self): 50 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)), 51 | (self.m12, self.m22, self.ph2, 5 / (3.0)), 52 | (self.m13, self.m23, self.ph3, 6), 53 | (self.m14, self.m24, self.ph4, 6), 54 | (self.m15, self.m25, self.ph5, 5 / (3.0)), 55 | (self.m16, self.m26, self.ph6, 2), 56 | (self.m17, self.m27, self.ph7, 2) 57 | ] 58 | 59 | for arg1, arg2, phrase, lambda_ in test_cases: 60 | m = Dilation() 61 | m._solve(arg1, arg2, phrase) 62 | self.assertAlmostEqual(m._lambda, lambda_) 63 | # 64 | def test_compose_exact(self): 65 | 66 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)), 67 | (self.m13, self.m23, self.ph3, 6), 68 | (self.m14, self.m24, self.ph4, 6) 69 | ] 70 | for arg1, arg2, phrase, lambda_ in test_cases: 71 | 72 | m = Dilation() 73 | m._solve(arg1, arg2, phrase) 74 | res = m._compose(arg1, arg2) 75 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2) 76 | 77 | m = 
Dilation(lambda_) 78 | res = m._compose(arg1, arg2) 79 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2) 80 | 81 | 82 | def test_train_random(self): 83 | test_cases = [1.0,2.0,3.0] 84 | rows = 4 85 | cols = 3 86 | m1 = np.random.rand(rows,cols) 87 | m2 = np.random.rand(rows,cols) 88 | 89 | 90 | for lambda_ in test_cases: 91 | m = Dilation(lambda_) 92 | result_p = m._compose(DenseMatrix(m1), DenseMatrix(m2)) 93 | 94 | m = Dilation() 95 | m._solve(DenseMatrix(m1),DenseMatrix(m2),result_p) 96 | self.assertAlmostEqual(lambda_, m._lambda) 97 | 98 | 99 | if __name__ == "__main__": 100 | #import sys;sys.argv = ['', 'Test.testName'] 101 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/dimensionality_reduction_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 28, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.transformation.dim_reduction.svd import Svd 9 | from composes.transformation.dim_reduction.nmf import Nmf 10 | from composes.matrix.linalg import Linalg 11 | from composes.matrix.dense_matrix import DenseMatrix 12 | from composes.matrix.sparse_matrix import SparseMatrix 13 | 14 | class DimReductionTest(unittest.TestCase): 15 | 16 | 17 | def setUp(self): 18 | pass 19 | 20 | 21 | def tearDown(self): 22 | pass 23 | 24 | def test_nmf(self): 25 | test_cases = [np.mat([[1,2,3],[2,4,6],[4,17,13]], dtype = np.double), 26 | np.mat([[1,0,0]], dtype = np.double)] 27 | 28 | for in_mat in test_cases: 29 | red = Nmf(2) 30 | d_mat = DenseMatrix(in_mat) 31 | #wd_init, hd_init = red.random_init(d_mat) 32 | wd_init, hd_init = red.v_col_init(d_mat) 33 | 34 | s_mat = SparseMatrix(in_mat) 35 | ws_init = SparseMatrix(wd_init) 36 | hs_init = SparseMatrix(hd_init) 37 | 38 | wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init) 39 | ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, 
hs_init) 40 | 41 | #TESTED IT AGAINST MATLAB IMPLEMENTATION - ALL GOOD 42 | #print wd_mat.mat 43 | #print hd_mat.mat 44 | #print ws_mat.mat.todense() 45 | #print hs_mat.mat.todense() 46 | print "V:", in_mat 47 | print "WH:", (ws_mat*hs_mat).mat.todense() 48 | 49 | np.testing.assert_array_almost_equal(wd_mat.mat, 50 | ws_mat.mat.todense(), 2) 51 | np.testing.assert_array_almost_equal(hd_mat.mat, 52 | hs_mat.mat.todense(), 2) 53 | 54 | def test_svd(self): 55 | test_cases = [(DenseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])), 56 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0], 57 | [ 4.38544220e+00, 6.06349536e+00, 0], 58 | [ 6.76369708e+02, -4.91431927e-02, 0]]), 59 | np.mat([[0.0059,0.9979,0.0636], 60 | [0.3255,-0.0621,0.9434], 61 | [0.945,0.015,-0.325]]).transpose())] 62 | 63 | 64 | 65 | for x, us_expected, v_expected in test_cases: 66 | 67 | svd_red = Svd(2) 68 | us, transmat = svd_red.apply(x) 69 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 70 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 71 | 72 | svd_red = Svd(3) 73 | us, transmat = svd_red.apply(x) 74 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 75 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 76 | 77 | svd_red = Svd(6) 78 | us, transmat = svd_red.apply(x) 79 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 80 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 81 | 82 | svd_red = Svd(1) 83 | us, transmat = svd_red.apply(x) 84 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:1], 2) 85 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:1], 2) 86 | 87 | 88 | test_cases = [(SparseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])), 89 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0], 90 | [ 4.38544220e+00, 6.06349536e+00, 0], 91 | [ 6.76369708e+02, -4.91431927e-02, 0]]), 92 | np.mat([[0.0059,0.9979,0.0636], 93 | [0.3255,-0.0621,0.9434], 94 | 
class Test(unittest.TestCase):
    """Smoke tests for the evaluate_similarities pipeline entry point."""

    def setUp(self):
        # All input fixtures live in the shared pipeline test-resource folder.
        self.dir_ = data_dir + "pipelines_test_resources/"


    def tearDown(self):
        pass


    def test_simple(self):

        # Single-file mode: correlate columns 1 (gold) and 3 (prediction)
        # of pred1.txt with both Pearson and Spearman.
        es.main(["evaluate_similarities.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "pred1.txt",
                 "-m", "pearson,spearman",
                 "-c", "1,3",
                 ])

        # Batch mode: --in_dir/--filter evaluate every file in the directory
        # whose name matches "pred" (the -i argument is ignored in this mode).
        es.main(["evaluate_similarities.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "pred1.txt",
                 "--in_dir", self.dir_,
                 "--filter", "pred",
                 "-m", "pearson,spearman",
                 "-c", "1,3",
                 ])

if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
class Test(unittest.TestCase):
    """Behaviour of composes.utils.matrix_utils.resolve_type_conflict."""

    def test_resolve_type_conflict(self):
        # One 1x2 matrix in every representation the helper must accept.
        raw = np.mat([1, 2])
        dense = DenseMatrix(raw)
        sparse = SparseMatrix(raw)

        # Mixed wrapper types, raw numpy/scipy inputs, and the empty list.
        two = resolve_type_conflict([dense, sparse], DenseMatrix)
        three = resolve_type_conflict([sparse, dense, dense], DenseMatrix)
        nothing = resolve_type_conflict([], DenseMatrix)
        raw_inputs = resolve_type_conflict([raw, csr_matrix(raw)], DenseMatrix)

        # Every element comes back converted to the requested matrix type.
        for converted in two + three + raw_inputs:
            self.assertIsInstance(converted, DenseMatrix)

        # An empty input list maps to an empty output list.
        self.assertListEqual([], nothing)


if __name__ == "__main__":
    unittest.main()
-------------------------------------------------------------------------------- /src/unitest/model_export_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 16, 2012 3 | 4 | @author: nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from unitest import data_dir 9 | from composes.matrix.dense_matrix import DenseMatrix 10 | from composes.semantic_space.space import Space 11 | 12 | from composes.composition.weighted_additive import WeightedAdditive 13 | from composes.composition.full_additive import FullAdditive 14 | from composes.composition.dilation import Dilation 15 | from composes.composition.lexical_function import LexicalFunction 16 | from composes.exception.illegal_state_error import IllegalStateError 17 | 18 | class ModelExportingTest(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.prefix = data_dir + "output/model" 22 | def test_weighted_additive(self): 23 | 24 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 25 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 26 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 27 | self.row = ["a", "b"] 28 | self.ft = ["f1","f2"] 29 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 30 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 31 | m = WeightedAdditive() 32 | m.export(self.prefix + ".add1") 33 | m.train([("a","a","a_a")], self.space1, self.space2) 34 | m.export(self.prefix + ".add2") 35 | 36 | def test_full_additive(self): 37 | 38 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 39 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 40 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 41 | self.row = ["a", "b"] 42 | self.ft = ["f1","f2"] 43 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 44 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 45 | m = FullAdditive() 46 | self.assertRaises(IllegalStateError, m.export,self.prefix + ".full1") 47 | 
m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2) 48 | 49 | m.export(self.prefix + ".full2") 50 | 51 | def test_dilation(self): 52 | 53 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 54 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 55 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 56 | self.row = ["a", "b"] 57 | self.ft = ["f1","f2"] 58 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 59 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 60 | m = Dilation() 61 | m.export(self.prefix + ".dil1") 62 | m.train([("a","b","a_b")], self.space1, self.space2) 63 | m.export(self.prefix + ".dil2") 64 | 65 | def test_lexical_function(self): 66 | 67 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 68 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 69 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 70 | self.row = ["a", "b"] 71 | self.ft = ["f1","f2"] 72 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 73 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 74 | m = LexicalFunction() 75 | m._MIN_SAMPLES = 1 76 | self.assertRaises(IllegalStateError, m.export, self.prefix + ".lf1") 77 | m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2) 78 | m.export(self.prefix + ".lf2") 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | #import sys;sys.argv = ['', 'Test.test_weighted_additive'] 84 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/neighbour_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 19, 2012 3 | 4 | @author: thenghia.pham 5 | ''' 6 | import unittest 7 | from unitest import data_dir 8 | from unitest import toolkit_dir 9 | import pipelines.compute_neighbours as find_neighbours 10 | from pipelines import build_core_space as bcs 11 | 12 | import pytest 13 | 14 | 15 | def read_neighbours_list(file_name): 16 | result = [] 17 | word = 
def read_neighbours_list(file_name):
    """Parse a neighbours file into [(word, [(neighbour, score), ...]), ...].

    The file is a sequence of blocks, optionally separated by blank lines:
    a line with a single token introduces a target word, and every
    following multi-token line contributes one (neighbour, score) pair
    (both kept as strings) for that word.
    """
    result = []
    word = None
    neighbours = []
    with open(file_name) as f:
        for line in f:
            elements = line.strip().split()
            if not elements:
                # Blank separator lines carry no information.
                continue
            if len(elements) == 1:
                # New target word: flush the block collected so far.
                if word is not None:
                    result.append((word, neighbours))
                    neighbours = []
                # BUG FIX: the original only assigned `word` in an `else`
                # branch, i.e. for the very first block; every later block
                # was appended under the *first* word and the later words
                # were lost. The word must be updated for every new block.
                word = elements[0]
            else:
                neighbours.append((elements[0], elements[1]))
    # Flush the final block (no trailing single-token line follows it).
    if word is not None:
        result.append((word, neighbours))
    return result
find_neighbours.main(["compute_neighbours.py", 88 | "-m", "euclidean", 89 | "-n", "2", 90 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_), 91 | "%sconfig/neighbours_config.cfg" %self.dir_ 92 | ]) 93 | find_neighbours.main(["compute_neighbours.py", 94 | "-m", "euclidean", 95 | "-i", self.dir_ + "pipelines_test_resources/neighbours_input.txt", 96 | "-n", "2", 97 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_), 98 | "-o" "/home/georgianadinu/work/FAKE_PATH" 99 | ]) 100 | #neighbours_list = read_neighbours_list(self.dir_ + "NEIGHBOURS.neighbours_input.txt.euclidean") 101 | #print len(neighbours_list) 102 | 103 | 104 | def tearDown(self): 105 | pass 106 | 107 | 108 | def testName(self): 109 | pass 110 | 111 | 112 | if __name__ == "__main__": 113 | #import sys;sys.argv = ['', 'Test.testName'] 114 | unittest.main() 115 | -------------------------------------------------------------------------------- /src/unitest/operation_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 9 | from composes.transformation.dim_reduction.svd import Svd 10 | from composes.transformation.dim_reduction.nmf import Nmf 11 | from composes.semantic_space.operation import ScalingOperation 12 | from composes.semantic_space.operation import DimensionalityReductionOperation 13 | from composes.matrix.dense_matrix import DenseMatrix 14 | from composes.matrix.sparse_matrix import SparseMatrix 15 | from composes.exception.illegal_state_error import IllegalStateError 16 | 17 | 18 | class Test(unittest.TestCase): 19 | 20 | 21 | 
def setUp(self): 22 | self.m1 = np.array([[1,2,3]]) 23 | self.m2 = np.array([[3]]) 24 | self.m3 = np.array([[4,2,6]]) 25 | self.m4 = np.array([[2]]) 26 | 27 | self.x = np.mat([[1,2,3],[2,4,6],[4,675,43]]) 28 | self.us = np.mat([[ 2.19272110e+00, 3.03174768e+00], 29 | [ 4.38544220e+00, 6.06349536e+00], 30 | [ 6.76369708e+02, -4.91431927e-02]]) 31 | 32 | self.xnmf = np.mat([[1,2,3],[2,4,6],[4,17,13]]) 33 | 34 | def tearDown(self): 35 | pass 36 | 37 | 38 | def test_apply_dimensionality_reduction(self): 39 | 40 | test_cases =[(self.x, self.us)] 41 | red = Svd(2) 42 | 43 | for in_mat, expected_us_mat in test_cases: 44 | op = DimensionalityReductionOperation(red) 45 | tmp_mat = in_mat.copy() 46 | 47 | out_us_mat = op.apply(DenseMatrix(in_mat)).mat 48 | np.testing.assert_array_almost_equal(expected_us_mat, out_us_mat, 2) 49 | 50 | np.testing.assert_array_equal(in_mat, tmp_mat) 51 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat)) 52 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat)) 53 | 54 | 55 | def test_project_dimensionality_reduction(self): 56 | 57 | test_cases =[(self.x, self.us)] 58 | red = Svd(2) 59 | 60 | for in_mat, expected_us_mat in test_cases: 61 | op = DimensionalityReductionOperation(red) 62 | tmp_mat = in_mat.copy() 63 | 64 | self.assertRaises(IllegalStateError, op.project, DenseMatrix(in_mat)) 65 | 66 | op.apply(DenseMatrix(in_mat)).mat 67 | out_proj_mat = op.project(DenseMatrix(in_mat)).mat 68 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat, 2) 69 | 70 | np.testing.assert_array_equal(in_mat, tmp_mat) 71 | 72 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat)) 73 | 74 | out_proj_mat2 = op.project(DenseMatrix(in_mat)).mat 75 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat2, 2) 76 | 77 | def test_project_dimensionality_reduction_nmf(self): 78 | 79 | test_cases = [self.xnmf] 80 | red = Nmf(2) 81 | 82 | for in_mat in test_cases: 83 | d_in_mat = 
DenseMatrix(in_mat) 84 | op = DimensionalityReductionOperation(red) 85 | tmp_mat = in_mat.copy() 86 | 87 | self.assertRaises(IllegalStateError, op.project, d_in_mat) 88 | 89 | out_core_mat = op.apply(d_in_mat).mat 90 | out_proj_mat = op.project(d_in_mat).mat 91 | np.testing.assert_array_almost_equal(out_proj_mat, out_core_mat, 5) 92 | 93 | np.testing.assert_array_equal(in_mat, tmp_mat) 94 | 95 | self.assertRaises(IllegalStateError, op.apply, d_in_mat) 96 | 97 | out_proj_mat2 = op.project(d_in_mat).mat 98 | np.testing.assert_array_almost_equal(out_proj_mat2, out_core_mat, 5) 99 | 100 | 101 | def test_apply_weighting_operation(self): 102 | test_cases = [(self.m1, np.array([[0,0,0]])), 103 | (self.m2, np.array([[0]]))] 104 | w = PpmiWeighting() 105 | for in_mat, expected_mat in test_cases: 106 | op = ScalingOperation(w) 107 | tmp_mat = in_mat.copy() 108 | out_mat = op.apply(DenseMatrix(in_mat)).mat 109 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 110 | np.testing.assert_array_equal(in_mat, tmp_mat) 111 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat)) 112 | 113 | def test_project_weighting_operation(self): 114 | test_cases = [(self.m1, self.m3, 115 | np.array([[0.69314718,0,0]])), 116 | (self.m2, self.m4, np.array([[0]]))] 117 | w = PpmiWeighting() 118 | for (core_mat, per_mat, expected_mat) in test_cases: 119 | op = ScalingOperation(w) 120 | tmp_mat = per_mat.copy() 121 | 122 | self.assertRaises(IllegalStateError, op.project, 123 | DenseMatrix(per_mat)) 124 | 125 | op.apply(DenseMatrix(core_mat)) 126 | out_mat = op.project(DenseMatrix(per_mat)).mat 127 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 128 | np.testing.assert_array_equal(per_mat, tmp_mat) 129 | 130 | out_mat = op.project(DenseMatrix(per_mat)).mat 131 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 132 | 133 | if __name__ == "__main__": 134 | #import sys;sys.argv = ['', 'Test.testName'] 135 | unittest.main() 
-------------------------------------------------------------------------------- /src/unitest/regression_learner_utils_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 9, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.utils.regression_learner import RidgeRegressionLearner 9 | from composes.utils.regression_learner import LstsqRegressionLearner 10 | from composes.matrix.dense_matrix import DenseMatrix 11 | from composes.utils.matrix_utils import padd_matrix 12 | 13 | class Test(unittest.TestCase): 14 | 15 | 16 | def test_trivial_crossvalidation(self): 17 | 18 | for i in range(1, 10): 19 | m_a = DenseMatrix(np.mat(np.random.random((i + 1,4)))) 20 | m_b = DenseMatrix(np.mat(np.random.random((i + 1,4)))) 21 | tmp_a = m_a.mat.copy() 22 | tmp_b = m_b.mat.copy() 23 | 24 | learner = RidgeRegressionLearner(param_range=[3], intercept=False) 25 | solution = learner.train(m_a, m_b) 26 | 27 | learner2 = RidgeRegressionLearner(param = 3, intercept=False) 28 | solution2 = learner2.train(m_a, m_b) 29 | 30 | np.testing.assert_array_equal(tmp_a, m_a.mat) 31 | np.testing.assert_array_equal(tmp_b, m_b.mat) 32 | np.testing.assert_array_equal(solution.mat, solution2.mat) 33 | 34 | learner = RidgeRegressionLearner(param_range=[3], intercept=False) 35 | solution = learner.train(m_a, m_b) 36 | 37 | np.testing.assert_array_equal(tmp_a, m_a.mat) 38 | np.testing.assert_array_equal(tmp_b, m_b.mat) 39 | np.testing.assert_array_equal(solution.mat, solution2.mat) 40 | 41 | learner = RidgeRegressionLearner(param_range=[0], intercept=False) 42 | solution = learner.train(m_a, m_b) 43 | 44 | learner2 = LstsqRegressionLearner(intercept=False) 45 | solution2 = learner2.train(m_a, m_b) 46 | 47 | np.testing.assert_array_almost_equal(solution.mat, solution2.mat, 3) 48 | 49 | 50 | def test_crossvalidation(self): 51 | 52 | a = DenseMatrix(np.matrix([[1, 1],[2, 3],[4, 
6]])) 53 | b = DenseMatrix(np.matrix([[12, 15, 18],[21, 27, 33],[35, 46, 57]])) 54 | res = DenseMatrix(np.matrix([[1, 2, 3],[4, 5, 6],[7, 8, 9]])) 55 | 56 | learner = RidgeRegressionLearner(intercept=True, param_range=[0]) 57 | learner2 = LstsqRegressionLearner(intercept=False) 58 | 59 | res1 = learner2.train(a, b) 60 | res2 = learner.train(a, b) 61 | 62 | np.testing.assert_array_almost_equal(res2.mat[:-1,:], res[0:2,:].mat, 6) 63 | np.testing.assert_array_almost_equal(res2.mat[-1,:], res[2:3,:].mat, 6) 64 | 65 | new_a = padd_matrix(a, 1) 66 | self.assertGreater(((a * res1) - b).norm(), ((new_a * res2) - b).norm()) 67 | 68 | 69 | if __name__ == "__main__": 70 | #import sys;sys.argv = ['', 'Test.test_trivial_cases'] 71 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/sparse_matrix_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | import numpy.testing 9 | from scipy.sparse import csr_matrix 10 | from scipy.sparse import csc_matrix 11 | from scipy.sparse.sputils import isintlike 12 | from composes.matrix.sparse_matrix import SparseMatrix 13 | from composes.matrix.dense_matrix import DenseMatrix 14 | 15 | 16 | 17 | class TestSparseMatrix(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.a = np.array([[1,2,3],[4,0,5]]) 21 | self.b = np.array([[0,0,0],[0,0,0]]) 22 | 23 | self.c = np.array([[0,0],[0,0],[0,0]]) 24 | self.d = np.array([[1,0],[0,1]]) 25 | self.e = np.array([1,10]) 26 | self.f = np.array([1,10,100]) 27 | 28 | self.matrix_a = SparseMatrix(self.a) 29 | self.matrix_b = SparseMatrix(self.b) 30 | 31 | self.matrix_c = SparseMatrix(self.c) 32 | self.matrix_d = SparseMatrix(self.d) 33 | 34 | 35 | def tearDown(self): 36 | pass 37 | 38 | def test_reshape(self): 39 | 40 | test_cases = [(self.matrix_a, (1,6), 
self.a.reshape((1,6))), 41 | (self.matrix_a, (3,2), self.a.reshape((3,2))), 42 | (self.matrix_b, (1,6), self.b.reshape((1,6))), 43 | (self.matrix_b, (6,1), self.b.reshape((6,1))), 44 | (self.matrix_b, (2,3), self.b.reshape((2,3))), 45 | ] 46 | 47 | for mat, shape, expected in test_cases: 48 | mat.reshape(shape) 49 | np.testing.assert_array_equal(mat.mat.todense(), expected) 50 | self.assertTupleEqual(shape, mat.shape) 51 | 52 | 53 | def test_reshape_raises(self): 54 | 55 | test_cases = [(3,0), (3,3), 3, (3,3,3), ("3","5"), (2,"4")] 56 | 57 | for shape in test_cases: 58 | self.assertRaises(ValueError, self.matrix_a.reshape, shape) 59 | 60 | 61 | def test_init(self): 62 | nparr = self.a 63 | test_cases = [nparr, 64 | np.mat(nparr), 65 | csr_matrix(nparr), 66 | csc_matrix(nparr), 67 | DenseMatrix(nparr)] 68 | 69 | for inmat in test_cases: 70 | outmat = SparseMatrix(inmat) 71 | self.assertIsInstance(outmat.mat, csr_matrix) 72 | numpy.testing.assert_array_equal(nparr, 73 | np.array(outmat.mat.todense())) 74 | 75 | def test_add(self): 76 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 77 | (self.matrix_a, self.matrix_b, np.mat(self.a)) 78 | ] 79 | 80 | for (term1, term2, expected) in test_cases: 81 | sum_ = term1 + term2 82 | numpy.testing.assert_array_equal(sum_.mat.todense(), expected) 83 | self.assertIsInstance(sum_, type(term1)) 84 | 85 | def test_add_raises(self): 86 | test_cases = [(self.matrix_a, self.a), 87 | (self.matrix_a, DenseMatrix(self.a))] 88 | 89 | for (term1, term2) in test_cases: 90 | self.assertRaises(TypeError, term1.__add__, term2) 91 | 92 | def test_mul(self): 93 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])), 94 | (self.matrix_d, self.matrix_a, self.matrix_a.mat.todense()), 95 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])), 96 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]])) 97 | ] 98 | 99 | for (term1, term2, expected) in test_cases: 100 | sum_ = term1 * term2 101 | 
numpy.testing.assert_array_equal(sum_.mat.todense(), expected) 102 | self.assertIsInstance(sum_, type(term1)) 103 | 104 | def test_mul_raises(self): 105 | test_cases = [(self.matrix_a, self.a), 106 | (self.matrix_a, DenseMatrix(self.a)), 107 | (self.matrix_a, "3")] 108 | 109 | for (term1, term2) in test_cases: 110 | self.assertRaises(TypeError, term1.__mul__, term2) 111 | 112 | def test_get_item(self): 113 | 114 | out_mat = SparseMatrix(self.a)[0,:] 115 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:])) 116 | 117 | out_int = SparseMatrix(self.a)[0,1] 118 | self.assertEqual(out_int, 2) 119 | 120 | out_mat = SparseMatrix(self.a)[0,1:2] 121 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,1:2])) 122 | 123 | out_mat = SparseMatrix(self.a)[0] 124 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:])) 125 | 126 | 127 | def test_scale_rows(self): 128 | outcome = np.mat([[1,2,3],[40,0,50]]) 129 | test_cases = [(self.matrix_a.copy(), self.e, outcome), 130 | (self.matrix_a.copy(), np.mat(self.e).T, outcome), 131 | ] 132 | 133 | for (term1, term2, expected) in test_cases: 134 | term1 = term1.scale_rows(term2) 135 | numpy.testing.assert_array_equal(term1.mat.todense(), expected) 136 | 137 | def test_scale_columns(self): 138 | test_cases = [(self.matrix_a.copy(), self.f, np.mat([[1,20,300],[4,0,500]]))] 139 | 140 | for (term1, term2, expected) in test_cases: 141 | term1 = term1.scale_columns(term2) 142 | numpy.testing.assert_array_equal(term1.mat.todense(), expected) 143 | self.assertIsInstance(term1.mat, csr_matrix) 144 | 145 | def test_scale_raises(self): 146 | test_cases = [(self.matrix_a, self.f, ValueError, self.matrix_a.scale_rows), 147 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns), 148 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows), 149 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns), 150 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows), 
151 | ] 152 | for (term1, term2, error_type, function) in test_cases: 153 | self.assertRaises(error_type, function, term2) 154 | 155 | def test_plog(self): 156 | m = SparseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])) 157 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]]) 158 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]]) 159 | test_cases = [(self.matrix_a.copy(), a_expected), 160 | (m, m_expected) 161 | ] 162 | 163 | for (term, expected) in test_cases: 164 | term.plog() 165 | numpy.testing.assert_array_almost_equal(term.mat.todense(), expected, 3) 166 | 167 | if __name__ == "__main__": 168 | #import sys;sys.argv = ['', 'Test.testName'] 169 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/tc_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 19, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from unitest import data_dir 9 | import pipelines.train_composition as tc 10 | import pipelines.build_core_space as bcs 11 | from composes.utils import io_utils 12 | from composes.semantic_space.space import Space 13 | 14 | class Test(unittest.TestCase): 15 | 16 | 17 | def setUp(self): 18 | self.dir_ = data_dir + "pipelines_test_resources/" 19 | 20 | #use as a conversion tool, creates the files we want 21 | bcs.main(["build_core_space.py", 22 | "-l", self.dir_ + "log1.txt", 23 | "-i", self.dir_ + "N_mat", 24 | "-o", self.dir_, 25 | "--input_format", "dm", 26 | ]) 27 | 28 | bcs.main(["build_core_space.py", 29 | "-l", self.dir_ + "log1.txt", 30 | "-i", self.dir_ + "AN_mat", 31 | "-o", self.dir_, 32 | "--input_format", "dm", 33 | ]) 34 | 35 | def tearDown(self): 36 | pass 37 | 38 | def _test_equal_spaces_structs(self, sp, new_sp): 39 | self.assertListEqual(sp.id2row, new_sp.id2row) 40 | self.assertListEqual(sp.id2column, new_sp.id2column) 41 | 
    def test_simple_lstsq_no_inter(self):
        """Train a lexical_func model without an intercept and check that
        plain lstsq and ridge at lambda=0 (crossvalidation off) both learn
        the flattened 2x2 identity as the function matrix for "big"."""
        tc.main(["train_composition.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "an_train_data.txt",
                 "-o", self.dir_,
                 "-m", "lexical_func",
                 "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
                 "-a", self.dir_ + "CORE_SS.N_mat.pkl",
                 "-r", "lstsq",
                 "--intercept", "False",
                 "--export_params", "True"
                 ])

        # The trained model is pickled under a name derived from the input file.
        trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space = trained.function_space
        # Without an intercept the learned function is the flattened 2x2 identity.
        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([1,0,0,1]), 10)
        self.assertTupleEqual(new_space.element_shape, (2,2))
        self.assertListEqual(new_space.id2row, ["big"])
        self.assertListEqual(new_space.id2column, [])

        # --export_params also dumps the parameters in dense (dm) format;
        # reloading them must reproduce the function space exactly.
        a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                              format="dm")

        self._test_equal_spaces_dense(a_space, new_space)

        # Same training run, but with ridge regression at lambda=0 and
        # crossvalidation disabled: must coincide with the lstsq solution.
        tc.main(["train_composition.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "an_train_data.txt",
                 "-o", self.dir_,
                 "-m", "lexical_func",
                 "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
                 "-a", self.dir_ + "CORE_SS.N_mat.pkl",
                 "-r", "ridge",
                 "--lambda", "0",
                 "--crossvalidation", "False",
                 "--intercept", "False",
                 "--export_params", "True"
                 ])

        trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space2 = trained.function_space
        np.testing.assert_array_almost_equal(new_space2.cooccurrence_matrix.mat,
                                             np.mat([1,0,0,1]), 10)
        self.assertTupleEqual(new_space2.element_shape, (2,2))
        self.assertListEqual(new_space2.id2row, ["big"])
        self.assertListEqual(new_space2.id2column, [])

        a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                              format="dm")

        self._test_equal_spaces_dense(a_space, new_space2)
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | from composes.utils.space_utils import list2dict 8 | #from composes.utils.py_matrix_utils import coo 9 | 10 | class UtilsTest(unittest.TestCase): 11 | 12 | 13 | def test_list2dict(self): 14 | test_cases = [(["a","v","d"], {"a":0, "v":1, "d":2}), ([], {})] 15 | 16 | for list_, expected in test_cases: 17 | outcome = list2dict(list_) 18 | self.assertDictEqual(outcome, expected) 19 | 20 | self.assertRaises(ValueError, list2dict, ["a","v","a"]) 21 | 22 | #def test_coo(self): 23 | # coo() 24 | 25 | if __name__ == "__main__": 26 | #import sys;sys.argv = ['', 'Test.testName'] 27 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/weighting_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | import numpy.testing 9 | from composes.matrix.dense_matrix import DenseMatrix 10 | from composes.matrix.sparse_matrix import SparseMatrix 11 | from composes.transformation.scaling.epmi_weighting import EpmiWeighting 12 | from composes.transformation.scaling.plog_weighting import PlogWeighting 13 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 14 | from composes.transformation.scaling.plmi_weighting import PlmiWeighting 15 | from composes.transformation.scaling.row_normalization import RowNormalization 16 | from composes.transformation.scaling.normalization import Normalization 17 | 18 | class Test(unittest.TestCase): 19 | 20 | 21 | def setUp(self): 22 | self.a = np.array([[1,2,3],[4,0,5]]) 23 | self.b = np.array([[1,2,3]]) 24 | 25 | self.c = np.array([[0,0],[0,0],[0,0]]) 26 | self.d = np.array([[1,-1],[0,1]]) 27 | 28 | self.e = 
np.array([[1,2,3],[1,0,0]]) 29 | self.f = np.array([1,10,100]) 30 | 31 | 32 | 33 | def tearDown(self): 34 | pass 35 | 36 | 37 | def single_case_test(self, matrix_, expected, w): 38 | 39 | matrix_copy = matrix_.copy() 40 | dm = DenseMatrix(matrix_) 41 | sm = SparseMatrix(matrix_) 42 | 43 | out1 = w.apply(dm) 44 | out2 = w.apply(sm) 45 | 46 | numpy.testing.assert_array_almost_equal(out1.mat, expected, 7) 47 | numpy.testing.assert_array_almost_equal(out2.mat.todense(), expected, 7) 48 | 49 | numpy.testing.assert_array_equal(dm.mat, matrix_copy) 50 | numpy.testing.assert_array_equal(matrix_, matrix_copy) 51 | numpy.testing.assert_array_equal(sm.mat.todense(), matrix_copy) 52 | 53 | def single_case_raises_test(self, matrix_, error_type, w): 54 | dm = DenseMatrix(matrix_) 55 | sm = SparseMatrix(matrix_) 56 | 57 | self.assertRaises(error_type, w.apply, dm) 58 | self.assertRaises(error_type, w.apply, sm) 59 | 60 | 61 | def test_epmi(self): 62 | w = EpmiWeighting() 63 | test_cases = [(self.b, np.mat([[1,1,1]])), 64 | (self.c, self.c) 65 | ] 66 | for matrix_, expected in test_cases: 67 | self.single_case_test(matrix_, expected, w) 68 | 69 | def test_plog(self): 70 | w = PlogWeighting() 71 | test_cases = [(np.mat([[1,1,1]]), np.mat([[0,0,0]])), 72 | (self.c, self.c) 73 | ] 74 | for matrix_, expected in test_cases: 75 | self.single_case_test(matrix_, expected, w) 76 | 77 | def test_ppmi(self): 78 | w = PpmiWeighting() 79 | test_cases = [(self.b, np.mat([[0,0,0]])), 80 | (self.c, self.c) 81 | ] 82 | 83 | for matrix_, expected in test_cases: 84 | self.single_case_test(matrix_, expected, w) 85 | 86 | 87 | def test_plmi(self): 88 | w = PlmiWeighting() 89 | test_cases = [(self.b, np.mat([[0,0,0]])), 90 | (self.c, self.c), 91 | (self.e, np.mat([[0.,0.30830136,0.46245204], 92 | [1.25276297,0.,0.]])) 93 | ] 94 | 95 | for matrix_, expected in test_cases: 96 | self.single_case_test(matrix_, expected, w) 97 | 98 | def test_row_norm(self): 99 | w = RowNormalization() 100 | test_cases = 
    def test_norm(self):
        """Normalization rescales the whole matrix by one global factor;
        the default, "length" and "sum" criteria are all exercised."""
        # Default criterion: the expectations divide by the global sum
        # (b sums to 6, e to 7) -- identical to the "sum" block below.
        w = Normalization()
        test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
                      (self.c, self.c),
                      (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
                                       [1./7.0,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)

        # "length" criterion: divide by the global Frobenius norm
        # (sqrt(14) for b, sqrt(15) for e).
        w = Normalization(criterion = "length")
        test_cases = [(self.b, np.mat([[0.26726124,0.53452248,0.80178373]])),
                      (self.c, self.c),
                      (self.e, np.mat([[0.25819889,0.51639778,0.77459667],
                                       [0.25819889,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)


        # Explicit "sum" criterion: same expectations as the default above.
        w = Normalization(criterion = "sum")
        test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
                      (self.c, self.c),
                      (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
                                       [1./7.0,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)