├── .gitignore
├── .project
├── .pydevproject
├── .travis.yml
├── README.rst
├── resource
│   └── unittest
│       ├── config
│       │   ├── neighbours_config.cfg
│       │   └── sim_config.cfg
│       ├── pipelines_test_resources
│       │   ├── AN_mat.dm
│       │   ├── A_mat.dm
│       │   ├── N_mat.dm
│       │   ├── aan_train_data.txt
│       │   ├── an_train_data.txt
│       │   ├── config1.txt
│       │   ├── mat1.col
│       │   ├── mat1.cols
│       │   ├── mat1.pickle
│       │   ├── mat1.row
│       │   ├── mat1.sm
│       │   ├── mat1.sm.gz
│       │   ├── mat2.dm
│       │   ├── mat2.dm.gz
│       │   ├── mat3.cols
│       │   ├── mat3.dm
│       │   ├── mat3.sm
│       │   ├── na_train_data.txt
│       │   ├── neighbours_input.txt
│       │   ├── pred1.txt
│       │   ├── pred2.txt
│       │   └── sim_input.txt
│       └── space_test_resources
│           ├── col1.col
│           ├── col2.col
│           ├── col3.col
│           ├── col4.col
│           ├── col5.col
│           ├── data1.cols
│           ├── data1.dense
│           ├── data1.sparse
│           ├── data10.dense
│           ├── data10.sparse
│           ├── data2.cols
│           ├── data2.dense
│           ├── data2.sparse
│           ├── data3.cols
│           ├── data3.dense
│           ├── data3.sparse
│           ├── data4.cols
│           ├── data4.dense
│           ├── data4.sparse
│           ├── data5.cols
│           ├── data7.cols
│           ├── data7.dense
│           ├── data7.sparse
│           ├── data8.dense
│           ├── data8.sparse
│           ├── data9.cols
│           ├── data9.dense
│           ├── data9.sparse
│           ├── row1.row
│           ├── row2.row
│           ├── row3.row
│           ├── tmp.col
│           ├── tmp.cols
│           ├── tmp.dm
│           ├── tmp.row
│           ├── tmp.rows
│           └── tmp.sm
├── setup.py
├── src
│   ├── composes
│   │   ├── __init__.py
│   │   ├── composition
│   │   │   ├── __init__.py
│   │   │   ├── composition_model.py
│   │   │   ├── dilation.py
│   │   │   ├── full_additive.py
│   │   │   ├── lexical_function.py
│   │   │   ├── multiplicative.py
│   │   │   └── weighted_additive.py
│   │   ├── exception
│   │   │   ├── __init__.py
│   │   │   ├── illegal_state_error.py
│   │   │   └── invalid_argument_error.py
│   │   ├── matrix
│   │   │   ├── __init__.py
│   │   │   ├── dense_matrix.py
│   │   │   ├── linalg.py
│   │   │   ├── matrix.py
│   │   │   └── sparse_matrix.py
│   │   ├── semantic_space
│   │   │   ├── __init__.py
│   │   │   ├── operation.py
│   │   │   ├── peripheral_space.py
│   │   │   └── space.py
│   │   ├── similarity
│   │   │   ├── __init__.py
│   │   │   ├── cos.py
│   │   │   ├── dot_prod.py
│   │   │   ├── euclidean.py
│   │   │   ├── lin.py
│   │   │   └── similarity.py
│   │   ├── transformation
│   │   │   ├── __init__.py
│   │   │   ├── dim_reduction
│   │   │   │   ├── __init__.py
│   │   │   │   ├── dimensionality_reduction.py
│   │   │   │   ├── nmf.py
│   │   │   │   └── svd.py
│   │   │   ├── feature_selection
│   │   │   │   ├── __init__.py
│   │   │   │   ├── feature_selection.py
│   │   │   │   └── top_feature_selection.py
│   │   │   └── scaling
│   │   │       ├── __init__.py
│   │   │       ├── epmi_weighting.py
│   │   │       ├── normalization.py
│   │   │       ├── plmi_weighting.py
│   │   │       ├── plog_weighting.py
│   │   │       ├── ppmi_weighting.py
│   │   │       ├── row_normalization.py
│   │   │       └── scaling.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── crossvalidation_utils.py
│   │       ├── gen_utils.py
│   │       ├── io_utils.py
│   │       ├── log_utils.py
│   │       ├── matrix_utils.py
│   │       ├── mem_utils.py
│   │       ├── num_utils.py
│   │       ├── py_matrix_utils.py
│   │       ├── regression_learner.py
│   │       ├── scoring_utils.py
│   │       └── space_utils.py
│   ├── examples
│   │   ├── __init__.py
│   │   ├── cmd_ex01.sh
│   │   ├── cmd_ex02.sh
│   │   ├── cmd_ex03.sh
│   │   ├── cmd_ex04.sh
│   │   ├── cmd_ex05.sh
│   │   ├── cmd_ex06.sh
│   │   ├── cmd_ex07.sh
│   │   ├── data
│   │   │   ├── in
│   │   │   │   ├── config1.cfg
│   │   │   │   ├── config2.cfg
│   │   │   │   ├── data_to_comp.txt
│   │   │   │   ├── data_to_comp2.txt
│   │   │   │   ├── ex01.cols
│   │   │   │   ├── ex01.rows
│   │   │   │   ├── ex01.sm
│   │   │   │   ├── ex05.cols
│   │   │   │   ├── ex05.sm
│   │   │   │   ├── ex10.cols
│   │   │   │   ├── ex10.rows
│   │   │   │   ├── ex10.sm
│   │   │   │   ├── ex19-n.cols
│   │   │   │   ├── ex19-n.sm
│   │   │   │   ├── ex19-svo.cols
│   │   │   │   ├── ex19-svo.sm
│   │   │   │   ├── sim_data.txt
│   │   │   │   ├── sim_data2.txt
│   │   │   │   ├── sim_data3.txt
│   │   │   │   ├── train_data.txt
│   │   │   │   ├── word_list.txt
│   │   │   │   ├── word_pairs1.txt
│   │   │   │   ├── word_pairs2.txt
│   │   │   │   └── word_sims.txt
│   │   │   └── out
│   │   │       ├── COMPOSED_SS.ex10.pkl
│   │   │       ├── PER_SS.ex05.pkl
│   │   │       ├── PHRASE_SS.ex10.pkl
│   │   │       ├── ex01.cols
│   │   │       ├── ex01.dm
│   │   │       ├── ex01.pkl
│   │   │       ├── ex01.rows
│   │   │       ├── ex01.sm
│   │   │       ├── ex10.pkl
│   │   │       ├── model01.params
│   │   │       └── model01.pkl
│   │   ├── ex01.py
│   │   ├── ex02.py
│   │   ├── ex03.py
│   │   ├── ex04.py
│   │   ├── ex05.py
│   │   ├── ex06.py
│   │   ├── ex07.py
│   │   ├── ex08.py
│   │   ├── ex09.py
│   │   ├── ex10.py
│   │   ├── ex11.py
│   │   ├── ex12.py
│   │   ├── ex13.py
│   │   ├── ex14.py
│   │   ├── ex15.py
│   │   ├── ex16.py
│   │   ├── ex17.py
│   │   ├── ex18.py
│   │   ├── ex19.py
│   │   ├── ex20.py
│   │   ├── exercise.sh
│   │   └── full_example.py
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── apply_composition.py
│   │   ├── build_core_space.py
│   │   ├── build_peripheral_space.py
│   │   ├── compute_neighbours.py
│   │   ├── compute_similarities.py
│   │   ├── evaluate_similarities.py
│   │   ├── pipeline_utils.py
│   │   └── train_composition.py
│   └── unitest
│       ├── __init__.py
│       ├── ac_pipeline_test.py
│       ├── bcs_pipeline_test.py
│       ├── bps_pipeline_test.py
│       ├── conftest.py
│       ├── crossvalidation_utils_test.py
│       ├── dense_matrix_test.py
│       ├── dilation_test.py
│       ├── dimensionality_reduction_test.py
│       ├── es_pipeline_test.py
│       ├── feat_selection_test.py
│       ├── full_aditive_test.py
│       ├── lexical_function_test.py
│       ├── linalg_test.py
│       ├── matrix_utils_test.py
│       ├── model_export_test.py
│       ├── neighbour_pipeline_test.py
│       ├── operation_test.py
│       ├── peripheral_space_test.py
│       ├── regression_learner_utils_test.py
│       ├── sim_pipeline_test.py
│       ├── similarity_test.py
│       ├── space_test.py
│       ├── sparse_matrix_test.py
│       ├── tc_pipeline_test.py
│       ├── utils_test.py
│       ├── weighted_additive_test.py
│       └── weighting_test.py
└── tox.ini
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
3 | resource/unittest/pipelines_test_resources/
4 |
5 | .Python
6 | .coverage
7 |
8 | .tox/
9 |
10 | *.egg/
11 | src/dissect.egg-info/
12 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>gittoolkit</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
4 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
5 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
6 | <path>/gittoolkit/src</path>
7 | <path>/gittoolkit/src</path>
8 | </pydev_pathproperty>
9 | </pydev_project>
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.7"
4 | before_install:
5 |   - sudo apt-get update -qq
6 |   - sudo apt-get install -qq python-numpy python-scipy python-matplotlib
7 |   - rm /home/travis/virtualenv/python2.7/lib/python2.7/no-global-site-packages.txt
8 | # command to install dependencies
9 | install:
10 |   - pip install cython --use-mirrors
11 |   - pip install . --use-mirrors
12 | # command to run tests
13 | script: python setup.py test
14 | after_success:
15 |   - sudo apt-get install python-yaml
16 |   - pip install coveralls pytest-cov . --use-mirrors
17 |   - py.test --cov=composes --cov=pipelines --cov-report=term-missing src/unitest
18 |   - coveralls
19 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | DIStributional SEmantics Composition Toolkit
2 | ============================================
3 |
4 |
5 | For documentation, please refer to http://clic.cimec.unitn.it/composes/toolkit/
6 |
--------------------------------------------------------------------------------
/resource/unittest/config/neighbours_config.cfg:
--------------------------------------------------------------------------------
1 | # configuration file for neighbours pipeline
2 | [compute_neighbours]
3 |
4 | #input file
5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/neighbours_input.txt
6 |
7 | # similarity measure
8 | sim_measure=cos
9 |
10 | # output directory
11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/
12 |
13 | # space file(s)
14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle,/home/thenghia.pham/git/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle
15 |
16 | # number of neighbours
17 | no_neighbours=3
18 |
19 | # log file
20 | log=/home/georgianadinu/work/localtoolkit/toolkit/log/sim_log.txt
--------------------------------------------------------------------------------
/resource/unittest/config/sim_config.cfg:
--------------------------------------------------------------------------------
1 | # configuration file for similarity pipeline
2 | [compute_similarities]
3 |
4 | #input file
5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/sim_input.txt
6 |
7 | # similarity measure
8 | sim_measure=cos,dot_prod,lin
9 |
10 | # output directory
11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/
12 |
13 | # space file(s)
14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl,/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl
15 | # columns
16 | columns=0,1
17 |
18 | # log file
19 | log=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/AN_mat.dm:
--------------------------------------------------------------------------------
1 | big_car 3 4
2 | big_man 5 6
3 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/A_mat.dm:
--------------------------------------------------------------------------------
1 | big 3 4
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/N_mat.dm:
--------------------------------------------------------------------------------
1 | car 3 4
2 | man 5 6
3 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/aan_train_data.txt:
--------------------------------------------------------------------------------
1 | big big_car big_big_car
2 | big big_man big_big_man
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/an_train_data.txt:
--------------------------------------------------------------------------------
1 | big car big_car
2 | big man big_man
3 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/config1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/config1.txt
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat1.col:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat1.cols:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat1.pickle:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (ccomposes.semantic_space.space
5 | Space
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_id2row'
14 | p6
15 | (lp7
16 | S'red'
17 | p8
18 | asS'_column2id'
19 | p9
20 | (dp10
21 | S'car'
22 | p11
23 | I0
24 | ssS'_operations'
25 | p12
26 | (lp13
27 | sS'_id2column'
28 | p14
29 | (lp15
30 | g11
31 | asS'_element_shape'
32 | p16
33 | (I1
34 | tp17
35 | sS'_cooccurrence_matrix'
36 | p18
37 | g0
38 | (ccomposes.matrix.sparse_matrix
39 | SparseMatrix
40 | p19
41 | g2
42 | Ntp20
43 | Rp21
44 | (dp22
45 | S'_mat'
46 | p23
47 | g0
48 | (cscipy.sparse.csr
49 | csr_matrix
50 | p24
51 | g2
52 | Ntp25
53 | Rp26
54 | (dp27
55 | S'format'
56 | p28
57 | S'csr'
58 | p29
59 | sS'_shape'
60 | p30
61 | (I1
62 | I1
63 | tp31
64 | sS'indptr'
65 | p32
66 | cnumpy.core.multiarray
67 | _reconstruct
68 | p33
69 | (cnumpy
70 | ndarray
71 | p34
72 | (I0
73 | tp35
74 | S'b'
75 | p36
76 | tp37
77 | Rp38
78 | (I1
79 | (I2
80 | tp39
81 | cnumpy
82 | dtype
83 | p40
84 | (S'i4'
85 | p41
86 | I0
87 | I1
88 | tp42
89 | Rp43
90 | (I3
91 | S'<'
92 | p44
93 | NNNI-1
94 | I-1
95 | I0
96 | tp45
97 | bI00
98 | S'\x00\x00\x00\x00\x01\x00\x00\x00'
99 | p46
100 | tp47
101 | bsS'indices'
102 | p48
103 | g33
104 | (g34
105 | (I0
106 | tp49
107 | g36
108 | tp50
109 | Rp51
110 | (I1
111 | (I1
112 | tp52
113 | g43
114 | I00
115 | S'\x00\x00\x00\x00'
116 | p53
117 | tp54
118 | bsS'maxprint'
119 | p55
120 | I50
121 | sS'data'
122 | p56
123 | g33
124 | (g34
125 | (I0
126 | tp57
127 | g36
128 | tp58
129 | Rp59
130 | (I1
131 | (I1
132 | tp60
133 | g40
134 | (S'f8'
135 | p61
136 | I0
137 | I1
138 | tp62
139 | Rp63
140 | (I3
141 | S'<'
142 | p64
143 | NNNI-1
144 | I-1
145 | I0
146 | tp65
147 | bI00
148 | S'\x00\x00\x00\x00\x00\x00\x08@'
149 | p66
150 | tp67
151 | bsbsbsS'_row2id'
152 | p68
153 | (dp69
154 | g8
155 | I0
156 | ssb.
--------------------------------------------------------------------------------
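
The pickle stream above is a protocol-0 dump of a Space object wrapping a 1x1 scipy.sparse CSR matrix (row "red", column "car", value 3.0). A minimal sketch of inspecting such a file with the toolkit's io_utils loader (the path assumes a checkout of the repository):

    from composes.utils import io_utils

    space = io_utils.load("resource/unittest/pipelines_test_resources/mat1.pickle")
    print(space.id2row)                              # ['red']
    print(space.id2column)                           # ['car']
    print(space.cooccurrence_matrix.mat.todense())   # [[ 3.]]
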
/resource/unittest/pipelines_test_resources/mat1.row:
--------------------------------------------------------------------------------
1 | red
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat1.sm:
--------------------------------------------------------------------------------
1 | red car 3.000000
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat1.sm.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat1.sm.gz
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat2.dm:
--------------------------------------------------------------------------------
1 | car 3 4 5
2 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat2.dm.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat2.dm.gz
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat3.cols:
--------------------------------------------------------------------------------
1 | f1
2 | f2
3 | f3
4 | f4
5 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat3.dm:
--------------------------------------------------------------------------------
1 | a 1 2 3 1
2 | b 2 4 6 1
3 | c 4 675 43 1
4 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/mat3.sm:
--------------------------------------------------------------------------------
1 | a f1 1
2 | a f2 2
3 | a f3 3
4 | a f4 1
5 | b f1 2
6 | b f2 4
7 | b f3 6
8 | b f4 1
9 | c f1 4
10 | c f2 675
11 | c f3 43
12 | c f4 1
13 |
14 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/na_train_data.txt:
--------------------------------------------------------------------------------
1 | car big big_car
2 | man big big_man
3 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/neighbours_input.txt:
--------------------------------------------------------------------------------
1 | a
2 | b
3 | c
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/pred1.txt:
--------------------------------------------------------------------------------
1 | 23 car 23 sdrs
2 | 4 man 4 sdfs
3 | 13 cad 13 sfd
4 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/pred2.txt:
--------------------------------------------------------------------------------
1 | 23 car 23 sdrs
2 | 4 man 4 sdfs
3 | 13 cad 13 sfd
4 |
--------------------------------------------------------------------------------
/resource/unittest/pipelines_test_resources/sim_input.txt:
--------------------------------------------------------------------------------
1 | a b 1
2 | a c 0
3 | a a 1
4 | b c 1
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/col1.col:
--------------------------------------------------------------------------------
1 | man
2 | car
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/col2.col:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/col3.col:
--------------------------------------------------------------------------------
1 | man
2 | car
3 | man
4 | car
5 | car
6 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/col4.col:
--------------------------------------------------------------------------------
1 | airplane
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/col5.col:
--------------------------------------------------------------------------------
1 | man sdrf
2 | car 3
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data1.cols:
--------------------------------------------------------------------------------
1 | car
2 | man
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data1.dense:
--------------------------------------------------------------------------------
1 | red 3 5
2 | blue 0 10
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data1.sparse:
--------------------------------------------------------------------------------
1 | red car 3
2 | red man 5
3 | blue man 10
4 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data10.dense:
--------------------------------------------------------------------------------
1 | car man 3
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data10.sparse:
--------------------------------------------------------------------------------
1 | man car car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data2.cols:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data2.dense:
--------------------------------------------------------------------------------
1 | red 3
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data2.sparse:
--------------------------------------------------------------------------------
1 | red car 3
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data3.cols:
--------------------------------------------------------------------------------
1 | car
2 | man
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data3.dense:
--------------------------------------------------------------------------------
1 | red 5 0
2 | red 10 0
3 | blue 0 6
4 |
5 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data3.sparse:
--------------------------------------------------------------------------------
1 | red car 5
2 | red car 10
3 | blue man 6
4 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data4.cols:
--------------------------------------------------------------------------------
1 | car
2 | man
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data4.dense:
--------------------------------------------------------------------------------
1 | red 5 0
2 | blue 0 6
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data4.sparse:
--------------------------------------------------------------------------------
1 | red car 5
2 | blue man 6
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data5.cols:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data5.cols
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data7.cols:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data7.dense:
--------------------------------------------------------------------------------
1 | red 0
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data7.sparse:
--------------------------------------------------------------------------------
1 | red car 0
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data8.dense:
--------------------------------------------------------------------------------
1 | car 3 5 6
2 | man 3 5
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data8.sparse:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data8.sparse
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data9.cols:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data9.dense:
--------------------------------------------------------------------------------
1 | car
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/data9.sparse:
--------------------------------------------------------------------------------
1 | man car 4 5
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/row1.row:
--------------------------------------------------------------------------------
1 | red
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/row2.row:
--------------------------------------------------------------------------------
1 | blue
2 | red
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/row3.row:
--------------------------------------------------------------------------------
1 | blue
2 | red
3 | blue
4 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.col:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.col
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.cols:
--------------------------------------------------------------------------------
1 | f1
2 | f2
3 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.dm:
--------------------------------------------------------------------------------
1 | a 0 0
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.row:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.row
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.rows:
--------------------------------------------------------------------------------
1 | a
2 |
--------------------------------------------------------------------------------
/resource/unittest/space_test_resources/tmp.sm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.sm
--------------------------------------------------------------------------------
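
Taken together, the resources above exercise the toolkit's two text formats: sparse ".sm" files hold one "row column count" triple per line, dense ".dm" files hold one "row v1 v2 ..." line per row, and ".rows"/".cols" files list the row and column strings. A minimal sketch of loading one such resource into a core semantic space (Space.build is the toolkit's documented entry point; the path assumes a repository checkout):

    from composes.semantic_space.space import Space

    space = Space.build(data="resource/unittest/pipelines_test_resources/mat3.sm",
                        cols="resource/unittest/pipelines_test_resources/mat3.cols",
                        format="sm")
    print(space.id2row)       # ['a', 'b', 'c']
    print(space.id2column)    # ['f1', 'f2', 'f3', 'f4']
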
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | from setuptools import setup
5 | from setuptools.command.test import test as TestCommand
6 |
7 |
8 | class PyTest(TestCommand):
9 |     def finalize_options(self):
10 |         TestCommand.finalize_options(self)
11 |         self.test_args = 'src/unitest'
12 |         self.test_suite = True
13 |
14 |     def run_tests(self):
15 |         # import here, because outside the eggs aren't loaded
16 |         import pytest
17 |         errno = pytest.main(self.test_args)
18 |         sys.exit(errno)
19 |
20 |
21 | setup(
22 |     name='dissect',
23 |     version='0.1.0',
24 |     description='COMPOSES DISSECT TOOLKIT',
25 |     author='Georgiana Dinu, The Nghia Pham, Marco Baroni',
26 |     author_email='georgiana.dinu@unitn.it,thenghia.pham@unitn.it',
27 |     url='http://clic.cimec.unitn.it/composes/toolkit/',
28 |     install_requires=['numpy', 'scipy', 'sparsesvd'],
29 |     tests_require=['pytest>=2.4.2'],
30 |     cmdclass={'test': PyTest},
31 |     package_dir={'': 'src'},
32 |     packages=[
33 |         'composes',
34 |         'composes.composition',
35 |         'composes.matrix',
36 |         'composes.semantic_space',
37 |         'composes.exception',
38 |         'composes.similarity',
39 |         'composes.transformation',
40 |         'composes.utils',
41 |         'composes.transformation.dim_reduction',
42 |         'composes.transformation.feature_selection',
43 |         'composes.transformation.scaling',
44 |     ],
45 | )
46 |
--------------------------------------------------------------------------------
/src/composes/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | class NullHandler(logging.Handler):
4 |     """For python versions <= 2.6; same as `logging.NullHandler` in 2.7."""
5 |     def emit(self, record):
6 |         pass
7 |
8 | logger = logging.getLogger(__name__)
9 | if len(logger.handlers) == 0:  # To ensure reload() doesn't add another one
10 |     logger.addHandler(NullHandler())
11 |
12 | #logging.basicConfig(filename='composes.log', filemode='w+',level=logging.DEBUG, format = "")
13 |
--------------------------------------------------------------------------------
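
The no-op handler above keeps the library silent by default; an application that wants to see the toolkit's log output simply configures logging itself, e.g.:

    import logging

    # route composes' log records to a file; any standard logging setup works
    logging.basicConfig(filename="composes.log", filemode="w", level=logging.DEBUG)
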
/src/composes/composition/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/composition/__init__.py
--------------------------------------------------------------------------------
/src/composes/composition/dilation.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 15, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import numpy as np
7 | from composition_model import CompositionModel
8 | from composes.utils.num_utils import is_numeric
9 | from composes.utils.py_matrix_utils import nonzero_invert
10 |
11 |
12 | class Dilation(CompositionModel):
13 |     """
14 |     Implements the dilation compositional model:
15 |
16 |     :math:`\\vec{p} = (\\vec{u} \\cdot \\vec{u}) \\vec{v} + (\\lambda - 1) (\\vec{u} \\cdot \\vec{v}) \\vec{u}`
17 |
18 |     where :math:`\\vec{p}` is the vector of the composed phrase, :math:`\\vec{u}, \\vec{v}` the vectors of the components
19 |     and :math:`\\lambda` is a scalar.
20 |
21 |     """
22 |
23 |
24 |     _name = "dilation"
25 |
26 |     _lambda = 2
27 |
28 |
29 |     def __init__(self, lambda_=None):
30 |         """
31 |         Constructor.
32 |
33 |         Args:
34 |             lambda_ : numeric, value of the lambda parameter. Optional.
35 |         """
36 |
37 |         if lambda_ is not None:
38 |             if not is_numeric(lambda_):
39 |                 raise ValueError("Parameter not numeric: %s " % (type(lambda_)))
40 |             else:
41 |                 self._lambda = lambda_
42 |
43 |     def _solve(self, arg1_mat, arg2_mat, phrase_mat):
44 |
45 |         v1_row_norms = arg1_mat.norm(1)
46 |         v1_row_sqr_norms = np.multiply(v1_row_norms, v1_row_norms)
47 |
48 |         v2_minus_p = arg2_mat.scale_rows(v1_row_sqr_norms) - phrase_mat
49 |         v1_dot_prod_v2_minus_p = arg1_mat.multiply(v2_minus_p).sum(1)
50 |
51 |         v1_v2 = arg1_mat.multiply(arg2_mat).sum(1)
52 |         v1_v2_sqr = np.multiply(v1_v2, v1_v2)
53 |
54 |         nom = np.multiply(v1_v2_sqr, v1_row_sqr_norms).sum()
55 |         denom = np.multiply(v1_v2, v1_dot_prod_v2_minus_p).sum()
56 |
57 |         if nom != 0:
58 |             self._lambda = 1 - denom/nom
59 |         else:
60 |             self._lambda = 2
61 |
62 |
63 |     def _compose(self, arg1_mat, arg2_mat):
64 |         # TODO: this is inefficient here, we do two loops instead of one:
65 |         # one loop in get_rows in parent.compose() and one loop here
66 |         # comp = ((self._lambda - 1) * v1.multiply(v2).sum()/pow(v1.norm(), 2)) * v1 + v2
67 |
68 |         v1_row_norms = arg1_mat.norm(1)
69 |         scale_factors1 = arg1_mat.multiply(arg2_mat).sum(1)
70 |         scale_factors2 = np.multiply(v1_row_norms, v1_row_norms)
71 |
72 |         arg1_mat_scaled = arg1_mat.scale_rows(scale_factors1)
73 |         arg2_mat_scaled = arg2_mat.scale_rows(scale_factors2)
74 |
75 |         #print "FACTORS u:", ((self._lambda -1)*scale_factors1).sum()/float(len(scale_factors1))
76 |         #print "FACTORS v:", (scale_factors2).sum()/float(len(scale_factors2))
77 |
78 |         result = (self._lambda - 1) * arg1_mat_scaled + arg2_mat_scaled
79 |
80 |         return result
81 |
82 |     def get_lambda(self):
83 |         return self._lambda
84 |     """
85 |     Lambda parameter. Default, set to lambda=2.
86 |     """
87 |
88 |
89 |     def _export(self, filename):
90 |         with open(filename, "w") as output_stream:
91 |             output_stream.write("lambda\t%f" % self._lambda)
92 |
--------------------------------------------------------------------------------
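
A minimal numpy sketch of the dilation formula from the docstring, p = (u.u) v + (lambda - 1) (u.v) u, for a single pair of vectors (self-contained, independent of the toolkit's Matrix classes):

    import numpy as np

    u = np.array([1.0, 2.0, 3.0])
    v = np.array([4.0, 5.0, 6.0])
    lambda_ = 2  # the class default

    # u.u = 14 and u.v = 32, so with the default lambda p = 14*v + 32*u
    p = u.dot(u) * v + (lambda_ - 1) * u.dot(v) * u
    print(p)  # [  88.  134.  180.]
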
/src/composes/composition/full_additive.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 5, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from composition_model import CompositionModel
8 | from composes.utils.gen_utils import assert_is_instance
9 | from composes.utils.matrix_utils import is_array_or_matrix
10 | from composes.utils.matrix_utils import padd_matrix
11 | from composes.utils.matrix_utils import to_compatible_matrix_types
12 | from composes.utils.regression_learner import LstsqRegressionLearner
13 | from composes.utils.regression_learner import RegressionLearner
14 | from composes.utils.matrix_utils import resolve_type_conflict
15 | from composes.matrix.dense_matrix import DenseMatrix
16 | from composes.exception.illegal_state_error import IllegalStateError
17 |
18 |
19 | class FullAdditive(CompositionModel):
20 |     """
21 |     Implements the full additive compositional model:
22 |
23 |     :math:`\\vec{p} = A \\vec{u} + B \\vec{v}`
24 |
25 |     where :math:`\\vec{p}` is the vector of the composed phrase,
26 |     :math:`\\vec{u}, \\vec{v}`, the vectors of the components
27 |     and :math:`A`, :math:`B` are two matrices.
28 |
29 |     """
30 |     _name = "full_additive"
31 |     _mat_a_t = None
32 |     _mat_b_t = None
33 |
34 |
35 |     def __init__(self, A=None, B=None, learner=LstsqRegressionLearner()):
36 |         # TODO: very important, should be able to set the intercept
37 |         # when mat a and mat b are given, to true or false;
38 |         # now by default it is false.
39 |         """
40 |         Constructor.
41 |
42 |         Args:
43 |             A= : matrix A, of matrix-like type (Matrix, ndarray,
44 |                 numpy matrix, scipy matrix). Optional (parameters can be set
45 |                 through training.)
46 |
47 |             B= : matrix B, matrix-like type. Optional.
48 |
49 |             learner= : regression learner object, of type RegressionLearner.
50 |                 Optional, default LstsqRegressionLearner.
51 |         """
52 |         if A is not None and B is not None:
53 |             mat_a = A
54 |             mat_b = B
55 |             if not is_array_or_matrix(mat_a):
56 |                 raise TypeError("expected matrix type, received: %s"
57 |                                 % type(mat_a))
58 |
59 |             if not is_array_or_matrix(mat_b):
60 |                 raise TypeError("expected matrix type, received: %s"
61 |                                 % type(mat_b))
62 |
63 |             mat_a, mat_b = to_compatible_matrix_types(mat_a, mat_b)
64 |             self._mat_a_t = mat_a.transpose()
65 |             self._mat_b_t = mat_b.transpose()
66 |             self._has_intercept = False
67 |
68 |         else:
69 |             self._regression_learner = learner
70 |             self._has_intercept = self._regression_learner.has_intercept()
71 |
72 |
73 |     def _solve(self, arg1_mat, arg2_mat, phrase_mat):
74 |
75 |         self._has_intercept = self._regression_learner.has_intercept()
76 |
77 |         result = self._regression_learner.train(arg1_mat.hstack(arg2_mat), phrase_mat)
78 |
79 |         self._mat_a_t = result[0:arg1_mat.shape[1], :]
80 |         self._mat_b_t = result[arg1_mat.shape[1]:, :]
81 |
82 |
83 |     def _compose(self, arg1_mat, arg2_mat):
84 |         # NOTE: when we get into this compose, arg1_mat and arg2_mat have the same type
85 |         [mat_a_t, mat_b_t, arg1_mat] = resolve_type_conflict([self._mat_a_t,
86 |                                                               self._mat_b_t,
87 |                                                               arg1_mat],
88 |                                                              type(arg1_mat))
89 |         if self._has_intercept:
90 |             return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 1) * mat_b_t
91 |         else:
92 |             return arg1_mat * mat_a_t + arg2_mat * mat_b_t
93 |
94 |     def set_regression_learner(self, regression_learner):
95 |         assert_is_instance(regression_learner, RegressionLearner)
96 |         self._regression_learner = regression_learner
97 |
98 |     def get_regression_learner(self):
99 |         return self._regression_learner
100 |
101 |     regression_learner = property(get_regression_learner, set_regression_learner)
102 |     """
103 |     Regression method to be used in training, of type RegressionLearner.
104 |     Default is LstsqRegressionLearner.
105 |     """
106 |
107 |     def _build_id2column(self, arg1_space, arg2_space):
108 |         return []
109 |
110 |     def _export(self, filename):
111 |         if self._mat_a_t is None or self._mat_b_t is None:
112 |             raise IllegalStateError("cannot export an untrained FullAdditive model.")
113 |
114 |         with open(filename, "w") as output_stream:
115 |             output_stream.write("A\n")
116 |             output_stream.write(str(DenseMatrix(self._mat_a_t).mat.T))
117 |             output_stream.write("\nB\n")
118 |
119 |             if self._has_intercept:
120 |                 output_stream.write(str(DenseMatrix(self._mat_b_t[:-1,]).mat.T))
121 |                 output_stream.write("\nIntercept\n")
122 |                 output_stream.write(str(DenseMatrix(self._mat_b_t[-1,]).mat.T))
123 |             else:
124 |                 output_stream.write(str(DenseMatrix(self._mat_b_t).mat.T))
125 |
126 |
127 |     def get_mat_a_t(self):
128 |         return self._mat_a_t
129 |     mat_a_t = property(get_mat_a_t)
130 |     """
131 |     Transpose of matrix A parameter, of type Matrix.
132 |     """
133 |
134 |     def get_mat_b_t(self):
135 |         return self._mat_b_t
136 |     mat_b_t = property(get_mat_b_t)
137 |     """
138 |     Transpose of matrix B parameter, of type Matrix.
139 |     """
140 |
--------------------------------------------------------------------------------
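
Training the full additive model amounts to a least-squares fit: stack each training pair [u, v] into a row of X and solve X W = P, where W stacks A^T over B^T. A minimal numpy sketch with phrases generated by a known rule (self-contained; the toolkit itself delegates this step to a RegressionLearner):

    import numpy as np

    U = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [1.0, 2.0]])  # arg1 rows
    V = np.array([[2.0, 1.0], [1.0, 3.0], [0.0, 1.0], [1.0, 1.0]])  # arg2 rows
    P = 2 * U + 3 * V            # phrases built with A = 2*I, B = 3*I

    X = np.hstack([U, V])        # one [u, v] row per training pair
    W = np.linalg.solve(X, P)    # exactly determined here; lstsq in general
    A_t, B_t = W[:2], W[2:]
    print(A_t)                   # 2 * identity
    print(B_t)                   # 3 * identity
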
/src/composes/composition/multiplicative.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 5, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from composition_model import CompositionModel
8 | from composes.exception.illegal_state_error import IllegalOperationError
9 |
10 | class Multiplicative(CompositionModel):
11 |     """
12 |     Implements the component-wise multiplication compositional model:
13 |
14 |     :math:`\\vec{p} = \\vec{u} \\cdot \\vec{v}`
15 |
16 |     where :math:`\\vec{p}` is the vector of the composed phrase and
17 |     :math:`\\vec{u}, \\vec{v}` are the vectors of the components.
18 |
19 |     :math:`\\vec{u} \\cdot \\vec{v} = (u_1v_1,...,u_nv_n)`
20 |     """
21 |
22 |     _name = "multiplicative"
23 |
24 |     def __init__(self):
25 |         """
26 |         Constructor
27 |         """
28 |
29 |     def train(self):
30 |         """
31 |         The multiplicative model cannot be trained: it has no parameters.
32 |         """
33 |         raise IllegalOperationError("Cannot train multiplicative model!")
34 |
35 |     def _compose(self, arg1_mat, arg2_mat):
36 |         return arg1_mat.multiply(arg2_mat)
37 |
38 |     def export(self, filename):
39 |         """
40 |         The multiplicative model cannot be exported: it has no parameters.
41 |         """
42 |         raise IllegalOperationError("cannot export a Multiplicative model.")
43 |
--------------------------------------------------------------------------------
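
A minimal end-to-end sketch on a tiny hand-built space (the Space(matrix, id2row, id2column) constructor and the compose(data, space) call follow the usage documented for the toolkit; Python 2, like the rest of the codebase):

    import numpy as np
    from composes.matrix.dense_matrix import DenseMatrix
    from composes.semantic_space.space import Space
    from composes.composition.multiplicative import Multiplicative

    # two-word toy space with three contextual features
    my_space = Space(DenseMatrix(np.mat([[1.0, 2.0, 3.0],
                                         [4.0, 5.0, 6.0]])),
                     ["good", "book"], ["f1", "f2", "f3"])

    model = Multiplicative()
    composed = model.compose([("good", "book", "good_book")], my_space)
    print(composed.id2row)                    # ['good_book']
    print(composed.cooccurrence_matrix.mat)   # [[  4.  10.  18.]]
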
/src/composes/composition/weighted_additive.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 5, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from composition_model import CompositionModel
8 | from composes.matrix.dense_matrix import DenseMatrix
9 | from composes.utils.num_utils import is_numeric
10 | # from composes.utils.mem_utils import get_mem_usage
11 | from composes.utils.matrix_utils import resolve_type_conflict
12 | import numpy as np
13 | import math
14 |
15 | class WeightedAdditive(CompositionModel):
16 |     """
17 |     Implements the weighted additive compositional model:
18 |
19 |     :math:`\\vec{p} = \\alpha \\vec{u} + \\beta \\vec{v}`
20 |
21 |     where :math:`\\vec{p}` is the vector of the composed phrase and
22 |     :math:`\\vec{u}, \\vec{v}` are the vectors of the components.
23 |
24 |     When :math:`\\alpha=\\beta=0.5` the model averages the component vectors
25 |     (vector addition up to a constant scaling).
26 |     """
27 |
28 |     _name = "weighted_additive"
29 |
30 |     """
31 |     double, in interval [0,1]
32 |     maximum overhead allowed: MAX_MEM_OVERHEAD ratio of peripheral space memory
33 |     """
34 |     MAX_MEM_OVERHEAD = 0.2
35 |
36 |
37 |     def __init__(self, alpha=None, beta=None):
38 |         """
39 |         Constructor.
40 |
41 |         Args:
42 |             alpha: alpha parameter, numeric type. Optional, can be set through
43 |                 training.
44 |             beta: beta parameter, numeric type. Optional, can be set through
45 |                 training.
46 |
47 |         Raises:
48 |             TypeError if alpha or beta are not numeric.
49 |         """
50 |         self._alpha = 0.5
51 |         self._beta = 0.5
52 |         if alpha is not None:
53 |             if not is_numeric(alpha):
54 |                 raise TypeError("Parameter not numeric: %s " % (type(alpha)))
55 |             else:
56 |                 self._alpha = alpha
57 |
58 |         if beta is not None:
59 |             if not is_numeric(beta):
60 |                 raise TypeError("Parameter not numeric: %s " % (type(beta)))
61 |             else:
62 |                 self._beta = beta
63 |
64 |         if alpha is not None and beta is None:
65 |             self._beta = 1 - self._alpha
66 |
67 |
68 |     def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list):
69 |
70 |         # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead;
71 |         # the /3.0 is needed because the train data needs 3 * len(train_data)
72 |         # memory (arg1 vector, arg2 vector, phrase vector)
73 |         chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] * self.MAX_MEM_OVERHEAD / 3.0) + 1
74 |
75 |         arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (0, 0, 0, 0, 0)
76 |
77 |         for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
78 |             beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))
79 |
80 |             arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
81 |             arg2_mat = arg2_space.get_rows(arg2_list[beg:end])
82 |             phrase_mat = phrase_space.get_rows(phrase_list[beg:end])
83 |
84 |             [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat,
85 |                                                                       arg2_mat,
86 |                                                                       phrase_mat],
87 |                                                                      DenseMatrix)
88 |
89 |             res = self._process(arg1_mat, arg2_mat, phrase_mat)
90 |             arg1_arg2_dot += res[0]
91 |             arg1_phrase_dot += res[1]
92 |             arg2_phrase_dot += res[2]
93 |             arg1_norm_sqr += res[3]
94 |             arg2_norm_sqr += res[4]
95 |
96 |
97 |         self._solve(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr)
98 |
99 |
100 |     def _process(self, arg1_mat, arg2_mat, phrase_mat):
101 |
102 |         # debug here
103 |         # remove when done
104 |         # print "Using %s MB " % (get_mem_usage())
105 |
106 |         arg1_arg2_dot = arg1_mat.multiply(arg2_mat).sum()
107 |         arg1_phrase_dot = arg1_mat.multiply(phrase_mat).sum()
108 |         arg2_phrase_dot = arg2_mat.multiply(phrase_mat).sum()
109 |
110 |         arg1_norm_sqr = pow(arg1_mat.norm(), 2)
111 |         arg2_norm_sqr = pow(arg2_mat.norm(), 2)
112 |
113 |         return arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr
114 |
115 |     def _solve(self, arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr):
116 |
117 |         a = np.linalg.pinv(np.mat([[arg1_norm_sqr, arg1_arg2_dot],
118 |                                    [arg1_arg2_dot, arg2_norm_sqr]]))
119 |         a = a * np.mat([[arg1_phrase_dot], [arg2_phrase_dot]])
120 |         self._alpha = a[0, 0]
121 |         self._beta = a[1, 0]
122 |
123 |
124 |     def _compose(self, arg1_mat, arg2_mat):
125 |         return self._alpha * arg1_mat + self._beta * arg2_mat
126 |
127 |     def _export(self, filename):
128 |         with open(filename, "w") as output_stream:
129 |             output_stream.write("alpha\t%f\n" % self._alpha)
130 |             output_stream.write("beta\t%f" % self._beta)
131 |
132 |     def get_alpha(self):
133 |         return self._alpha
134 |     alpha = property(get_alpha)
135 |     """
136 |     Alpha parameter, default 0.5.
137 |     """
138 |
139 |     def get_beta(self):
140 |         return self._beta
141 |     beta = property(get_beta)
142 |     """
143 |     Beta parameter, default 0.5.
144 |     """
145 |
--------------------------------------------------------------------------------
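
The closed-form _solve above is the 2x2 normal-equations solution minimizing ||alpha*U + beta*V - P|| over the training pairs. A minimal numpy sketch with phrases generated from known weights (self-contained):

    import numpy as np

    U = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # arg1 vectors
    V = np.array([[2.0, 1.0], [1.0, 3.0], [0.0, 1.0]])  # arg2 vectors
    P = 0.3 * U + 0.7 * V                               # phrases from alpha=0.3, beta=0.7

    uu, vv, uv = (U * U).sum(), (V * V).sum(), (U * V).sum()
    up, vp = (U * P).sum(), (V * P).sum()

    alpha, beta = np.linalg.pinv([[uu, uv], [uv, vv]]).dot([up, vp])
    print(alpha, beta)   # ~0.3 ~0.7
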
/src/composes/exception/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/exception/__init__.py
--------------------------------------------------------------------------------
/src/composes/exception/illegal_state_error.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jun 15, 2012
3 |
4 | @author: thenghia.pham
5 | '''
6 |
7 | class IllegalStateError(Exception):
8 |     '''
9 |     Raised when an operation is attempted on an object in an invalid state.
10 |     '''
11 |     def __init__(self, msg):
12 |         super(IllegalStateError, self).__init__(msg)
13 |
14 |
15 | class IllegalOperationError(Exception):
16 |     '''
17 |     Raised when an unsupported operation is requested.
18 |     '''
19 |     def __init__(self, msg):
20 |         super(IllegalOperationError, self).__init__(msg)
--------------------------------------------------------------------------------
/src/composes/exception/invalid_argument_error.py:
--------------------------------------------------------------------------------
1 |
2 | class InvalidArgumentError(Exception):
3 |     '''
4 |     Raised when a function receives an argument with an invalid value.
5 |     '''
6 |     def __init__(self, msg):
7 |         super(InvalidArgumentError, self).__init__(msg)
--------------------------------------------------------------------------------
/src/composes/matrix/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/matrix/__init__.py
--------------------------------------------------------------------------------
/src/composes/matrix/matrix.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 17, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from composes.utils.num_utils import is_numeric
8 | from composes.utils.py_matrix_utils import is_array
9 |
10 | class Matrix(object):
11 |     """
12 |     Provides a common interface for matrix implementations.
13 |
14 |     Provides a common interface for different matrix implementations
15 |     (sparse/dense). In vector space models, a matrix is used to encode
16 |     a set of entities such as words or phrases (rows) described in terms
17 |     of contextual features (columns).
18 |     """
19 |
20 |     def __init__(self, *args, **kwargs):
21 |         raise NotImplementedError()
22 |
23 |
24 |     def __add__(self, matrix_):
25 |         ''' + operation'''
26 |         self._assert_same_type(matrix_)
27 |         return type(self)(self.mat + matrix_.mat)
28 |
29 |     def __sub__(self, matrix_):
30 |         ''' - operation'''
31 |         self._assert_same_type(matrix_)
32 |         return type(self)(self.mat - matrix_.mat)
33 |
34 |     def __neg__(self):
35 |         ''' unary - operation'''
36 |         return type(self)(-self.mat)
37 |
38 |     def __mul__(self, factor):
39 |         ''' * operation'''
40 |         if is_numeric(factor):
41 |             return type(self)(self.mat * factor)
42 |         else:
43 |             self._assert_same_type(factor)
44 |             return type(self)(self.mat * factor.mat)
45 |
46 |     def __div__(self, factor):
47 |         ''' / operation'''
48 |         if is_numeric(factor):
49 |             if factor == 0:
50 |                 raise ZeroDivisionError("Division by zero")
51 |         else:
52 |             raise TypeError("expected numeric type, received %s" % (type(factor)))
53 |         return type(self)(self.mat / float(factor))
54 |
55 |     def __rmul__(self, factor):
56 |         ''' * operation'''
57 |         if is_numeric(factor):
58 |             return self.__mul__(factor)
59 |         raise TypeError("expected numeric type, received %s" % (type(factor)))
60 |
61 |
62 |     #TODO move all these asserts somewhere else
63 |     def _assert_same_type(self, operand):
64 |         if type(self) != type(operand):
65 |             raise TypeError("expected matrix of type %s, received %s" %
66 |                             (type(self), type(operand)))
67 |
68 |     def assert_same_shape(self, matrix_):
69 |         """
70 |         Asserts that the matrix has the same shape as a second matrix.
71 |
72 |         Args:
73 |             matrix_: A second matrix of type Matrix.
74 |
75 |         Raises:
76 |             ValueError: If the current matrix and the argument matrix
77 |                 do not have the same shape.
78 |         """
79 |
80 |         if self.mat.shape != matrix_.mat.shape:
81 |             raise ValueError("inconsistent shapes: %s %s"
82 |                              % (str(self.mat.shape), str(matrix_.mat.shape)))
83 |
84 |     #TODO move all these asserts somewhere else
85 |     def _assert_array(self, operand):
86 |         if not is_array(operand):
87 |             raise TypeError("expected array, received %s" % (type(operand)))
88 |
89 |
90 |     def sum(self, axis=None):
91 |         # return type is a dense matrix of shape (1, dimy) or (dimx, 1),
92 |         # or a number if axis is None
93 |         return self.mat.sum(axis)
94 |
95 |     def sorted_permutation(self, norm_function, axis_):
96 |         """
97 |         Computes the permutation resulting from sorting the matrix
98 |         on an axis, according to a function, in descending order.
99 |
100 |         Sorts the rows or the columns (as given by axis)
101 |         of a matrix according to a norm_function and returns
102 |         the permutation of this as a np.array
103 |
104 |         Args:
105 |             norm_function: One of sum/length. A function that
106 |                 takes an axis as an argument (i.e. 0 or 1) and
107 |                 returns an array of values (i.e. sum of all rows
108 |                 if axis = 0 and norm_function = sum).
109 |
110 |             axis_: axis value, one of 0/1
111 |
112 |         Returns:
113 |             perm_srtd: np.array containing the permutation of the
114 |                 sorting
115 |         """
116 |
117 |         #norms = norm_function(axis=axis_)
118 |
119 |         norms = norm_function(axis_).getA().flatten()
120 |         perm_srtd = sorted(range(len(norms)), key=norms.__getitem__,
121 |                            reverse=True)
122 |
123 |         return perm_srtd
124 |
125 |     def get_mat(self):
126 |         return self._mat
127 |
128 |     def set_mat(self, mat_):
129 |         self._mat = mat_
130 |
131 |     mat = property(get_mat, set_mat)
132 |     """
133 |     Stores the actual matrix structure of the Matrix object.
134 |     Of type numpy.matrix for DenseMatrix, and scipy.sparse.csr_matrix
135 |     for SparseMatrix.
136 |     """
137 |
138 |     def get_shape(self):
139 |         return self.mat.shape
140 |
141 |     shape = property(get_shape)
142 |     """
143 |     Shape of the matrix, tuple with two elements.
144 |     """
145 |
146 |     def copy(self):
147 |         return type(self)(self.mat.copy())
148 |
--------------------------------------------------------------------------------
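
A minimal sketch of the operator overloads above, exercised through the DenseMatrix subclass (which wraps a numpy matrix, so * between two DenseMatrix operands is a matrix product; Python 2, where / dispatches to __div__):

    import numpy as np
    from composes.matrix.dense_matrix import DenseMatrix

    a = DenseMatrix(np.mat([[1.0, 2.0], [3.0, 4.0]]))
    b = DenseMatrix(np.mat([[1.0, 1.0], [1.0, 1.0]]))

    print((a + b).mat)    # element-wise sum
    print((2 * a).mat)    # scalar product, via __rmul__
    print((a * b).mat)    # matrix product for same-type operands
    print((a / 2.0).mat)  # scalar division, via __div__
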
/src/composes/semantic_space/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/semantic_space/__init__.py
--------------------------------------------------------------------------------
/src/composes/semantic_space/peripheral_space.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 26, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from space import Space
8 | from numpy import array
9 | from composes.utils.space_utils import list2dict
10 | from composes.utils.space_utils import assert_dict_match_list
11 | from composes.utils.space_utils import assert_shape_consistent
12 | from composes.utils.space_utils import add_items_to_dict
13 | from composes.semantic_space.operation import FeatureSelectionOperation
14 | from composes.semantic_space.operation import DimensionalityReductionOperation
15 | from composes.utils.gen_utils import assert_is_instance
16 | from composes.matrix.matrix import Matrix
17 |
18 | class PeripheralSpace(Space):
19 |     '''
20 |     A semantic space whose rows are interpreted in the coordinate system of a core space.
21 |     '''
22 |
23 |
24 |     def __init__(self, core_space, matrix_, id2row, row2id=None):
25 |         """
26 |         Constructor.
27 |
28 |         Args:
29 |             core_space: Space type, the core space that this is peripheral to.
30 |             matrix_: Matrix type, the data matrix of the space
31 |             id2row: list, the row elements
32 |             row2id: dictionary, maps row strings to ids. Optional, built from
33 |                 id2row by default.
34 |
35 |         Returns:
36 |             A peripheral semantic space (type PeripheralSpace) on which the
37 |             core space operations have been projected. Column indexing structures
38 |             and operations are taken over from the core space.
39 |
40 |         Raises:
41 |             TypeError: if matrix_ or core_space are not of the correct type;
42 |             ValueError: if element shape is not consistent with
43 |                 the size of matrix rows, or
44 |                 if the matrix and the provided row and column
45 |                 indexing structures are not of consistent shapes.
46 |         """
47 |         assert_is_instance(matrix_, Matrix)
48 |         assert_is_instance(core_space, Space)
49 |         assert_is_instance(id2row, list)
50 |         # TODO: assert it is not a peripheral space here!
51 |
52 |         if row2id is None:
53 |             row2id = list2dict(id2row)
54 |         else:
55 |             assert_dict_match_list(row2id, id2row)
56 |
57 |         column2id = core_space.column2id
58 |         id2column = core_space.id2column
59 |
60 |         self._operations = list(core_space.operations)
61 |         self._row2id = row2id
62 |         self._id2row = id2row
63 |         self._column2id = column2id
64 |         self._id2column = id2column
65 |
66 |         self._cooccurrence_matrix = self._project_core_operations(matrix_)
67 |         assert_shape_consistent(self.cooccurrence_matrix, self._id2row,
68 |                                 self._id2column, self._row2id, self._column2id)
69 |
70 |         self._element_shape = (self._cooccurrence_matrix.shape[1],)
71 |
72 |
73 |     def _project_core_operations(self, matrix_):
74 |
75 |         for operation in self._operations:
76 |             if isinstance(operation, DimensionalityReductionOperation):
77 |                 self._id2column, self._column2id = [], {}
78 |
79 |             if isinstance(operation, FeatureSelectionOperation):
80 |                 if operation.original_columns:
81 |                     self._id2column = list(array(operation.original_columns)[operation.selected_columns])
82 |                     self._column2id = list2dict(self._id2column)
83 |                 else:
84 |                     self._id2column, self._column2id = [], {}
85 |
86 |             matrix_ = operation.project(matrix_)
87 |         return matrix_
88 |
89 |
90 |     def add_rows(self, matrix_, id2row):
91 |         """
92 |         Adds rows to a peripheral space.
93 |
94 |         Args:
95 |             matrix_: Matrix type, the matrix of the elements to be added.
96 |             id2row: list, string identifiers of the rows to be added.
97 |
98 |         Modifies the current space by appending the new rows.
99 |         All operations of the core space are projected to the new rows.
100 |
101 |         Raises:
102 |             ValueError: if attempting to add row strings which are already
103 |                 in the space, or if the matrix of the new data is not
104 |                 consistent in shape with the current data matrix.
105 |         """
106 |
107 |         try:
108 |             self._row2id = add_items_to_dict(self.row2id, id2row)
109 |         except ValueError:
110 |             raise ValueError("Found duplicate keys when appending rows to"
111 |                              " peripheral space.")
112 |
113 |         if matrix_.mat.shape[0] != len(id2row):
114 |             raise ValueError("Matrix shape inconsistent with no. of rows: %s %s"
115 |                              % (matrix_.mat.shape, len(id2row)))
116 |
117 |         self._id2row = self.id2row + id2row
118 |         matrix_ = self._project_core_operations(matrix_)
119 |
120 |         self._cooccurrence_matrix = self._cooccurrence_matrix.vstack(matrix_)
121 |         assert_shape_consistent(self.cooccurrence_matrix, self.id2row,
122 |                                 self.id2column, self.row2id, self.column2id)
123 |
124 |     @classmethod
125 |     def build(cls, core_space, **kwargs):
126 |         """
127 |         Reads in data files and extracts the data to construct a semantic space.
128 |
129 |         If the data is read in dense format and no columns are provided,
130 |         the column indexing structures are set to empty.
131 |
132 |         Args:
133 |             data: file containing the counts
134 |             format: format of the input data file: one of sm/dm
135 |             rows: file containing the row elements. Optional, if not provided,
136 |                 extracted from the data file.
137 |             cols: file containing the column elements
138 |
139 |         Returns:
140 |             A semantic space built from the input data files.
141 |
142 |         Raises:
143 |             ValueError: if one of the data/format arguments is missing;
144 |                 if cols is missing and format is "sm";
145 |                 if the input columns provided are not consistent with
146 |                 the shape of the matrix (for "dm" format).
147 |         """
148 |
149 |         sp = Space.build(**kwargs)
150 |
151 |         mat = sp._cooccurrence_matrix
152 |         id2row = sp.id2row
153 |         row2id = sp.row2id
154 |         return PeripheralSpace(core_space, mat, id2row, row2id)
155 |
--------------------------------------------------------------------------------
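
A minimal usage sketch: build a core space, then interpret new rows in its coordinate system (the file names reuse the shipped example resources under src/examples/data/in, assuming they are dimensionally compatible, as in the toolkit's own ex05):

    from composes.semantic_space.space import Space
    from composes.semantic_space.peripheral_space import PeripheralSpace

    core = Space.build(data="src/examples/data/in/ex01.sm",
                       rows="src/examples/data/in/ex01.rows",
                       cols="src/examples/data/in/ex01.cols",
                       format="sm")

    per = PeripheralSpace.build(core,
                                data="src/examples/data/in/ex05.sm",
                                cols="src/examples/data/in/ex05.cols",
                                format="sm")
    print(per.id2row)
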
/src/composes/similarity/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/similarity/__init__.py
--------------------------------------------------------------------------------
/src/composes/similarity/cos.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Oct 2, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | """
6 | import numpy as np
7 |
8 | from composes.utils.py_matrix_utils import nonzero_invert
9 |
10 | from composes.similarity.similarity import Similarity
11 | from composes.similarity.dot_prod import DotProdSimilarity
12 |
13 |
14 | class CosSimilarity(Similarity):
15 | """
16 | Computes the cosine similarity of two vectors.
17 |
18 | :math:`sim(\\vec{u},\\vec{v}) = \\frac{<\\vec{u},\\vec{v}>}{\\sqrt{||\\vec{u}||||\\vec{v}||}}`
19 |
20 | """
21 |
22 | def _sim(self, v1, v2):
23 | if v1.norm() == 0 or v2.norm() == 0:
24 | return 0.0
25 | s = DotProdSimilarity()._sim(v1, v2) / np.double(v1.norm() * v2.norm())
26 | return s
27 |
28 | def _sims_to_matrix(self, vector, matrix_):
29 | sims = DotProdSimilarity()._sims_to_matrix(vector, matrix_)
30 |
31 | vector_norm = vector.norm()
32 | row_norms = vector_norm * matrix_.norm(1)
33 | row_norms = nonzero_invert(row_norms)
34 |
35 | return sims.scale_rows(row_norms)
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
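
A quick numpy check of the cosine formula implemented by _sim above (self-contained):

    import numpy as np

    u = np.array([1.0, 2.0, 3.0])
    v = np.array([4.0, 5.0, 6.0])
    cos = u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))
    print(cos)   # ~0.9746
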
/src/composes/similarity/dot_prod.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Oct 2, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | """
6 | from composes.similarity.similarity import Similarity
7 |
8 |
9 | class DotProdSimilarity(Similarity):
10 | """
11 | Computes the scalar product (dot product) of two vectors.
12 |
13 | :math:`sim(\\vec{u},\\vec{v}) = <\\vec{u},\\vec{v}> = \\sum_iu_iv_i`
14 |
15 | """
16 | def _sim(self, v1, v2):
17 | return v1.multiply(v2).sum()
18 |
19 | def _sims_to_matrix(self, vector, matrix_):
20 | return matrix_ * vector.transpose()
21 |
--------------------------------------------------------------------------------
/src/composes/similarity/euclidean.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Oct 2, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | """
6 | from composes.similarity.similarity import Similarity
7 |
8 |
9 | class EuclideanSimilarity(Similarity):
10 | """
11 |     Computes the Euclidean similarity of two vectors as the inverse of their
12 |     Euclidean distance.
13 |
14 | :math:`sim(\\vec{u},\\vec{v}) = \\frac{1}{||\\vec{u}-\\vec{v}|| + 1}`
15 | """
16 |
17 | def _sim(self, v1, v2):
18 | return 1 / (1 + (v1 - v2).norm())
19 |
--------------------------------------------------------------------------------
/src/composes/similarity/lin.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Oct 2, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | """
6 | import numpy as np
7 |
8 | from composes.similarity.similarity import Similarity
9 |
10 |
11 | class LinSimilarity(Similarity):
12 | """
13 | Computes the Lin similarity of two vectors.
14 |
15 | :math:`sim(\\vec{u},\\vec{v}) = \\frac{\\sum_{i \\in I}(u_i+v_i)}{\\sum_iu_i + \\sum_iv_i}`
16 |
17 | Where :math:`I=\\{i | u_i > 0 \\text{ and } v_i > 0\\}`, the set of components
18 | on which both vectors are strictly positive.
19 |
20 | """
21 |
22 | def _sim(self, v1, v2):
23 |
24 | common = v1.multiply(v2)
25 | common.to_ones()
26 | denom = v1.sum() + v2.sum()
27 |
28 | if denom == 0:
29 | return 0
30 | else:
31 | return common.multiply(v1 + v2).sum() / np.double(denom)
32 |
33 |
34 |
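# A worked instance of the definition above: only component 0 is non-zero
# in both vectors, so I = {0}.
import numpy as np
from composes.similarity.lin import LinSimilarity

u = np.array([[1.0, 2.0, 0.0]])
v = np.array([[3.0, 0.0, 4.0]])
# numerator: u_0 + v_0 = 4; denominator: sum(u) + sum(v) = 3 + 7 = 10
print LinSimilarity().get_sim(u, v)  # 0.4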
--------------------------------------------------------------------------------
/src/composes/similarity/similarity.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Oct 2, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | """
6 | import numpy as np
7 |
8 | from composes.utils.matrix_utils import (
9 | assert_is_array_or_matrix,
10 | to_compatible_matrix_types,
11 | )
12 |
13 |
14 | class Similarity(object):
15 |
16 | def get_sim(self, v1, v2):
17 |
18 | assert_is_array_or_matrix(v1)
19 | assert_is_array_or_matrix(v2)
20 |
21 | # TODO: figure out where these asserts belong!!
22 | v1, v2 = to_compatible_matrix_types(v1, v2)
23 | v1.assert_same_shape(v2)
24 |
25 | return self._sim(v1, v2)
26 |
27 | def get_sims_to_matrix(self, vector, matrix_):
28 |
29 | assert_is_array_or_matrix(vector)
30 | assert_is_array_or_matrix(matrix_)
31 |
32 | vector, matrix_ = to_compatible_matrix_types(vector, matrix_)
33 |
34 | if vector.shape[1] != matrix_.shape[1] or vector.shape[0] != 1:
35 | raise ValueError(
36 | 'Inconsistent shapes {0} and {1}'.format(vector.shape, matrix_.shape)
37 | )
38 |
39 | return self._sims_to_matrix(vector, matrix_)
40 |
41 | def _sims_to_matrix(self, vector, matrix_):
42 |
43 | result = np.zeros(shape=(matrix_.shape[0], 1))
44 | for i in range(matrix_.shape[0]):
45 | result[i] = self._sim(vector, matrix_[i, :])
46 | return type(matrix_)(result)
47 |
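# A sketch of the two public entry points: get_sim compares a pair of
# vectors, get_sims_to_matrix scores one row vector against every matrix row.
import numpy as np
from composes.similarity.cos import CosSimilarity

rows = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
query = np.array([[1.0, 1.0]])
print CosSimilarity().get_sims_to_matrix(query, rows).mat  # one score per row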
--------------------------------------------------------------------------------
/src/composes/transformation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/__init__.py
--------------------------------------------------------------------------------
/src/composes/transformation/dim_reduction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/dim_reduction/__init__.py
--------------------------------------------------------------------------------
/src/composes/transformation/dim_reduction/dimensionality_reduction.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 28, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | from composes.semantic_space.operation import DimensionalityReductionOperation
7 |
8 | class DimensionalityReduction(object):
9 |     '''
10 |     Base class for dimensionality reduction methods (e.g. SVD, NMF).
11 |     '''
12 |
13 |     _name = "dimensionality_reduction"
14 |
15 | def __init__(self, reduced_dimension):
16 | '''
17 | Constructor
18 | '''
19 | if reduced_dimension <= 0:
20 | raise ValueError("Cannot reduce to non-positive dimensionality: %d"
21 | % reduced_dimension)
22 | self._reduced_dimension = reduced_dimension
23 |
24 | def create_operation(self):
25 | return DimensionalityReductionOperation(self)
26 |
27 | def get_reduced_dimension(self):
28 | return self._reduced_dimension
29 |
30 | def get_name(self):
31 | return self._name
32 |
33 | def __str__(self):
34 | return self._name
35 |
36 | name = property(get_name)
37 | reduced_dimension = property(get_reduced_dimension)
38 |
--------------------------------------------------------------------------------
/src/composes/transformation/dim_reduction/nmf.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 1, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | import numpy as np
8 | from dimensionality_reduction import DimensionalityReduction
9 | from composes.matrix.linalg import Linalg
10 | from math import sqrt
11 |
12 | class Nmf(DimensionalityReduction):
13 | """
14 |     Performs Non-negative Matrix Factorization to a reduced dimension :math:`k`.
15 |
16 | Given an input non-negative matrix :math:`X`, it computes the decomposition:
17 |
18 | :math:`X \\approx WH` where W and H are non-negative matrices which minimize
19 | :math:`||X-WH||_{2}`
20 |
21 |     It returns the matrix :math:`W`, together with the pseudoinverse of :math:`H` (used to project peripheral spaces).
22 | """
23 |
24 | _name = "nmf"
25 |
26 | def __init__(self, reduced_dimension):
27 | '''
28 | Constructor
29 | '''
30 | super(Nmf, self).__init__(reduced_dimension)
31 |
32 | def apply(self, matrix_):
33 |
34 | matrix_.assert_positive()
35 | #w_init, h_init = self.nndsvd_init(matrix_)
36 | w_init, h_init = self.v_col_init(matrix_)
37 | #w_init, h_init = self.random_init(matrix_)
38 | w, h = Linalg.nmf(matrix_, w_init, h_init)
39 | return w, Linalg.pinv(h)
40 |
41 | def random_init(self, matrix_):
42 |
43 | # TODO: implement the fancier but still fast init (from nimfa: v_col)
44 | rndcol = np.random.random_integers(0, matrix_.shape[1] - 1,
45 | self._reduced_dimension)
46 |
47 | rndrow = np.random.random_integers(0, matrix_.shape[0] - 1,
48 | self._reduced_dimension)
49 |
50 | #otherwise we would have had to convert to DenseMatrix/SparseMatrix
51 | #type(matrix_)(result)
52 | w = matrix_[:, rndcol]
53 | h = matrix_[rndrow, :]
54 |
55 | return w, h
56 |
57 | def v_col_init(self, matrix_):
58 | w = np.zeros((matrix_.shape[0], self._reduced_dimension))
59 | h = np.zeros((self._reduced_dimension, matrix_.shape[1]))
60 |
61 |         #sample ~1/5 of the columns/rows; the +1 covers matrices with fewer than 5
62 | p_col = matrix_.shape[1]//5 + 1
63 | p_row = matrix_.shape[0]//5 + 1
64 | for i in range(self._reduced_dimension):
65 |
66 | rndcol = np.random.random_integers(0, matrix_.shape[1] - 1,
67 | p_col)
68 |
69 | rndrow = np.random.random_integers(0, matrix_.shape[0] - 1,
70 | p_row)
71 |
72 | w[:, i] = (matrix_[:, rndcol].sum(1)/float(p_col)).flatten()
73 | h[i, :] = (matrix_[rndrow, :].sum(0)/float(p_row)).flatten()
74 |
75 | w = type(matrix_)(w)
76 | h = type(matrix_)(h)
77 |
78 | return w, h
79 |
80 | def nndsvd_init(self,matrix_):
81 | def matrix_abs(mat_):
82 | mat_p = mat_.get_non_negative()
83 | mat_n_abs = mat_p - mat_
84 | return mat_p + mat_n_abs
85 |
86 | def padd_zeros(matrix_, axis, thickness):
87 | matrix_type = type(matrix_)
88 | if axis == 0:
89 | append_mat = matrix_type(np.zeros((thickness, matrix_.shape[1])))
90 | return matrix_.vstack(append_mat)
91 | elif axis == 1:
92 | append_mat = matrix_type(np.zeros((matrix_.shape[0], thickness)))
93 | return matrix_.hstack(append_mat)
94 |
95 |         u, s, v = Linalg.svd(matrix_, self._reduced_dimension)
96 |
97 | rank = u.shape[1]
98 | w = [[]]*rank
99 | h = [[]]*rank
100 |
101 | vt = v.transpose()
102 |
103 | w[0] = sqrt(s[0]) * matrix_abs(u[:,0])
104 | h[0] = sqrt(s[0]) * matrix_abs(vt[0,:])
105 |
106 | for i in range(1,rank):
107 | uu = u[:,i]
108 | vv = vt[i,:]
109 | uup = uu.get_non_negative()
110 | uun = uup - uu
111 | vvp = vv.get_non_negative()
112 | vvn = vvp - vv
113 |
114 | n_uup = uup.norm()
115 | n_uun = uun.norm()
116 | n_vvp = vvp.norm()
117 | n_vvn = vvn.norm()
118 |
119 | termp = n_uup * n_vvp; termn = n_uun * n_vvn
120 | if (termp >= termn):
121 | w[i] = sqrt(s[i] * termp) * uup / n_uup
122 | h[i] = sqrt(s[i] * termp) * vvp / n_vvp
123 | else:
124 | w[i] = sqrt(s[i] * termn) * uun / n_uun
125 | h[i] = sqrt(s[i] * termn) * vvn / n_vvn
126 |
127 | w = matrix_.nary_hstack(w)
128 | h = matrix_.nary_vstack(h)
129 |
130 |         w.remove_small_values(1e-10)
131 |         h.remove_small_values(1e-10)
132 |
133 |         if rank < self._reduced_dimension:
134 |             w = padd_zeros(w, 1, self._reduced_dimension - rank)
135 |             h = padd_zeros(h, 0, self._reduced_dimension - rank)
136 |         return w, h
137 |
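# A hedged usage sketch: like the other reductions, Nmf is normally applied
# through a space (the input co-occurrence matrix must be non-negative).
from composes.semantic_space.space import Space
from composes.transformation.dim_reduction.nmf import Nmf

space = Space.build(data="ex01.sm", rows="ex01.rows", cols="ex01.cols",
                    format="sm")
reduced = space.apply(Nmf(2))  # rows re-described by 2 latent dimensions
print reduced.cooccurrence_matrix.shape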
--------------------------------------------------------------------------------
/src/composes/transformation/dim_reduction/svd.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 28, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from dimensionality_reduction import DimensionalityReduction
8 | from composes.matrix.linalg import Linalg
9 |
10 | class Svd(DimensionalityReduction):
11 | """
12 | Performs truncated Singular Value Decomposition to a reduced dimension :math:`k`.
13 |
14 | Given an input matrix :math:`X`, it computes the decomposition:
15 |
16 | :math:`X = U \\Sigma V^{T}`
17 |
18 |     It returns :math:`U \\Sigma`, truncated to dimension :math:`min(k, rank(X))`, together with :math:`V`.
19 | """
20 |
21 | _name = "svd"
22 |
23 | def __init__(self, reduced_dimension):
24 | '''
25 | Constructor
26 | '''
27 | super(Svd, self).__init__(reduced_dimension)
28 |
29 | def apply(self, matrix_):
30 |
31 | u, s, v = Linalg.svd(matrix_, self._reduced_dimension)
32 | return u.scale_columns(s), v
33 |
34 |
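# A hedged usage sketch: reduce the toy space to (at most) 2 latent dimensions.
from composes.semantic_space.space import Space
from composes.transformation.dim_reduction.svd import Svd

space = Space.build(data="ex01.sm", rows="ex01.rows", cols="ex01.cols",
                    format="sm")
reduced = space.apply(Svd(2))  # rows become U * Sigma, truncated as above
print reduced.cooccurrence_matrix.shape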
--------------------------------------------------------------------------------
/src/composes/transformation/feature_selection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/feature_selection/__init__.py
--------------------------------------------------------------------------------
/src/composes/transformation/feature_selection/feature_selection.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 5, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | from composes.semantic_space.operation import FeatureSelectionOperation
7 |
8 | class FeatureSelection(object):
9 |     '''
10 |     Base class for feature selection methods.
11 |     '''
12 |
13 |
14 | def __init__(self, reduced_dimension):
15 |
16 | if reduced_dimension <= 0:
17 | raise ValueError("Cannot reduce to non-positive dimensionality: %d"
18 | % reduced_dimension)
19 | self._reduced_dimension = reduced_dimension
20 |
21 | def create_operation(self):
22 | return FeatureSelectionOperation(self)
23 |
24 | def get_reduced_dimension(self):
25 | return self._reduced_dimension
26 |
27 | reduced_dimension = property(get_reduced_dimension)
--------------------------------------------------------------------------------
/src/composes/transformation/feature_selection/top_feature_selection.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 5, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | from warnings import warn
7 | from feature_selection import FeatureSelection
8 |
9 | class TopFeatureSelection(FeatureSelection):
10 | """
11 | Sorts the columns of a space according to some criterion and returns a space
12 | containing only the top :math:`k` ones.
13 |
14 | Available criteria:
15 |
16 |     sum: Default. Ranks columns according to the sum of their elements.
17 |
18 | length: Ranks columns according to their vector length.
19 |
20 | """
21 |
22 | _name = "top_feature_selection"
23 | _valid_criteria = {"sum", "length"}
24 |
25 | def __init__(self, reduced_dimension, criterion='sum'):
26 | '''
27 | Constructor
28 | '''
29 | super(TopFeatureSelection, self).__init__(reduced_dimension)
30 |
31 |         if criterion not in self._valid_criteria:
32 |             raise ValueError("Unrecognized criterion: %s" % criterion)
33 |
34 |         self.criterion = criterion
35 |
36 | def apply(self, matrix_):
37 |
38 | if self.criterion == "sum":
39 | norm_function = matrix_.sum
40 | else:
41 | norm_function = matrix_.norm
42 |
43 | if self._reduced_dimension >= matrix_.shape[1]:
44 |             warn("Reduced dimension is at least the number of columns; no columns are removed!")
45 |
46 | no_columns = min(self._reduced_dimension, matrix_.shape[1])
47 | sorted_perm = matrix_.sorted_permutation(norm_function, 0)
48 |
49 | sorted_perm = sorted_perm[0:no_columns]
50 | matrix_ = matrix_[:, sorted_perm]
51 |
52 | return matrix_, sorted_perm
53 |
54 |
55 |
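# A hedged usage sketch: keep only the two highest-mass columns of a space.
from composes.semantic_space.space import Space
from composes.transformation.feature_selection.top_feature_selection \
    import TopFeatureSelection

space = Space.build(data="ex01.sm", rows="ex01.rows", cols="ex01.cols",
                    format="sm")
top2 = space.apply(TopFeatureSelection(2, criterion="sum"))
print top2.id2column  # the two columns with the largest element sums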
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/scaling/__init__.py
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/epmi_weighting.py:
--------------------------------------------------------------------------------
1 |
2 | from scaling import Scaling
3 | from composes.utils.py_matrix_utils import nonzero_invert
4 |
5 | class EpmiWeighting(Scaling):
6 | """
7 | Exponential Point-wise Mutual Information.
8 |
9 | :math:`epmi(r,c) = \\frac{P(r,c)}{P(r)P(c)}`
10 |
11 | """
12 |
13 | _name = 'epmi'
14 | _uses_column_stats = True
15 |
16 | def apply(self, matrix_, column_marginal=None):
17 | """
18 | Performs epmi weighting.
19 |
20 | Args:
21 | matrix_ (Matrix): Input matrix
22 |
23 | column_marginal (np.ndarray): column marginals of the
24 | core matrix if the matrix is a peripheral matrix
25 |
26 | Returns:
27 | Matrix: the matrix after applying epmi.
28 |
29 | """
30 |
31 |         matrix_.assert_positive()
32 |         row_sum = matrix_.sum(axis=1)
33 |
34 |         if column_marginal is not None:
35 |             col_sum = column_marginal
36 |         else:
37 |             col_sum = matrix_.sum(axis=0)
38 |
39 | total = col_sum.sum()
40 |
41 | row_sum = nonzero_invert(row_sum)
42 | col_sum = nonzero_invert(col_sum)
43 | col_sum = col_sum * total
44 |
45 | matrix_ = matrix_.scale_rows(row_sum)
46 | matrix_ = matrix_.scale_columns(col_sum)
47 |
48 | return matrix_
49 |
50 | def get_column_stats(self, matrix_):
51 | return matrix_.sum(0)
52 |
53 |
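# A worked instance of the formula above, applied directly to a matrix:
# with counts [[5,1,0],[3,0,6]] the grand total is 15, the first row sums
# to 6 and the first column to 8, so
# epmi(0,0) = (5/15) / ((6/15)*(8/15)) = 75/48 = 1.5625.
import numpy as np
from composes.matrix.dense_matrix import DenseMatrix
from composes.transformation.scaling.epmi_weighting import EpmiWeighting

counts = DenseMatrix(np.array([[5.0, 1.0, 0.0],
                               [3.0, 0.0, 6.0]]))
print EpmiWeighting().apply(counts).mat  # entry (0,0) is 1.5625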
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/normalization.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 4, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | from numpy import double
7 | from warnings import warn
8 | from scaling import Scaling
9 |
10 | class Normalization(Scaling):
11 | """
12 |     Normalizes a space according to some criterion.
13 |
14 | Available criteria:
15 |
16 | sum: Default. The result matrix :math:`X` will satisfy: :math:`\\sum_{i,j} X_{ij}=1`
17 |
18 | length: The result matrix :math:`X` will satisfy: :math:`\\sqrt{\\sum_{i,j} X_{ij}^2}=1`
19 |
20 | """
21 |     _name = "normalization"
22 | _valid_criteria = ["sum", "length"]
23 | _uses_column_stats = True
24 |
25 | def __init__(self, criterion='sum'):
26 | '''
27 | Constructor
28 | '''
29 |         if criterion not in self._valid_criteria:
30 |             raise ValueError("Unrecognized criterion: %s" % criterion)
31 |
32 |         self.criterion = criterion
33 |
34 |
35 | def apply(self, matrix_, total=None):
36 |
37 | if total is None:
38 | if self.criterion == "length":
39 | total = matrix_.norm()
40 | else:
41 | total = matrix_.sum()
42 |
43 | if total == 0:
44 | warn("Could not normalize: sum/length of matrix is 0.")
45 | return matrix_
46 |
47 | matrix_ = (1 / double(total)) * matrix_
48 | return matrix_
49 |
50 | def get_column_stats(self, matrix_):
51 |
52 | if self.criterion == "length":
53 | return matrix_.norm()
54 | else:
55 | return matrix_.sum()
56 |
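# A hedged usage sketch: after sum-normalization all entries together sum to 1.
from composes.semantic_space.space import Space
from composes.transformation.scaling.normalization import Normalization

space = Space.build(data="ex01.sm", rows="ex01.rows", cols="ex01.cols",
                    format="sm")
normed = space.apply(Normalization(criterion="sum"))
print normed.cooccurrence_matrix.sum()  # 1.0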
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/plmi_weighting.py:
--------------------------------------------------------------------------------
1 |
2 | from scaling import Scaling
3 | from ppmi_weighting import PpmiWeighting
4 |
5 | class PlmiWeighting(Scaling):
6 | """
7 | Positive Local Mutual Information.
8 |
9 |     :math:`plmi(r,c) = ppmi(r,c) \\cdot count(r,c)`
10 |
11 | """
12 |
13 | _name = "plmi"
14 | _uses_column_stats = True
15 |
16 | def apply(self, matrix_, column_marginal=None):
17 | return matrix_.multiply(PpmiWeighting().apply(matrix_,
18 | column_marginal))
19 |
20 |
21 | def get_column_stats(self, matrix_):
22 | return matrix_.sum(0)
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/plog_weighting.py:
--------------------------------------------------------------------------------
1 |
2 | from scaling import Scaling
3 |
4 | class PlogWeighting(Scaling):
5 | """
6 | Positive Log Weighting
7 |
8 |     :math:`plog(r,c) = \\log(count(r,c)) \\text{ if } \\log(count(r,c)) \\geq 0 \\text{ else } 0`
9 | """
10 |
11 | _name = "plog"
12 |
13 | def apply(self, matrix_):
14 | '''
15 | Performs positive log weighting.
16 |
17 |         Args:
18 |             matrix_ (Matrix): Input matrix
19 |
20 |
21 | Returns:
22 | Matrix: the matrix after applying plog
23 |
24 | '''
25 | matrix_ = matrix_.copy()
26 | matrix_.plog()
27 | return matrix_
28 |
29 |
30 |
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/ppmi_weighting.py:
--------------------------------------------------------------------------------
1 |
2 | from scaling import Scaling
3 | from epmi_weighting import EpmiWeighting
4 |
5 | class PpmiWeighting(Scaling):
6 | """
7 | Positive Point-wise Mutual Information.
8 |
9 |
10 | :math:`pmi(r,c) = log\\frac{P(r,c)}{P(r)P(c)}`
11 |
12 | :math:`ppmi(r,c)= pmi(r,c) \\text{ if } pmi(r,c)\\geq 0 \\text{ else } 0`
13 | """
14 |
15 | _name = "ppmi"
16 | _uses_column_stats = True
17 |
18 | def apply(self, matrix_, column_marginal=None):
19 |
20 | matrix_ = EpmiWeighting().apply(matrix_, column_marginal)
21 | matrix_.plog()
22 | return matrix_
23 |
24 | def get_column_stats(self, matrix_):
25 | return matrix_.sum(0)
26 |
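# A hedged sketch: ppmi is epmi followed by plog, so epmi values below 1
# are zeroed and the rest are logged. Continuing the epmi example,
# ppmi(0,0) = log(1.5625) ~ 0.45 (natural log).
import numpy as np
from composes.matrix.dense_matrix import DenseMatrix
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

counts = DenseMatrix(np.array([[5.0, 1.0, 0.0],
                               [3.0, 0.0, 6.0]]))
print PpmiWeighting().apply(counts).mat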
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/row_normalization.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 4, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from scaling import Scaling
8 | from composes.utils.py_matrix_utils import nonzero_invert
9 |
10 | class RowNormalization(Scaling):
11 | """
12 |     Normalizes the rows of a space according to some criterion.
13 |
14 | Available criteria:
15 |
16 | length: Default. Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sqrt{\\sum_j X_{ij}^2}=1`
17 |
18 |
19 | sum: Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sum_j X_{ij}=1`
20 |
21 | """
22 | _name = "row_normalization"
23 | _valid_criteria = ["sum", "length"]
24 |
25 | def __init__(self, criterion='length'):
26 | '''
27 | Constructor
28 | '''
29 |         if criterion not in self._valid_criteria:
30 |             raise ValueError("Unrecognized criterion: %s" % criterion)
31 |
32 |         self.criterion = criterion
33 |
34 |
35 | def apply(self, matrix_):
36 |
37 | if self.criterion == "length":
38 | row_norms = matrix_.norm(axis=1)
39 | else:
40 | row_norms = matrix_.sum(axis=1)
41 |
42 | inv_row_norm = nonzero_invert(row_norms)
43 | matrix_ = matrix_.scale_rows(inv_row_norm)
44 | return matrix_
45 |
46 |
47 |
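# A hedged usage sketch: with the default "length" criterion every row of
# the result has unit Euclidean norm.
from composes.semantic_space.space import Space
from composes.transformation.scaling.row_normalization import RowNormalization

space = Space.build(data="ex01.sm", rows="ex01.rows", cols="ex01.cols",
                    format="sm")
normed = space.apply(RowNormalization())
print normed.cooccurrence_matrix.norm(axis=1)  # each row norm is 1.0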
--------------------------------------------------------------------------------
/src/composes/transformation/scaling/scaling.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 20, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from composes.semantic_space.operation import ScalingOperation
8 |
9 | class Scaling(object):
10 |     '''
11 |     Base class for matrix scaling (weighting) schemes.
12 |     '''
13 |     _name = "scaling"
14 | _uses_column_stats = False
15 |
16 | def get_name(self):
17 | return self._name
18 |
19 | def get_uses_column_stats(self):
20 | return self._uses_column_stats
21 |
22 | def create_operation(self):
23 | return ScalingOperation(self)
24 |
25 | def __str__(self):
26 | return self._name
27 |
28 | name = property(get_name)
29 | uses_column_stats = property(get_uses_column_stats)
--------------------------------------------------------------------------------
/src/composes/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/utils/__init__.py
--------------------------------------------------------------------------------
/src/composes/utils/crossvalidation_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 9, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from random import shuffle
8 |
9 | def get_split_indices(range_len, fold):
10 |
11 | if fold <= 0:
12 |         raise ValueError("Strictly positive number of folds required, received: %s"
13 |                          % fold)
14 |
15 | indices_list = []
16 | if range_len < fold:
17 | return get_split_indices(range_len, range_len)
18 |
19 | range_ = range(range_len)
20 | shuffle(range_)
21 | current_index = 0
22 | for i in range(fold):
23 | if i < len(range_)%fold:
24 | slice_length = range_len // fold + 1
25 | else:
26 | slice_length = range_len // fold
27 |
28 | indices_list.append(range_[current_index:current_index + slice_length])
29 | current_index += slice_length
30 |
31 | return indices_list
32 |
33 | def get_submatrix_list(matrix_, indices_list):
34 | return [matrix_[indices, :] for indices in indices_list]
35 |
36 |
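# A quick check of the fold arithmetic: 10 indices split into 3 shuffled
# folds of sizes 4/3/3 (the first 10 % 3 folds get the extra element).
from composes.utils.crossvalidation_utils import get_split_indices

folds = get_split_indices(10, 3)
print [len(f) for f in folds]  # [4, 3, 3]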
--------------------------------------------------------------------------------
/src/composes/utils/gen_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on May 21, 2013
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | from composes.exception.invalid_argument_error import InvalidArgumentError
7 |
8 |
9 | def assert_is_instance(object_, class_):
10 | if not isinstance(object_, class_):
11 | raise TypeError("expected %s, received %s" % (class_, type(object_)))
12 |
13 |
14 | def get_partitions(sorted_list, min_samples):
15 | prev_idx = 0
16 | range_list = []
17 | for i in range(1, len(sorted_list)):
18 | if sorted_list[i] != sorted_list[i - 1]:
19 | if i - prev_idx >= min_samples:
20 | range_list.append((prev_idx, i))
21 |
22 | prev_idx = i
23 |
24 | if len(sorted_list) - prev_idx >= min_samples:
25 | range_list.append((prev_idx, len(sorted_list)))
26 |
27 |     keys = [sorted_list[start] for (start, _) in range_list]
28 |
29 | return keys, range_list
--------------------------------------------------------------------------------
/src/composes/utils/log_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 15, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from numpy import double
8 | import logging
9 | from composes.utils.io_utils import create_parent_directories
10 |
11 | def config_logging(file_name, level=logging.INFO, format_=""):
12 |     if file_name is not None:
13 | create_parent_directories(file_name)
14 | logging.basicConfig(filename=file_name, level=level, format=format_)
15 | logging.debug("start logging")
16 |
17 |
18 | def get_ident(delim, ident_level):
19 | return delim * ident_level
20 |
21 | def print_matrix_info(logger_, matrix_, ident_level, intro_string):
22 | delim = " "
23 | ident = get_ident(delim, ident_level)
24 | logger_string = ident + intro_string
25 | ident = ident + delim
26 |
27 | logger_string += ("\n%sMatrix type:%s" % (ident, type(matrix_).__name__))
28 | logger_string += ("\n%sMatrix shape:%sx%s" % (ident, matrix_.shape[0],
29 | matrix_.shape[1]))
30 |
31 | if type(matrix_).__name__ == "SparseMatrix":
32 | perc_nnz = 100 * matrix_.mat.nnz/double(matrix_.shape[0]*matrix_.shape[1])
33 | logger_string += ("\n%sPerc. non-zero entries:%d" % (ident, perc_nnz))
34 |
35 | logger_.info(logger_string)
36 |
37 |
38 | def get_learner_info(learner, ident):
39 | logger_string = ""
40 |
41 | if hasattr(learner, '_intercept'):
42 | logger_string += ("\n%sUsing intercept:%s" % (ident, learner._intercept))
43 |
44 | if hasattr(learner, '_crossvalidation'):
45 | logger_string += ("\n%sUsing crossvalidation:%s" % (ident, learner._crossvalidation))
46 |
47 |     if getattr(learner, '_crossvalidation', False) and hasattr(learner, '_folds'):
48 | logger_string += ("\n%sUsing number of folds:%s" % (ident, learner._folds))
49 |
50 | return logger_string
51 |
52 | def print_composition_model_info(logger_, model, ident_level, intro_string):
53 |
54 | delim = " "
55 | ident = get_ident(delim, ident_level)
56 | logger_string = ident + intro_string
57 | ident = ident + delim
58 |
59 | logger_.info(logger_string)
60 |
61 | print_name(logger_, model, ident_level, "Composition model type:")
62 |
63 | logger_string = ""
64 | if hasattr(model, '_regression_learner'):
65 | logger_string += ("\n%sUsing regression:%s" % (ident,
66 | type(model.regression_learner).__name__))
67 | logger_string += get_learner_info(model.regression_learner, ident + delim)
68 |
69 | logger_.info(logger_string)
70 |
71 | def print_transformation_info(logger_, trans, ident_level, intro_string):
72 | delim = " "
73 | ident = get_ident(delim, ident_level)
74 | logger_string = ident + intro_string
75 | ident = ident + delim
76 |
77 | logger_string += ("\n%sTransformation type:%s" % (ident, type(trans).__name__))
78 |
79 | if hasattr(trans, '_reduced_dimension'):
80 | logger_string += ("\n%sReduced dimension:%s" % (ident, trans.reduced_dimension))
81 |
82 |
83 | logger_.info(logger_string)
84 |
85 | def print_info(logger_, ident_level, text):
86 | delim = " "
87 | ident = get_ident(delim, ident_level)
88 | logger_string = ident + ""
89 |
90 | logger_string += "\n%s%s" % (ident, text)
91 | logger_.info(logger_string)
92 |
93 | def print_name(logger_, object_, ident_level, intro_string):
94 | delim = " "
95 | ident = get_ident(delim, ident_level)
96 | logger_string = ident + intro_string
97 | ident = ident + delim
98 |
99 | logger_string += ("\n%s%s" % (ident, type(object_).__name__))
100 |
101 | logger_.info(logger_string)
102 |
103 | def print_time_info(logger_, end, beg, ident_level):
104 | delim = " "
105 | ident = get_ident(delim, ident_level)
106 | logger_string = ident
107 | logger_string += ("\n%sTiming:%s seconds" % (ident, end - beg))
108 |
109 | logger_.info(logger_string)
110 |
111 |
--------------------------------------------------------------------------------
/src/composes/utils/matrix_utils.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | from composes.matrix.sparse_matrix import SparseMatrix
4 | from composes.matrix.dense_matrix import DenseMatrix
5 | from composes.matrix.matrix import Matrix
6 | from scipy.sparse import issparse
7 | from py_matrix_utils import is_array
8 | from warnings import warn
9 |
10 | def to_matrix(matrix_):
11 | """
12 | Converts an array-like structure to a DenseMatrix/SparseMatrix
13 | """
14 | if issparse(matrix_):
15 | return SparseMatrix(matrix_)
16 | else:
17 | return DenseMatrix(matrix_)
18 |
19 | def is_array_or_matrix(data):
20 | return is_array(data) or isinstance(data, Matrix)
21 |
22 |
23 | def assert_is_array_or_matrix(data):
24 | if not is_array_or_matrix(data):
25 | raise TypeError("expected array-like or matrix, received %s"
26 | % (type(data)))
27 |
28 | def padd_matrix(matrix_, axis, value=1):
29 | matrix_type = type(matrix_)
30 | if axis == 0:
31 | append_mat = matrix_type(np.ones((1, matrix_.shape[1]))*value)
32 | return matrix_.vstack(append_mat)
33 | elif axis == 1:
34 | append_mat = matrix_type(np.ones((matrix_.shape[0], 1))*value)
35 | return matrix_.hstack(append_mat)
36 | else:
37 | raise ValueError("Invalid axis value:%s" % axis)
38 |
39 |
40 | def assert_same_shape(matrix1, matrix2, axis=None):
41 |
42 | if axis is None:
43 |         if matrix1.shape != matrix2.shape:
44 |             raise ValueError("Inconsistent shapes: %s %s" % (matrix1.shape, matrix2.shape))
45 |     else:
46 |         if axis not in (0, 1):
47 |             raise ValueError("Invalid axis value: %s, expected 0 or 1." % axis)
48 |         if matrix1.shape[axis] != matrix2.shape[axis]:
49 |             raise ValueError("Inconsistent shapes on axis %s: %s %s" % (axis, matrix1.shape, matrix2.shape))
50 |
51 |
52 | def to_compatible_matrix_types(v1, v2):
53 |
54 | if isinstance(v1, Matrix) and isinstance(v2, Matrix):
55 | v2 = type(v1)(v2)
56 | elif not isinstance(v1, Matrix) and isinstance(v2, Matrix):
57 | v1 = type(v2)(v1)
58 | elif not isinstance(v2, Matrix) and isinstance(v1, Matrix):
59 | v2 = type(v1)(v2)
60 | else:
61 | v1 = to_matrix(v1)
62 | v2 = type(v1)(v2)
63 |
64 | return v1, v2
65 |
66 |
67 |
68 | def get_type_of_largest(matrix_list):
69 | max_dim = 0
70 | max_type = None
71 | for matrix_ in matrix_list:
72 | if matrix_.shape[0] * matrix_.shape[1] > max_dim:
73 | max_type = type(matrix_)
74 | max_dim = matrix_.shape[0] * matrix_.shape[1]
75 |
76 | return max_type
77 |
78 | def resolve_type_conflict(matrix_list, matrix_type):
79 | new_matrix_list = []
80 |
81 | if matrix_type_conflict(matrix_list):
82 | warn("Efficiency warning: matrices should have the same dense/sparse type!")
83 | for matrix_ in matrix_list:
84 | new_matrix_list.append(matrix_type(matrix_))
85 | return new_matrix_list
86 |
87 | return list(matrix_list)
88 |
89 |
90 | def matrix_type_conflict(matrix_list):
91 |
92 | if not matrix_list:
93 | return False
94 |
95 | matrix_type = type(matrix_list[0])
96 | for matrix_ in matrix_list:
97 | if not isinstance(matrix_, matrix_type):
98 | return True
99 |
100 | return False
101 |
102 |
103 |
104 |
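# A hedged sketch of padd_matrix, which appends a constant row (axis 0) or
# column (axis 1), e.g. to make room for intercept terms:
import numpy as np
from composes.matrix.dense_matrix import DenseMatrix
from composes.utils.matrix_utils import padd_matrix

m = DenseMatrix(np.array([[1.0, 2.0]]))
print padd_matrix(m, 1).mat  # [[ 1.  2.  1.]]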
--------------------------------------------------------------------------------
/src/composes/utils/mem_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 21, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | """
8 | Wrappers around psutil functions that display memory usage information.
9 | """
10 | import numpy as np
11 | from os import getpid
12 | import psutil
13 |
14 | def get_mem_usage():
15 | p = psutil.Process(getpid())
16 | return p.get_memory_info()[0]/np.double(1024*1024)
--------------------------------------------------------------------------------
/src/composes/utils/num_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 18, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | from numbers import Number
8 | from numbers import Integral
9 | import numpy as np
10 |
11 | def is_numeric(operand):
12 | return isinstance(operand, (Number, np.number))
13 |
14 | def is_integer(operand):
15 | return isinstance(operand, Integral)
16 |
--------------------------------------------------------------------------------
/src/composes/utils/py_matrix_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 19, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import numpy as np
7 | from scipy.sparse import spdiags
8 |
9 |
10 | def array_to_csr_diagonal(array_):
11 | #array_ can't be a sparse matrix, if it is dense, it has to be a row matrix
12 | #(i.e. shape = (1, x))
13 |
14 | flat_array = array_.flatten()
15 | array_size = flat_array.size
16 |     csr_diag = spdiags(flat_array, [0], array_size, array_size, format='csr')
17 | return csr_diag
18 |
19 | def is_array(operand):
20 | return hasattr(operand, 'dtype') and hasattr(operand, 'shape')
21 |
22 |
23 | def nonzero_invert(matrix_):
24 |     '''
25 |     Computes 1/x for every non-zero element x of the matrix; zeros are left as 0.
26 |
27 |     Args:
28 |         matrix_: np.matrix
29 |     '''
30 |
31 | matrix_ = matrix_.astype(np.double)
32 | matrix_[matrix_ != 0] = np.array(1.0/matrix_[matrix_ != 0]).flatten()
33 | return matrix_
34 |
35 |
36 |
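# A quick check of nonzero_invert: zeros stay zero, every other element is
# replaced by its reciprocal (on a copy cast to double).
import numpy as np
from composes.utils.py_matrix_utils import nonzero_invert

print nonzero_invert(np.matrix([[0.0, 2.0, 4.0]]))  # [[ 0.    0.5   0.25]]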
--------------------------------------------------------------------------------
/src/composes/utils/regression_learner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from composes.matrix.linalg import Linalg
3 |
4 |
5 | class RegressionLearner(object):
6 | """
7 | Implements a set of regression methods.
8 |
9 | Supported regression methods are least squares regression and
10 | ridge regression. Ridge regression can be used with generalized
11 | cross validation. (Hastie, Tibshirani and Friedman, Second edition,
12 | page 244)
13 | """
14 |
15 |
16 | def __init__(self):
17 | '''
18 | Constructor
19 | '''
20 |
21 | def has_intercept(self):
22 | return self._intercept
23 |
24 |
25 | class LstsqRegressionLearner(RegressionLearner):
26 | """
27 | This class performs Least Squares Regression.
28 |
29 | It finds the matrix X which solves:
30 |
31 | :math:`X = argmin(||AX - B||_2)`
32 |
33 | It can be used with intercept or without (by default intercept=True).
34 |
35 | """
36 |
37 | def __init__(self, intercept=True):
38 | self._intercept = intercept
39 |
40 | def train(self, matrix_a, matrix_b):
41 | return Linalg.lstsq_regression(matrix_a, matrix_b, self._intercept)
42 |
43 |
44 | class RidgeRegressionLearner(RegressionLearner):
45 | """
46 | This class performs Ridge Regression.
47 |
48 | It finds the matrix X which solves:
49 |
50 |     :math:`X = argmin(||AX - B||_2^2 + \\lambda||X||_2^2)`
51 |
52 | It can be used with intercept or without (by default intercept=True).
53 | Cross validation can be used with default :math:`\\lambda` range of
54 |     :math:`linspace(0, 5, 11)`. By default, generalized cross validation is performed.
55 |     If cross validation is set to False, a :math:`\\lambda` value must be provided.
56 |
57 | """
58 |
59 | def __init__(self, intercept=True, param_range=None, crossvalidation=True, param=None):
60 | self._intercept = intercept
61 | self._param_range = param_range if param_range is not None else np.linspace(0.0, 5, 11)
62 |
63 | self._param = param
64 | self._crossvalidation = crossvalidation
65 |
66 |         if param is not None:
67 |             self._crossvalidation = False
68 |             self._param = param
69 |
70 | if not self._crossvalidation and self._param is None:
71 | raise ValueError("Cannot run (no-crossvalidation) RidgeRegression with no lambda value!")
72 |
73 |
74 | def train(self, matrix_a, matrix_b):
75 | """
76 | If cross validation is set to True, it performs generalized
77 | cross validation. (Hastie, Tibshirani and Friedman, Second edition,
78 | page 244).
79 | """
80 |
81 | if not self._crossvalidation:
82 | return Linalg.ridge_regression(matrix_a, matrix_b, self._param,
83 | self._intercept)[0]
84 |
85 | else:
86 | min_err_param = 0
87 | min_err = np.Inf
88 | gcv_err = np.Inf
89 |
90 | N = matrix_a.shape[0]
91 | for param in self._param_range:
92 |
93 | mat_x, S_trace, err1 = Linalg.ridge_regression(matrix_a, matrix_b, param,
94 | self._intercept)
95 |
96 | nom = pow(1 - S_trace / N, 2) * N
97 | if nom != 0:
98 | gcv_err = (err1 * err1) / nom
99 |
100 | if gcv_err < min_err:
101 | min_err = gcv_err
102 | min_err_param = param
103 |
104 | #print "lambda:", min_err_param
105 | return Linalg.ridge_regression(matrix_a, matrix_b, min_err_param,
106 | self._intercept)[0]
107 |
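# A hedged usage sketch: fit a 2-dimensional linear map with least squares.
# With intercept=True a bias term is estimated too, so (assuming the
# intercept handling in Linalg pads an extra input column) X has one more
# row than A has columns.
import numpy as np
from composes.matrix.dense_matrix import DenseMatrix
from composes.utils.regression_learner import LstsqRegressionLearner

A = DenseMatrix(np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]))
B = DenseMatrix(np.array([[2.0, 0.0], [0.0, 2.0], [2.0, 2.0]]))
X = LstsqRegressionLearner(intercept=True).train(A, B)
print X.shape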
--------------------------------------------------------------------------------
/src/composes/utils/scoring_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 17, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | import numpy as np
8 | from scipy import stats
9 |
10 |
11 | def score(gold, prediction, method):
12 | if len(gold) != len(prediction):
13 | raise ValueError("The two arrays must have the same length!")
14 |
15 | gold = np.array(gold, dtype=np.double)
16 | prediction = np.array(prediction, dtype=np.double)
17 |
18 | if method == "pearson":
19 | return pearson(gold, prediction)[0]
20 | elif method == "spearman":
21 | return spearman(gold, prediction)[0]
22 | elif method == "auc":
23 | return auc(gold, prediction)
24 | else:
25 |         raise NotImplementedError("Unknown scoring measure: %s" % method)
26 |
27 | def pearson(gold, prediction):
28 | return stats.pearsonr(gold, prediction)
29 |
30 | def spearman(gold, prediction):
31 | return stats.spearmanr(gold, prediction, None)
32 |
33 | def auc(gold, prediction):
34 |
35 | positive = float(gold[gold == 1].size)
36 | negative = float(gold.size - positive)
37 |
38 | total_count = gold.size
39 |     point_set = np.empty(total_count, dtype=[('gold', float), ('score', float)])
40 |     for i in range(total_count):
41 |         if gold[i] not in (0, 1):
42 |             raise ValueError("For evaluating AUC, gold scores are required to be 0 or 1.")
43 |         point_set[i] = (gold[i], prediction[i])
44 |
45 |     point_set.sort(order='score')
46 |
47 | xi = 1.0
48 | yi = 1.0
49 | xi_old = 1.0
50 | true_positive = positive
51 | false_positive = negative
52 | auc = 0
53 |
54 | for i in range(total_count):
55 | if (point_set[i][0] == 1):
56 | true_positive -= 1
57 | yi = true_positive / positive
58 | else:
59 | false_positive -= 1
60 | xi = false_positive / negative
61 | auc += (xi_old - xi) * yi
62 | xi_old = xi
63 |
64 | return auc
65 |
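# A quick check of score(): any strictly monotone prediction gets a perfect
# rank correlation, regardless of the absolute values.
from composes.utils.scoring_utils import score

gold = [1.0, 2.0, 3.0, 4.0]
prediction = [10.0, 20.0, 30.0, 40.0]
print score(gold, prediction, "spearman")  # 1.0
print score(gold, prediction, "pearson")   # 1.0 (also perfectly linear)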
--------------------------------------------------------------------------------
/src/composes/utils/space_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 26, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 |
8 | def list2dict(list_):
9 | return_dict = {}
10 |
11 | for idx, word in enumerate(list_):
12 | if word in return_dict:
13 | raise ValueError("duplicate string found in list: %s" % (word))
14 | return_dict[word] = idx
15 |
16 | return return_dict
17 |
18 | def add_items_to_dict(dict_, list_):
19 |
20 | no_els = len(dict_)
21 | for idx, el in enumerate(list_):
22 | if el in dict_:
23 |             raise ValueError("Found duplicate keys when appending elements "
24 |                              "to dictionary.")
25 | dict_[el] = no_els + idx
26 | return dict_
27 |
28 | def assert_dict_match_list(dict_, list_):
29 |
30 | match_err = ValueError("expected matching dictionary and list structures.")
31 |
32 | if not len(list_) == len(dict_):
33 | raise match_err
34 | for (k, v) in dict_.iteritems():
35 | if not list_[v] == k:
36 | raise match_err
37 |
38 |
39 | def assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id):
40 |
41 | no_rows = matrix_.mat.shape[0]
42 | no_cols = matrix_.mat.shape[1]
43 |
44 | has_column_maps = column2id or id2column
45 |
46 | if not no_rows == len(id2row) or not no_rows == len(row2id):
47 | raise ValueError("expected consistent shapes: %d %d %d"
48 | % (no_rows, len(id2row), len(row2id)))
49 |
50 | if (has_column_maps and
51 | (not no_cols == len(id2column) or not no_cols == len(column2id))):
52 | raise ValueError("expected consistent shapes: %d %d %d"
53 | % (no_cols, len(id2column), len(column2id)))
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/__init__.py
--------------------------------------------------------------------------------
/src/examples/cmd_ex01.sh:
--------------------------------------------------------------------------------
1 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm -o ../examples/data/out/
2 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm --output_format dm -w ppmi,plog -r svd_2 -n none,row -o ../examples/data/out/ -l ../examples/data/out/ex01.log
3 | #or
4 | python2.7 build_core_space.py ../examples/data/in/config1.cfg
5 | python2.7 build_core_space.py ../examples/data/in/config2.cfg
6 |
--------------------------------------------------------------------------------
/src/examples/cmd_ex02.sh:
--------------------------------------------------------------------------------
1 | python2.7 build_peripheral_space.py -i ../examples/data/in/ex05 --input_format sm -o ../examples/data/out/ -c ../examples/data/out/CORE_SS.ex01.ppmi.svd_2.pkl
2 |
--------------------------------------------------------------------------------
/src/examples/cmd_ex03.sh:
--------------------------------------------------------------------------------
1 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs1.txt -c 1,2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos,euclidean
2 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs2.txt -c 1,2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos,euclidean
3 |
--------------------------------------------------------------------------------
/src/examples/cmd_ex04.sh:
--------------------------------------------------------------------------------
1 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos
2 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos
--------------------------------------------------------------------------------
/src/examples/cmd_ex05.sh:
--------------------------------------------------------------------------------
1 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m dilation --lambda 2 -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm
2 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m mult -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm
3 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm
4 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp2.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ --output_format dm
5 |
--------------------------------------------------------------------------------
/src/examples/cmd_ex06.sh:
--------------------------------------------------------------------------------
1 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True
2 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -r ridge --lambda 0.0 -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True
--------------------------------------------------------------------------------
/src/examples/cmd_ex07.sh:
--------------------------------------------------------------------------------
1 | python2.7 evaluate_similarities.py -i ../examples/data/in/sim_data.txt -c 3,5 -m pearson,spearman
2 | python2.7 evaluate_similarities.py --in_dir ../examples/data/in/ --filter sim_data -c 3,5 -m pearson,spearman
3 |
--------------------------------------------------------------------------------
/src/examples/data/in/config1.cfg:
--------------------------------------------------------------------------------
1 | [build_core_space]
2 |
3 | #input file
4 | input=../examples/data/in/ex01
5 |
6 | # output directory
7 | output=../examples/data/out/
8 |
9 | # input format
10 | input_format=sm
11 |
12 |
--------------------------------------------------------------------------------
/src/examples/data/in/config2.cfg:
--------------------------------------------------------------------------------
1 | [build_core_space]
2 |
3 | #input file
4 | input=../examples/data/in/ex01
5 |
6 | # output directory
7 | output=../examples/data/out/
8 |
9 | # input format
10 | input_format=sm
11 |
12 | # weighting schemes
13 | weighting=ppmi,plog
14 |
15 | # reductions
16 | reduction=svd_2
17 |
18 | # normalizations
19 | normalization=none,row
20 |
21 | # additional output format
22 | output_format=dm
23 |
24 | # log file
25 | log=../examples/data/out/ex01.log
--------------------------------------------------------------------------------
/src/examples/data/in/data_to_comp.txt:
--------------------------------------------------------------------------------
1 | book book book__book
2 | car book car__book
3 | car car car__car
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/data_to_comp2.txt:
--------------------------------------------------------------------------------
1 | book history_book book__history_book
2 | car sports_car car__sports_car
3 | book sports_car book__sports_book
4 |
5 |
6 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex01.cols:
--------------------------------------------------------------------------------
1 | red
2 | blue
3 | readable
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex01.rows:
--------------------------------------------------------------------------------
1 | car
2 | book
3 |
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex01.sm:
--------------------------------------------------------------------------------
1 | car red 5
2 | book readable 6
3 | car blue 1
4 | book red 3
5 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex05.cols:
--------------------------------------------------------------------------------
1 | red
2 | blue
3 | readable
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex05.sm:
--------------------------------------------------------------------------------
1 | sports_car red 5
2 | history_book readable 1
3 | history_book red 1
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex10.cols:
--------------------------------------------------------------------------------
1 | red
2 | blue
3 | readable
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex10.rows:
--------------------------------------------------------------------------------
1 | book
2 | car
3 | bike
4 | good
--------------------------------------------------------------------------------
/src/examples/data/in/ex10.sm:
--------------------------------------------------------------------------------
1 | car red 5
2 | book readable 6
3 | car blue 1
4 | book red 3
5 | bike blue 4
6 | bike red 4
7 | good readable 3
8 | good blue 2
9 | good red 6
--------------------------------------------------------------------------------
/src/examples/data/in/ex19-n.cols:
--------------------------------------------------------------------------------
1 | book
2 | car
--------------------------------------------------------------------------------
/src/examples/data/in/ex19-n.sm:
--------------------------------------------------------------------------------
1 | man book 5
2 | man car 2
3 | boy book 7
4 | boy car 1
5 | woman book 5
6 | woman car 2
7 |
--------------------------------------------------------------------------------
/src/examples/data/in/ex19-svo.cols:
--------------------------------------------------------------------------------
1 | book
2 | car
--------------------------------------------------------------------------------
/src/examples/data/in/ex19-svo.sm:
--------------------------------------------------------------------------------
1 | man_hate_boy car 4
2 | man_hate_boy book 3
3 | man_hate_man car 10
4 | boy_hate_boy book 2
5 | boy_hate_man car 6
6 | boy_hate_boy car 11
7 |
--------------------------------------------------------------------------------
/src/examples/data/in/sim_data.txt:
--------------------------------------------------------------------------------
1 | book history_book 0.894427191 other_field 4
2 | car sports_car 0.980580675691 other_field 4
3 | book sports_car 0.4472135955 other_field 6
--------------------------------------------------------------------------------
/src/examples/data/in/sim_data2.txt:
--------------------------------------------------------------------------------
1 | book history_book 0.894427191 other_field 6
2 | car sports_car 0.980580675691 other_field 4
3 | book sports_car 0.4472135955 other_field 5
--------------------------------------------------------------------------------
/src/examples/data/in/sim_data3.txt:
--------------------------------------------------------------------------------
1 | book book 0.894427191 other_field 4
2 | car car 0.980580675691 other_field 4
3 | book car 0.4472135955 other_field 6
--------------------------------------------------------------------------------
/src/examples/data/in/train_data.txt:
--------------------------------------------------------------------------------
1 | book_function car my_car_book
2 | book_function book 2x_book
3 |
--------------------------------------------------------------------------------
/src/examples/data/in/word_list.txt:
--------------------------------------------------------------------------------
1 | car
2 | book
3 |
--------------------------------------------------------------------------------
/src/examples/data/in/word_pairs1.txt:
--------------------------------------------------------------------------------
1 | book book
2 | car book
3 | car car
4 |
--------------------------------------------------------------------------------
/src/examples/data/in/word_pairs2.txt:
--------------------------------------------------------------------------------
1 | book history_book
2 | car sports_car
3 | book sports_car
4 |
5 |
--------------------------------------------------------------------------------
/src/examples/data/in/word_sims.txt:
--------------------------------------------------------------------------------
1 | book book 7
2 | car car 7
3 | book car 2
--------------------------------------------------------------------------------
/src/examples/data/out/COMPOSED_SS.ex10.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (ccomposes.semantic_space.space
5 | Space
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_id2row'
14 | p6
15 | (lp7
16 | S'my_car_book'
17 | p8
18 | aS'my_special_book'
19 | p9
20 | asS'_column2id'
21 | p10
22 | (dp11
23 | S'blue'
24 | p12
25 | I1
26 | sS'readable'
27 | p13
28 | I2
29 | sS'red'
30 | p14
31 | I0
32 | ssS'_operations'
33 | p15
34 | (lp16
35 | sS'_id2column'
36 | p17
37 | (lp18
38 | g14
39 | ag12
40 | ag13
41 | asS'_element_shape'
42 | p19
43 | (I3
44 | tp20
45 | sS'_cooccurrence_matrix'
46 | p21
47 | g0
48 | (ccomposes.matrix.sparse_matrix
49 | SparseMatrix
50 | p22
51 | g2
52 | Ntp23
53 | Rp24
54 | (dp25
55 | S'_mat'
56 | p26
57 | g0
58 | (cscipy.sparse.csr
59 | csr_matrix
60 | p27
61 | g2
62 | Ntp28
63 | Rp29
64 | (dp30
65 | S'format'
66 | p31
67 | S'csr'
68 | p32
69 | sS'_shape'
70 | p33
71 | (I2
72 | I3
73 | tp34
74 | sS'indptr'
75 | p35
76 | cnumpy.core.multiarray
77 | _reconstruct
78 | p36
79 | (cnumpy
80 | ndarray
81 | p37
82 | (I0
83 | tp38
84 | S'b'
85 | p39
86 | tp40
87 | Rp41
88 | (I1
89 | (I3
90 | tp42
91 | cnumpy
92 | dtype
93 | p43
94 | (S'i4'
95 | p44
96 | I0
97 | I1
98 | tp45
99 | Rp46
100 | (I3
101 | S'<'
102 | p47
103 | NNNI-1
104 | I-1
105 | I0
106 | tp48
107 | bI00
108 | S'\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00'
109 | p49
110 | tp50
111 | bsS'indices'
112 | p51
113 | g36
114 | (g37
115 | (I0
116 | tp52
117 | g39
118 | tp53
119 | Rp54
120 | (I1
121 | (I5
122 | tp55
123 | g46
124 | I00
125 | S'\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00'
126 | p56
127 | tp57
128 | bsS'maxprint'
129 | p58
130 | I50
131 | sS'data'
132 | p59
133 | g36
134 | (g37
135 | (I0
136 | tp60
137 | g39
138 | tp61
139 | Rp62
140 | (I1
141 | (I5
142 | tp63
143 | g43
144 | (S'f8'
145 | p64
146 | I0
147 | I1
148 | tp65
149 | Rp66
150 | (I3
151 | S'<'
152 | p67
153 | NNNI-1
154 | I-1
155 | I0
156 | tp68
157 | bI00
158 | S'\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00(@'
159 | p69
160 | tp70
161 | bsbsbsS'_row2id'
162 | p71
163 | (dp72
164 | g8
165 | I0
166 | sg9
167 | I1
168 | ssb.
--------------------------------------------------------------------------------
/src/examples/data/out/PER_SS.ex05.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (ccomposes.semantic_space.peripheral_space
5 | PeripheralSpace
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_id2row'
14 | p6
15 | (lp7
16 | S'sports_car'
17 | p8
18 | aS'history_book'
19 | p9
20 | asS'_column2id'
21 | p10
22 | (dp11
23 | S'blue'
24 | p12
25 | I1
26 | sS'readable'
27 | p13
28 | I2
29 | sS'red'
30 | p14
31 | I0
32 | ssS'_operations'
33 | p15
34 | (lp16
35 | g0
36 | (ccomposes.semantic_space.operation
37 | ScalingOperation
38 | p17
39 | g2
40 | Ntp18
41 | Rp19
42 | (dp20
43 | S'_ScalingOperation__scaling'
44 | p21
45 | g0
46 | (ccomposes.transformation.scaling.ppmi_weighting
47 | PpmiWeighting
48 | p22
49 | g2
50 | Ntp23
51 | Rp24
52 | sS'_ScalingOperation__column_stats'
53 | p25
54 | cnumpy.core.multiarray
55 | _reconstruct
56 | p26
57 | (cnumpy.matrixlib.defmatrix
58 | matrix
59 | p27
60 | (I0
61 | tp28
62 | S'b'
63 | p29
64 | tp30
65 | Rp31
66 | (I1
67 | (I1
68 | I3
69 | tp32
70 | cnumpy
71 | dtype
72 | p33
73 | (S'f8'
74 | p34
75 | I0
76 | I1
77 | tp35
78 | Rp36
79 | (I3
80 | S'<'
81 | p37
82 | NNNI-1
83 | I-1
84 | I0
85 | tp38
86 | bI01
87 | S'\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@'
88 | p39
89 | tp40
90 | bsbasS'_id2column'
91 | p41
92 | (lp42
93 | g14
94 | ag12
95 | ag13
96 | asS'_element_shape'
97 | p43
98 | (I3
99 | tp44
100 | sS'_cooccurrence_matrix'
101 | p45
102 | g0
103 | (ccomposes.matrix.sparse_matrix
104 | SparseMatrix
105 | p46
106 | g2
107 | Ntp47
108 | Rp48
109 | (dp49
110 | S'_mat'
111 | p50
112 | g0
113 | (cscipy.sparse.csr
114 | csr_matrix
115 | p51
116 | g2
117 | Ntp52
118 | Rp53
119 | (dp54
120 | S'format'
121 | p55
122 | S'csr'
123 | p56
124 | sS'_shape'
125 | p57
126 | (I2
127 | I3
128 | tp58
129 | sS'indptr'
130 | p59
131 | g26
132 | (cnumpy
133 | ndarray
134 | p60
135 | (I0
136 | tp61
137 | g29
138 | tp62
139 | Rp63
140 | (I1
141 | (I3
142 | tp64
143 | g33
144 | (S'i4'
145 | p65
146 | I0
147 | I1
148 | tp66
149 | Rp67
150 | (I3
151 | S'<'
152 | p68
153 | NNNI-1
154 | I-1
155 | I0
156 | tp69
157 | bI00
158 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
159 | p70
160 | tp71
161 | bsS'indices'
162 | p72
163 | g26
164 | (g60
165 | (I0
166 | tp73
167 | g29
168 | tp74
169 | Rp75
170 | (I1
171 | (I2
172 | tp76
173 | g67
174 | I00
175 | S'\x00\x00\x00\x00\x02\x00\x00\x00'
176 | p77
177 | tp78
178 | bsS'maxprint'
179 | p79
180 | I50
181 | sS'data'
182 | p80
183 | g26
184 | (g60
185 | (I0
186 | tp81
187 | g29
188 | tp82
189 | Rp83
190 | (I1
191 | (I2
192 | tp84
193 | g36
194 | I00
195 | S'\xaerF\xe8\x8f\x1d\xe4?"\x9a\x9a\xc7\xf7\x8f\xcc?'
196 | p85
197 | tp86
198 | bsbsbsS'_row2id'
199 | p87
200 | (dp88
201 | g9
202 | I1
203 | sg8
204 | I0
205 | ssb.
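
The .pkl files dumped here are plain protocol-0 pickles, so besides io_utils.load (see ex02.py) they round-trip with the standard pickle module. A minimal check against the dump above, assuming composes is importable (the pickle references its classes):

    import pickle

    # protocol-0 pickles are text, so text-mode reading is fine under Python 2
    with open("./data/out/PER_SS.ex05.pkl") as f:
        per_space = pickle.load(f)

    print per_space.id2row    # -> ['sports_car', 'history_book'], as pickled above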
--------------------------------------------------------------------------------
/src/examples/data/out/PHRASE_SS.ex10.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/PHRASE_SS.ex10.pkl
--------------------------------------------------------------------------------
/src/examples/data/out/ex01.cols:
--------------------------------------------------------------------------------
1 | red
2 | blue
3 | readable
4 |
--------------------------------------------------------------------------------
/src/examples/data/out/ex01.dm:
--------------------------------------------------------------------------------
1 | car 5.0 1.0 0.0
2 | book 3.0 0.0 6.0
3 |
--------------------------------------------------------------------------------
/src/examples/data/out/ex01.pkl:
--------------------------------------------------------------------------------
1 | ccopy_reg
2 | _reconstructor
3 | p0
4 | (ccomposes.semantic_space.space
5 | Space
6 | p1
7 | c__builtin__
8 | object
9 | p2
10 | Ntp3
11 | Rp4
12 | (dp5
13 | S'_id2row'
14 | p6
15 | (lp7
16 | S'car'
17 | p8
18 | aS'book'
19 | p9
20 | asS'_column2id'
21 | p10
22 | (dp11
23 | S'blue'
24 | p12
25 | I1
26 | sS'readable'
27 | p13
28 | I2
29 | sS'red'
30 | p14
31 | I0
32 | ssS'_operations'
33 | p15
34 | (lp16
35 | sS'_id2column'
36 | p17
37 | (lp18
38 | g14
39 | ag12
40 | ag13
41 | asS'_element_shape'
42 | p19
43 | (I3
44 | tp20
45 | sS'_cooccurrence_matrix'
46 | p21
47 | g0
48 | (ccomposes.matrix.sparse_matrix
49 | SparseMatrix
50 | p22
51 | g2
52 | Ntp23
53 | Rp24
54 | (dp25
55 | S'_mat'
56 | p26
57 | g0
58 | (cscipy.sparse.csr
59 | csr_matrix
60 | p27
61 | g2
62 | Ntp28
63 | Rp29
64 | (dp30
65 | S'format'
66 | p31
67 | S'csr'
68 | p32
69 | sS'_shape'
70 | p33
71 | (I2
72 | I3
73 | tp34
74 | sS'indptr'
75 | p35
76 | cnumpy.core.multiarray
77 | _reconstruct
78 | p36
79 | (cnumpy
80 | ndarray
81 | p37
82 | (I0
83 | tp38
84 | S'b'
85 | p39
86 | tp40
87 | Rp41
88 | (I1
89 | (I3
90 | tp42
91 | cnumpy
92 | dtype
93 | p43
94 | (S'i4'
95 | p44
96 | I0
97 | I1
98 | tp45
99 | Rp46
100 | (I3
101 | S'<'
102 | p47
103 | NNNI-1
104 | I-1
105 | I0
106 | tp48
107 | bI00
108 | S'\x00\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00'
109 | p49
110 | tp50
111 | bsS'indices'
112 | p51
113 | g36
114 | (g37
115 | (I0
116 | tp52
117 | g39
118 | tp53
119 | Rp54
120 | (I1
121 | (I4
122 | tp55
123 | g46
124 | I00
125 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00'
126 | p56
127 | tp57
128 | bsS'maxprint'
129 | p58
130 | I50
131 | sS'data'
132 | p59
133 | g36
134 | (g37
135 | (I0
136 | tp60
137 | g39
138 | tp61
139 | Rp62
140 | (I1
141 | (I4
142 | tp63
143 | g43
144 | (S'f8'
145 | p64
146 | I0
147 | I1
148 | tp65
149 | Rp66
150 | (I3
151 | S'<'
152 | p67
153 | NNNI-1
154 | I-1
155 | I0
156 | tp68
157 | bI00
158 | S'\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x18@'
159 | p69
160 | tp70
161 | bsbsbsS'_row2id'
162 | p71
163 | (dp72
164 | g8
165 | I0
166 | sg9
167 | I1
168 | ssb.
--------------------------------------------------------------------------------
/src/examples/data/out/ex01.rows:
--------------------------------------------------------------------------------
1 | car
2 | book
3 |
--------------------------------------------------------------------------------
/src/examples/data/out/ex01.sm:
--------------------------------------------------------------------------------
1 | car red 5.000000
2 | car blue 1.000000
3 | book red 3.000000
4 | book readable 6.000000
5 |
--------------------------------------------------------------------------------
/src/examples/data/out/ex10.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/ex10.pkl
--------------------------------------------------------------------------------
/src/examples/data/out/model01.params:
--------------------------------------------------------------------------------
1 | alpha 1.000000
2 | beta 1.000000
--------------------------------------------------------------------------------
/src/examples/data/out/model01.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/model01.pkl
--------------------------------------------------------------------------------
/src/examples/ex01.py:
--------------------------------------------------------------------------------
1 | #ex01.py
2 | #-------
3 | from composes.semantic_space.space import Space
4 |
5 | #create a space from co-occurrence counts in sparse format
6 | my_space = Space.build(data = "./data/in/ex01.sm",
7 | rows = "./data/in/ex01.rows",
8 | cols = "./data/in/ex01.cols",
9 | format = "sm")
10 |
11 | #export the space in sparse format
12 | my_space.export("./data/out/ex01", format = "sm")
13 |
14 | #export the space in dense format
15 | my_space.export("./data/out/ex01", format = "dm")
16 |
--------------------------------------------------------------------------------
/src/examples/ex02.py:
--------------------------------------------------------------------------------
1 | #ex02.py
2 | #-------
3 | from composes.semantic_space.space import Space
4 | from composes.utils import io_utils
5 |
6 | #create a space from co-occurrence counts in sparse format
7 | my_space = Space.build(data = "./data/in/ex01.sm",
8 | rows = "./data/in/ex01.rows",
9 | cols = "./data/in/ex01.cols",
10 | format = "sm")
11 |
12 | #print the co-occurrence matrix of the space
13 | print my_space.cooccurrence_matrix
14 |
15 | #save the Space object in pickle format
16 | io_utils.save(my_space, "./data/out/ex01.pkl")
17 |
18 | #load the saved object
19 | my_space2 = io_utils.load("./data/out/ex01.pkl")
20 |
21 | #print the co-occurrence matrix of the loaded space
22 | print my_space2.cooccurrence_matrix
23 |
24 |
--------------------------------------------------------------------------------
/src/examples/ex03.py:
--------------------------------------------------------------------------------
1 | #ex03.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
5 |
6 | #create a space from co-occurrence counts in sparse format
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 |
9 | #print the co-occurrence matrix of the space
10 | print my_space.cooccurrence_matrix
11 |
12 | #apply ppmi weighting
13 | my_space = my_space.apply(PpmiWeighting())
14 |
15 | #print the co-occurrence matrix of the transformed space
16 | print my_space.cooccurrence_matrix
17 |
18 |
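
For reference, PpmiWeighting is positive pointwise mutual information, conventionally ppmi(r, c) = max(0, log(P(r, c) / (P(r) P(c)))). A back-of-the-envelope numpy version of that standard definition, run on the ex01 counts (a sketch, not the toolkit's own code):

    import numpy as np

    counts = np.array([[5.0, 1.0, 0.0],     # car  (rows of ex01.dm)
                       [3.0, 0.0, 6.0]])    # book
    total = counts.sum()
    p_rc = counts / total
    p_r = counts.sum(axis=1).reshape(-1, 1) / total
    p_c = counts.sum(axis=0).reshape(1, -1) / total

    with np.errstate(divide='ignore'):      # log(0) -> -inf, clipped to 0 below
        ppmi = np.maximum(np.log(p_rc / (p_r * p_c)), 0.0)
    print ppmi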
--------------------------------------------------------------------------------
/src/examples/ex04.py:
--------------------------------------------------------------------------------
1 | #ex04.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.transformation.dim_reduction.svd import Svd
5 |
6 | #load a space
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 |
9 | #print the co-occurrence matrix and the columns of the space
10 | print my_space.cooccurrence_matrix
11 | print my_space.id2column
12 |
13 | #apply svd reduction
14 | my_space = my_space.apply(Svd(2))
15 |
16 | #print the transformed space
17 | print my_space.cooccurrence_matrix
18 | print my_space.id2column
19 |
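
Svd(2) is the usual truncated singular value decomposition: keep the top k singular triplets and represent every row by its coordinates U_k * S_k. The same projection with plain numpy on the ex01 counts (a sketch; signs of the columns may differ from the toolkit's output):

    import numpy as np

    M = np.array([[5.0, 1.0, 0.0],          # the ex01 counts again
                  [3.0, 0.0, 6.0]])
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    k = 2
    print U[:, :k] * s[:k]                  # one reduced row per word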
--------------------------------------------------------------------------------
/src/examples/ex05.py:
--------------------------------------------------------------------------------
1 | #ex05.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.semantic_space.peripheral_space import PeripheralSpace
5 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
6 |
7 |
8 | #load a space and apply ppmi on it
9 | my_space = io_utils.load("./data/out/ex01.pkl")
10 | my_space = my_space.apply(PpmiWeighting())
11 |
12 | print my_space.cooccurrence_matrix
13 | print my_space.id2row
14 |
15 | #create a peripheral space
16 | my_per_space = PeripheralSpace.build(my_space,
17 | data="./data/in/ex05.sm",
18 | cols="./data/in/ex05.cols",
19 | format="sm")
20 |
21 | print my_per_space.cooccurrence_matrix
22 | print my_per_space.id2row
23 |
24 | #save the space
25 | io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
26 |
27 |
--------------------------------------------------------------------------------
/src/examples/ex06.py:
--------------------------------------------------------------------------------
1 | #ex06.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.similarity.cos import CosSimilarity
5 |
6 | #load a space
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 |
9 | print my_space.cooccurrence_matrix
10 | print my_space.id2row
11 |
12 | #compute similarity between two words in the space
13 | print my_space.get_sim("car", "car", CosSimilarity())
14 | print my_space.get_sim("car", "book", CosSimilarity())
15 |
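
CosSimilarity is the usual cosine of two row vectors, so the second call can be redone by hand on the ex01.dm rows:

    import numpy as np

    car = np.array([5.0, 1.0, 0.0])         # rows of ex01.dm
    book = np.array([3.0, 0.0, 6.0])

    # cos(car, book) = <car, book> / (|car| |book|)
    print np.dot(car, book) / (np.linalg.norm(car) * np.linalg.norm(book))  # ~0.4385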
--------------------------------------------------------------------------------
/src/examples/ex07.py:
--------------------------------------------------------------------------------
1 | #ex07.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.similarity.cos import CosSimilarity
5 |
6 | #load two spaces
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")
9 |
10 | print my_space.id2row
11 | print my_per_space.id2row
12 |
13 | #compute similarity between a word and a phrase in the two spaces
14 | print my_space.get_sim("car", "sports_car", CosSimilarity(),
15 | space2 = my_per_space)
16 |
--------------------------------------------------------------------------------
/src/examples/ex08.py:
--------------------------------------------------------------------------------
1 | #ex08.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.similarity.cos import CosSimilarity
5 |
6 | #load a space
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 |
9 | #get the top 2 neighbours of "car"
10 | print my_space.get_neighbours("car", 2, CosSimilarity())
11 |
--------------------------------------------------------------------------------
/src/examples/ex09.py:
--------------------------------------------------------------------------------
1 | #ex09.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.similarity.cos import CosSimilarity
5 |
6 | #load two spaces
7 | my_space = io_utils.load("./data/out/ex01.pkl")
8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")
9 |
10 | print my_space.id2row
11 | print my_space.cooccurrence_matrix
12 | print my_per_space.id2row
13 | print my_per_space.cooccurrence_matrix
14 |
15 | #get the top two neighbours of "car" in a peripheral space
16 | print my_space.get_neighbours("car", 2, CosSimilarity(),
17 | space2 = my_per_space)
18 |
19 |
--------------------------------------------------------------------------------
/src/examples/ex10.py:
--------------------------------------------------------------------------------
1 | #ex10.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.weighted_additive import WeightedAdditive
5 |
6 | #load a space
7 | my_space = io_utils.load("./data/out/ex10.pkl")
8 |
9 | print my_space.id2row
10 | print my_space.cooccurrence_matrix
11 |
12 | # instantiate a weighted additive model
13 | my_comp = WeightedAdditive(alpha = 1, beta = 1)
14 |
15 | # use the model to compose words in my_space
16 | composed_space = my_comp.compose([("good", "book", "good_book"),
17 | ("good", "car", "good_car")],
18 | my_space)
19 |
20 | print composed_space.id2row
21 | print composed_space.cooccurrence_matrix
22 |
23 | #save the composed space
24 | io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")
25 |
26 |
27 |
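
With alpha = beta = 1, WeightedAdditive is plain vector addition: each composed row is alpha*u + beta*v. The arithmetic in isolation, on hypothetical toy vectors rather than the ex10 space:

    import numpy as np

    alpha, beta = 1.0, 1.0
    good = np.array([3.0, 1.0, 2.0])        # hypothetical vector for "good"
    book = np.array([0.0, 2.0, 4.0])        # hypothetical vector for "book"

    good_book = alpha * good + beta * book  # the row stored for "good_book"
    print good_book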
--------------------------------------------------------------------------------
/src/examples/ex11.py:
--------------------------------------------------------------------------------
1 | #ex11.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.weighted_additive import WeightedAdditive
5 |
6 | # instantiate a weighted additive model
7 | my_comp = WeightedAdditive(alpha = 1, beta = 1)
8 |
9 | #save it to pickle
10 | io_utils.save(my_comp, "./data/out/model01.pkl")
11 |
12 | #print its parameters
13 | my_comp.export("./data/out/model01.params")
14 |
15 |
--------------------------------------------------------------------------------
/src/examples/ex12.py:
--------------------------------------------------------------------------------
1 | #ex12.py
2 | #-------
3 | from composes.utils import io_utils
4 |
5 | #load a previously saved weighted additive model
6 | my_comp = io_utils.load("./data/out/model01.pkl")
7 |
8 | #print its parameters
9 | print "alpha:", my_comp.alpha
10 | print "beta:", my_comp.beta
11 |
12 | #load two spaces
13 | my_space = io_utils.load("./data/out/ex10.pkl")
14 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")
15 |
16 | #apply the composition model to them
17 | composed_space = my_comp.compose([("good", "history_book", "good_history_book")],
18 | (my_space, my_per_space))
19 |
20 | print composed_space.id2row
21 | print composed_space.cooccurrence_matrix
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/examples/ex13.py:
--------------------------------------------------------------------------------
1 | #ex13.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.weighted_additive import WeightedAdditive
5 |
6 |
7 | #training data
8 | train_data = [("good", "car", "good_car"),
9 | ("good", "book", "good_book")
10 | ]
11 |
12 | #load an argument space
13 | arg_space = io_utils.load("./data/out/ex10.pkl")
14 | print arg_space.id2row
15 | print arg_space.cooccurrence_matrix
16 |
17 | #load a phrase space
18 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
19 | print phrase_space.id2row
20 | print phrase_space.cooccurrence_matrix
21 |
22 | #train a weighted additive model on the data
23 | my_comp = WeightedAdditive()
24 | my_comp.train(train_data, arg_space, phrase_space)
25 |
26 | #print its parameters
27 | print "alpha:", my_comp.alpha
28 | print "beta:", my_comp.beta
29 |
30 |
31 |
32 |
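
Training WeightedAdditive amounts to finding the two scalars that best map the argument vectors onto the observed phrase vectors, i.e. a small least-squares problem. A sketch of that idea on one hypothetical 2-d training pair (not the toolkit's actual solver):

    import numpy as np

    u = np.array([3.0, 1.0])                # hypothetical "good"
    v = np.array([0.0, 2.0])                # hypothetical "book"
    p = np.array([3.0, 3.0])                # observed "good_book"

    A = np.column_stack((u, v))             # columns get weighted by alpha, beta
    print np.linalg.lstsq(A, p)[0]          # -> [ 1.  1.], i.e. alpha = beta = 1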
--------------------------------------------------------------------------------
/src/examples/ex14.py:
--------------------------------------------------------------------------------
1 | #ex14.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.dilation import Dilation
5 |
6 | #training data
7 | train_data = [("good", "car", "good_car"),
8 | ("good", "book", "good_book")
9 | ]
10 |
11 | #load an argument space
12 | arg_space = io_utils.load("./data/out/ex10.pkl")
13 |
14 | #load a phrase space
15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
16 | print "Training phrase space"
17 | print phrase_space.id2row
18 | print phrase_space.cooccurrence_matrix
19 |
20 | #train a Dilation model on the data
21 | my_comp = Dilation()
22 | my_comp.train(train_data, arg_space, phrase_space)
23 |
24 | #print its parameters
25 | print "\nlambda:", my_comp._lambda
26 |
27 | #use the model to compose a new phrase
28 | composed_space = my_comp.compose([("good", "bike", "good_bike")],
29 | arg_space)
30 | print "\nComposed space:"
31 | print composed_space.id2row
32 | print composed_space.cooccurrence_matrix
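
Dilation, as commonly defined in the compositional-semantics literature, stretches one vector along the direction of the other: p = (u.u) v + (lambda - 1)(u.v) u. A numpy sketch of that formula on hypothetical vectors (only _lambda is printed above, so take the rest as an assumption):

    import numpy as np

    lambda_ = 2.0                           # the trained parameter printed above
    u = np.array([3.0, 1.0, 2.0])           # hypothetical "good"
    v = np.array([0.0, 2.0, 4.0])           # hypothetical "bike"

    p = np.dot(u, u) * v + (lambda_ - 1.0) * np.dot(u, v) * u
    print p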
--------------------------------------------------------------------------------
/src/examples/ex15.py:
--------------------------------------------------------------------------------
1 | #ex15.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.full_additive import FullAdditive
5 |
6 | #training data
7 | train_data = [("good", "car", "good_car"),
8 | ("good", "book", "good_book")
9 | ]
10 |
11 | #load an argument space
12 | arg_space = io_utils.load("./data/out/ex10.pkl")
13 |
14 | #load a phrase space
15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
16 | print "Training phrase space"
17 | print phrase_space.id2row
18 | print phrase_space.cooccurrence_matrix
19 |
20 | #train a FullAdditive model on the data
21 | my_comp = FullAdditive()
22 | my_comp.train(train_data, arg_space, phrase_space)
23 |
24 | #print its parameters
25 | print "\nA:", my_comp._mat_a_t.transpose()
26 | print "B:", my_comp._mat_b_t.transpose()
27 |
28 | #use the model to compose a new phrase
29 | composed_space = my_comp.compose([("good", "bike", "good_bike")],
30 | arg_space)
31 | print "\nComposed space:"
32 | print composed_space.id2row
33 | print composed_space.cooccurrence_matrix
34 |
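
FullAdditive replaces the two scalars of WeightedAdditive with full matrices, p = A u + B v; those are the matrices printed above via _mat_a_t and _mat_b_t (stored transposed). The composition step on hypothetical toy data:

    import numpy as np

    A = np.array([[1.0, 0.5],               # hypothetical learned matrices
                  [0.0, 1.0]])
    B = np.array([[0.5, 0.0],
                  [0.0, 0.5]])
    u = np.array([3.0, 1.0])                # hypothetical "good"
    v = np.array([0.0, 2.0])                # hypothetical "bike"

    print A.dot(u) + B.dot(v)               # composed vector p = A u + B v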
--------------------------------------------------------------------------------
/src/examples/ex16.py:
--------------------------------------------------------------------------------
1 | #ex16.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.lexical_function import LexicalFunction
5 | from composes.similarity.cos import CosSimilarity
6 |
7 | #training data
8 | #trying to learn a "good" function
9 | train_data = [("good_function", "car", "good_car"),
10 | ("good_function", "book", "good_book")
11 | ]
12 |
13 | #load argument and phrase space
14 | arg_space = io_utils.load("./data/out/ex10.pkl")
15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
16 |
17 | #train a lexical function model on the data
18 | my_comp = LexicalFunction()
19 | my_comp.train(train_data, arg_space, phrase_space)
20 |
21 | #print its parameters
22 | print "\nLexical function space:"
23 | print my_comp.function_space.id2row
24 | cooc_mat = my_comp.function_space.cooccurrence_matrix
25 | cooc_mat.reshape(my_comp.function_space.element_shape)
26 | print cooc_mat
27 |
28 | #similarity within the learned functional space
29 | print "\nSimilarity between good and good in the function space:"
30 | print my_comp.function_space.get_sim("good_function", "good_function",
31 | CosSimilarity())
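
In the lexical function model the function word is not a vector but a matrix (hence the reshape by element_shape above), and composition is matrix-vector multiplication: p = U v. In miniature, with hypothetical values:

    import numpy as np

    U_good = np.array([[1.0, 2.0],          # hypothetical matrix learned for "good_function"
                       [0.0, 1.0]])
    v_car = np.array([3.0, 1.0])            # hypothetical vector for "car"

    print U_good.dot(v_car)                 # vector for "good_car": p = U v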
--------------------------------------------------------------------------------
/src/examples/ex17.py:
--------------------------------------------------------------------------------
1 | #ex17.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.lexical_function import LexicalFunction
5 | from composes.utils.regression_learner import RidgeRegressionLearner
6 |
7 | #training data
8 | #trying to learn a "good" function
9 | train_data = [("good_function", "car", "good_car"),
10 | ("good_function", "book", "good_book")
11 | ]
12 |
13 | #load argument and phrase space
14 | arg_space = io_utils.load("./data/out/ex10.pkl")
15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
16 |
17 | print "\nDefault regression:"
18 | my_comp = LexicalFunction()
19 | print type(my_comp.regression_learner).__name__
20 | my_comp.train(train_data, arg_space, phrase_space)
21 |
22 | #print its parameters
23 | print "Lexical function space:"
24 | print my_comp.function_space.id2row
25 | cooc_mat = my_comp.function_space.cooccurrence_matrix
26 | cooc_mat.reshape(my_comp.function_space.element_shape)
27 | print cooc_mat
28 |
29 | print "\nRidge Regression with lambda = 2"
30 | rr_learner=RidgeRegressionLearner(param = 2,
31 | intercept = False,
32 | crossvalidation=False)
33 | my_comp = LexicalFunction(learner = rr_learner)
34 | my_comp.train(train_data, arg_space, phrase_space)
35 |
36 | #print its parameters
37 | print "Lexical function space:"
38 | print my_comp.function_space.id2row
39 | cooc_mat = my_comp.function_space.cooccurrence_matrix
40 | cooc_mat.reshape(my_comp.function_space.element_shape)
41 | print cooc_mat
42 |
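
With intercept and crossvalidation switched off, ridge regression has the textbook closed form W = (X'X + lambda*I)^-1 X'Y. A sketch of that solution on hypothetical data (the toolkit's learner may differ in details):

    import numpy as np

    lam = 2.0                               # the "param = 2" above
    X = np.array([[3.0, 1.0],               # hypothetical argument rows
                  [0.0, 2.0]])
    Y = np.array([[3.0, 3.0],               # hypothetical phrase rows
                  [1.0, 4.0]])

    W = np.linalg.solve(X.T.dot(X) + lam * np.eye(X.shape[1]), X.T.dot(Y))
    print W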
--------------------------------------------------------------------------------
/src/examples/ex18.py:
--------------------------------------------------------------------------------
1 | #ex18.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.composition.lexical_function import LexicalFunction
5 |
6 | #training data
7 | #trying to learn a "good" function
8 | train_data = [("good_function", "car", "good_car"),
9 | ("good_function", "book", "good_book")
10 | ]
11 |
12 | #load argument and phrase space
13 | arg_space = io_utils.load("./data/out/ex10.pkl")
14 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
15 |
16 | #train a lexical function model on the data
17 | my_comp = LexicalFunction()
18 | my_comp.train(train_data, arg_space, phrase_space)
19 |
20 | #apply the trained model
21 | comp_sp1 = my_comp.compose([("good_function", "car",
22 | "good_car")],
23 | arg_space)
24 |
25 | #apply the trained model a second time
26 | comp_sp2 = my_comp.compose([("good_function", "good_car",
27 | "good_good_car")],
28 | comp_sp1)
29 |
30 |
31 | #print the composed spaces:
32 | print "\nComposed space 1:"
33 | print comp_sp1.id2row
34 | print comp_sp1.cooccurrence_matrix
35 |
36 | print "\nComposed space 2:"
37 | print comp_sp2.id2row
38 | print comp_sp2.cooccurrence_matrix
39 |
--------------------------------------------------------------------------------
/src/examples/ex19.py:
--------------------------------------------------------------------------------
1 | #ex19.py
2 | #-------
3 | from composes.semantic_space.space import Space
4 | from composes.composition.lexical_function import LexicalFunction
5 | from composes.utils.regression_learner import LstsqRegressionLearner
6 |
7 | #training data1: VO N -> SVO
8 | train_vo_data = [("hate_boy", "man", "man_hate_boy"),
9 | ("hate_man", "man", "man_hate_man"),
10 | ("hate_boy", "boy", "boy_hate_boy"),
11 | ("hate_man", "boy", "boy_hate_man")
12 | ]
13 |
14 | #training data2: V N -> VO
15 | train_v_data = [("hate", "man", "hate_man"),
16 | ("hate", "boy", "hate_boy")
17 | ]
18 |
19 | #load N and SVO spaces
20 | n_space = Space.build(data = "./data/in/ex19-n.sm",
21 | cols = "./data/in/ex19-n.cols",
22 | format = "sm")
23 |
24 | svo_space = Space.build(data = "./data/in/ex19-svo.sm",
25 | cols = "./data/in/ex19-svo.cols",
26 | format = "sm")
27 |
28 | print "\nInput SVO training space:"
29 | print svo_space.id2row
30 | print svo_space.cooccurrence_matrix
31 |
32 | #1. train a model to learn VO functions on train data: VO N -> SVO
33 | print "\nStep 1 training"
34 | vo_model = LexicalFunction(learner=LstsqRegressionLearner())
35 | vo_model.train(train_vo_data, n_space, svo_space)
36 |
37 | #2. train a model to learn V functions on train data: V N -> VO
38 | # where VO space: function space learned in step 1
39 | print "\nStep 2 training"
40 | vo_space = vo_model.function_space
41 | v_model = LexicalFunction(learner=LstsqRegressionLearner())
42 | v_model.train(train_v_data, n_space, vo_space)
43 |
44 | #print the learned model
45 | print "\n3D Verb space"
46 | print v_model.function_space.id2row
47 | print v_model.function_space.cooccurrence_matrix
48 |
49 |
50 | #3. use the trained models to compose new SVO sentences
51 |
52 | #3.1 use the V model to create new VO combinations
53 | vo_composed_space = v_model.compose([("hate", "woman", "hate_woman"),
54 | ("hate", "man", "hate_man")],
55 | n_space)
56 |
57 | #3.2 the new VO combinations will be used as functions:
58 | # load the new VO combinations obtained through composition into
59 | # a new composition model
60 | expanded_vo_model = LexicalFunction(function_space=vo_composed_space,
61 | intercept=v_model._has_intercept)
62 |
63 | #3.3 use the new VO combinations by composing them with subject nouns
64 | # in order to obtain new SVO sentences
65 | svo_composed_space = expanded_vo_model.compose([("hate_woman", "woman", "woman_hates_woman"),
66 | ("hate_man", "man", "man_hates_man")],
67 | n_space)
68 |
69 | #print the composed spaces:
70 | print "\nVO composed space:"
71 | print vo_composed_space.id2row
72 | print vo_composed_space.cooccurrence_matrix
73 |
74 | #print the composed spaces:
75 | print "\nSVO composed space:"
76 | print svo_composed_space.id2row
77 | print svo_composed_space.cooccurrence_matrix
78 |
79 |
--------------------------------------------------------------------------------
/src/examples/ex20.py:
--------------------------------------------------------------------------------
1 | #ex20.py
2 | #-------
3 | from composes.utils import io_utils
4 | from composes.utils import scoring_utils
5 | from composes.similarity.cos import CosSimilarity
6 |
7 | #read in a space
8 | my_space = io_utils.load("data/out/ex01.pkl")
9 |
10 | #compute similarities of a list of word pairs
11 | fname = "data/in/word_sims.txt"
12 | word_pairs = io_utils.read_tuple_list(fname, fields=[0,1])
13 | predicted = my_space.get_sims(word_pairs, CosSimilarity())
14 |
15 | #compute correlations
16 | gold = io_utils.read_list(fname, field=2)
17 | print "Spearman"
18 | print scoring_utils.score(gold, predicted, "spearman")
19 | print "Pearson"
20 | print scoring_utils.score(gold, predicted, "pearson")
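
Spearman and Pearson are plain rank and linear correlations; scoring_utils presumably delegates to scipy.stats, so the same numbers can be reproduced directly (toy lists here, not the word_sims.txt data):

    from scipy.stats import spearmanr, pearsonr

    gold = [4.2, 1.0, 3.5]                  # hypothetical gold ratings
    predicted = [0.9, 0.1, 0.7]             # hypothetical cosines

    print spearmanr(gold, predicted)[0]
    print pearsonr(gold, predicted)[0]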
--------------------------------------------------------------------------------
/src/examples/exercise.sh:
--------------------------------------------------------------------------------
1 | # set pythonpath
2 | export PYTHONPATH=/home/thenghia.pham/git/toolkit/src:$PYTHONPATH
3 | export TOOLKIT_DIR=/home/thenghia.pham/git/toolkit
4 | export OUT_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/data/tutorial
5 | export DATA_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial
6 | export LOG_FILE=$OUT_DIR/log/exercises.log
7 |
8 | #**************************************************************************************
9 | echo Step1
10 | echo STARTING BUILDING CORE
11 | export CORE_IN_FILE_PREFIX=CORE_SS.verbnoun.core
12 | export CORE_OUT_DIR=$OUT_DIR/core
13 |
14 | # run build core space pipeline
15 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_core_space.py -i $DATA_DIR/$CORE_IN_FILE_PREFIX --input_format=pkl -o $CORE_OUT_DIR -w ppmi -s top_sum_2000 -r svd_100 --output_format=dm -l $LOG_FILE
16 |
17 | echo FINISHED BUILDING CORE
18 |
19 | #**************************************************************************************
20 | echo Step2
21 | echo STARTING PERIPHERAL PIPELINE
22 | export CORE_SPC=CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl
23 |
24 | export PER_RAW_FILE=$DATA_DIR/per.raw.SV
25 | export PER_OUT_DIR=$OUT_DIR/per
26 |
27 | # run build peripheral space pipeline
28 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_peripheral_space.py -i $PER_RAW_FILE --input_format sm -c $CORE_OUT_DIR/$CORE_SPC -o $PER_OUT_DIR --output_format dm -l $LOG_FILE
29 |
30 | echo FINISHED PERIPHERAL PIPELINE
31 |
32 | #**************************************************************************************
33 | echo step3
34 | echo STARTING TRAINING
35 |
36 | export MODEL_DIR=$OUT_DIR/trained
37 | export TRAIN_FILE=$DATA_DIR/ML08_SV_train.txt
38 | export PER_SPC=PER_SS.per.raw.SV.CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl
39 | export MODEL=lexical_func
40 |
41 | # run training pipeline
42 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/train_composition.py -i $TRAIN_FILE -m $MODEL -o $MODEL_DIR -a $CORE_OUT_DIR/$CORE_SPC -p $PER_OUT_DIR/$PER_SPC --regression ridge --intercept True --crossvalidation False --lambda 2.0 -l $LOG_FILE
43 |
44 | echo FINISHED TRAINING
45 | #**************************************************************************************
46 | echo step 4
47 | echo STARTING COMPOSING SPACE
48 |
49 | export TRNED_MODEL=TRAINED_COMP_MODEL.lexical_func.ML08_SV_train.txt.pkl
50 | export COMP_DIR=$OUT_DIR/composed
51 | export COMP_FILE=$DATA_DIR/ML08nvs_test.txt
52 |
53 | # run apply composition pipeline
54 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/apply_composition.py -i $COMP_FILE --load_model $MODEL_DIR/$TRNED_MODEL -o $COMP_DIR -a $CORE_OUT_DIR/$CORE_SPC -l $LOG_FILE
55 |
56 | echo FINISHED COMPOSING SPACE
57 | #**************************************************************************************
58 | echo step 5
59 | echo STARTING COMPUTING SIMS
60 |
61 | export COMP_SPC=COMPOSED_SS.LexicalFunction.ML08nvs_test.txt.pkl
62 | export SIM_DIR=$OUT_DIR/similarity
63 | export TEST_FILE=$DATA_DIR/ML08data_new.txt
64 |
65 | # create output directory for similarity if the directory doesn't exist
66 | if [ ! -d "$SIM_DIR" ]; then
67 | mkdir $SIM_DIR
68 | fi
69 |
70 | # run sim pipeline
71 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/compute_similarities.py -i $TEST_FILE -s $COMP_DIR/$COMP_SPC -o $SIM_DIR -m cos,lin,dot_prod,euclidean -c 1,2 -l $LOG_FILE
72 |
73 | echo FINISHED COMPUTING SIMS
74 | #**************************************************************************************
75 | echo step 6
76 | echo STARTING EVAL SIMS
77 |
78 | # run evaluation pipeline
79 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/evaluate_similarities.py --in_dir $SIM_DIR -m spearman,pearson -c 3,4 -l $LOG_FILE
80 | echo FINISHED EVAL SIMS
81 |
--------------------------------------------------------------------------------
/src/examples/full_example.py:
--------------------------------------------------------------------------------
1 | from composes.similarity.cos import CosSimilarity
2 | from composes.semantic_space.peripheral_space import PeripheralSpace
3 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
4 | from composes.transformation.dim_reduction.svd import Svd
5 | from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection
6 | from composes.composition.lexical_function import LexicalFunction
7 | from composes.composition.full_additive import FullAdditive
8 | from composes.composition.weighted_additive import WeightedAdditive
9 | from composes.composition.multiplicative import Multiplicative
10 | from composes.composition.dilation import Dilation
11 | from composes.utils.regression_learner import RidgeRegressionLearner
12 |
13 | import composes.utils.io_utils as io_utils
14 | import composes.utils.scoring_utils as scoring_utils
15 |
16 | #load a core space
17 | print "Loading the data..."
18 | data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/"
19 |
20 | space_file = data_path + "CORE_SS.verbnoun.core.pkl"
21 | space = io_utils.load(space_file)
22 |
23 | print "Applying PPMI..."
24 | space = space.apply(PpmiWeighting())
25 |
26 | print "Applying feature selection..."
27 | space = space.apply(TopFeatureSelection(2000))
28 |
29 | print "Applying SVD..."
30 | space = space.apply(Svd(100))
31 |
32 | print "Creating peripheral space.."
33 | per_space = PeripheralSpace.build(space,
34 | data = data_path + "per.raw.SV.sm",
35 | cols = data_path + "per.raw.SV.cols",
36 | format = "sm"
37 | )
38 |
39 | #reading in train data
40 | train_data_file = data_path + "ML08_SV_train.txt"
41 | train_data = io_utils.read_tuple_list(train_data_file, fields=[0,1,2])
42 |
43 | print "Training Lexical Function composition model..."
44 | comp_model = LexicalFunction(learner = RidgeRegressionLearner(param=2))
45 | comp_model.train(train_data, space, per_space)
46 |
47 | print "Composing phrases..."
48 | test_phrases_file = data_path + "ML08nvs_test.txt"
49 | test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2])
50 | composed_space = comp_model.compose(test_phrases, space)
51 |
52 | print "Reading similarity test data..."
53 | test_similarity_file = data_path + "ML08data_new.txt"
54 | test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1])
55 | gold = io_utils.read_list(test_similarity_file, field=2)
56 |
57 | print "Computing similarity with lexical function..."
58 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
59 |
60 | #use this composed space to assign similarities
61 | print "Scoring lexical function..."
62 | print scoring_utils.score(gold, pred, "spearman")
63 |
64 |
65 | print "Training Full Additive composition model..."
66 | comp_model = FullAdditive(learner = RidgeRegressionLearner(param=2))
67 | comp_model.train(train_data, space, per_space)
68 | composed_space = comp_model.compose(test_phrases, space)
69 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
70 | print scoring_utils.score(gold, pred, "spearman")
71 |
72 | print "Training Weighted Additive composition model..."
73 | comp_model = WeightedAdditive()
74 | comp_model.train(train_data, space, per_space)
75 | print "alpha, beta:", comp_model.alpha, comp_model.beta
76 | composed_space = comp_model.compose(test_phrases, space)
77 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
78 | print scoring_utils.score(gold, pred, "spearman")
79 |
80 | print "Training Dilation composition model..."
81 | comp_model = Dilation()
82 | comp_model.train(train_data, space, per_space)
83 | print "lambda:", comp_model._lambda
84 | composed_space = comp_model.compose(test_phrases, space)
85 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
86 | print scoring_utils.score(gold, pred, "spearman")
87 |
88 | print "Multiplicative composition model..."
89 | comp_model = Multiplicative()
90 | composed_space = comp_model.compose(test_phrases, space)
91 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
92 | print scoring_utils.score(gold, pred, "spearman")
93 |
94 | print "Simple additive composition model..."
95 | comp_model = WeightedAdditive(1,1)
96 | composed_space = comp_model.compose(test_phrases, space)
97 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
98 | print scoring_utils.score(gold, pred, "spearman")
99 |
100 | print "Simple dilation composition model..."
101 | comp_model = Dilation()
102 | composed_space = comp_model.compose(test_phrases, space)
103 | pred = composed_space.get_sims(test_pairs, CosSimilarity())
104 | print scoring_utils.score(gold, pred, "spearman")
105 |
--------------------------------------------------------------------------------
/src/pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/pipelines/__init__.py
--------------------------------------------------------------------------------
/src/pipelines/compute_neighbours.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 17, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | '''
7 | Created on Oct 17, 2012
8 |
9 | @author: Georgiana Dinu, Pham The Nghia
10 | '''
11 |
12 | '''
13 | Created on Jun 12, 2012
14 |
15 | @author: thenghia.pham
16 | '''
17 |
18 |
19 | import sys
20 | import getopt
21 | from ConfigParser import ConfigParser
22 | from composes.semantic_space.space import Space
23 | from composes.similarity.cos import CosSimilarity
24 | from composes.similarity.lin import LinSimilarity
25 | from composes.similarity.dot_prod import DotProdSimilarity
26 | from composes.similarity.euclidean import EuclideanSimilarity
27 | from composes.utils import io_utils
28 | from composes.utils import log_utils
29 | import pipeline_utils as utils
30 | import logging
31 | logger = logging.getLogger("test vector space construction pipeline")
32 |
33 |
34 |
35 | def usage(errno=0):
36 | print >>sys.stderr,\
37 | """Usage:
38 | python compute_neighbours.py [options] [config_file]
39 |
40 | Options:
41 | -i --input <file>: input file, one target word per line.
42 | -o --output <dir>: output directory.
43 | -s --space <file[,file2]>: semantic space file(s). If a second
44 | space file is given, the neighbours are retrieved
45 | from that second space.
46 | -m --sim_measure <string>: similarity measure (cos, lin, dot_prod, euclidean)
47 | -n --no_neighbours <int>: number of neighbours to be returned
48 | -l --log <file>: log file. Optional.
49 | -h --help : help
50 |
51 | Arguments:
52 | config_file: <file>, used as default values for the configuration options above.
53 | If you don't specify these options in [options] the value from the
54 | config_file will be used.
55 |
56 | Example:
57 | """
58 | sys.exit(errno)
59 |
60 |
61 | def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
62 | sim_dict = {"cos": CosSimilarity(),
63 | "lin": LinSimilarity(),
64 | "dot_prod": DotProdSimilarity(),
65 | "euclidean": EuclideanSimilarity()}
66 |
67 | if not sim_measure in sim_dict:
68 | raise ValueError("Similarity measure:%s not defined" % sim_measure)
69 |
70 | space = io_utils.load(space_files[0], Space)
71 | space2 = None
72 | space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
73 | if len(space_files) == 2:
74 | space2 = io_utils.load(space_files[1], Space)
75 | space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])
76 |
77 | sim = sim_dict[sim_measure]
78 |
79 | descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
80 | out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
81 | io_utils.create_parent_directories(out_file)
82 |
83 | data = io_utils.read_list(in_file)
84 |
85 | print "Computing neighbours: %s" % sim_measure
86 | with open(out_file,"w") as out_stream:
87 | for word in data:
88 | out_stream.write("%s\n" % word)
89 | result = space.get_neighbours(word, no_neighbours, sim, space2)
90 | for neighbour, neighbour_sim in result:
91 | out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
92 |
93 | def main(sys_argv):
94 | try:
95 | opts, argv = getopt.getopt(sys_argv[1:], "hi:o:s:m:n:l:",
96 | ["help", "input=", "output=", "sim_measures=",
97 | "space=", "log=", "no_neighbours="])
98 | except getopt.GetoptError, err:
99 | print str(err)
100 | usage()
101 | sys.exit(1)
102 |
103 | section = "compute_neighbours"
104 |
105 | out_dir = None
106 | in_file = None
107 | sim_measure = None
108 | spaces = None
109 | log_file = None
110 | no_neighbours = "20"
111 |
112 |
113 | if (len(argv) == 1):
114 | config_file = argv[0]
115 | with open(config_file) as f:
116 | pass #fail early, with a clear error, if the config file cannot be opened
117 | config = ConfigParser()
118 | config.read(config_file)
119 | out_dir = utils.config_get(section, config, "output", None)
120 | in_file = utils.config_get(section, config, "input", None)
121 | sim_measure = utils.config_get(section, config, "sim_measure", None)
122 | spaces = utils.config_get(section, config, "space", None)
123 | if not spaces is None:
124 | spaces = spaces.split(",")
125 | no_neighbours = utils.config_get(section, config, "no_neighbours", no_neighbours)
126 | log_file = utils.config_get(section, config, "log", None)
127 |
128 | for opt, val in opts:
129 | if opt in ("-i", "--input"):
130 | in_file = val
131 | elif opt in ("-o", "--output"):
132 | out_dir = val
133 | elif opt in ("-m", "--sim_measure"):
134 | sim_measure = val
135 | elif opt in ("-s", "--space"):
136 | spaces = val.split(",")
137 | elif opt in ("-n", "--no_neighbours"):
138 | no_neighbours = val
139 | elif opt in ("-l", "--log"):
140 | log_file = val
141 | elif opt in ("-h", "--help"):
142 | usage()
143 | sys.exit(0)
144 | else:
145 | usage(1)
146 |
147 | log_utils.config_logging(log_file)
148 |
149 | no_neighbours = int(no_neighbours)
150 |
151 | utils.assert_option_not_none(in_file, "Input file required", usage)
152 | utils.assert_option_not_none(out_dir, "Output directory required", usage)
153 | utils.assert_option_not_none(sim_measure, "Similarity measure required", usage)
154 | utils.assert_option_not_none(spaces, "Semantic space file required", usage)
155 |
156 | compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, spaces)
157 |
158 |
159 |
160 | if __name__ == '__main__':
161 | main(sys.argv)
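
The Example: slot in the usage string is empty; an invocation in the style the unit tests use (hypothetical paths, and src/ on PYTHONPATH as in exercise.sh) would be:

    from pipelines import compute_neighbours

    compute_neighbours.main(["compute_neighbours.py",
                             "-i", "/tmp/word_list.txt",       # hypothetical: one word per line
                             "-o", "/tmp/neighbours_out",
                             "-s", "/tmp/CORE_SS.space.pkl",   # hypothetical pickled Space
                             "-m", "cos",
                             "-n", "10"])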
--------------------------------------------------------------------------------
/src/pipelines/evaluate_similarities.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 17, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | '''
7 | Created on Oct 17, 2012
8 |
9 | @author: Georgiana Dinu, Pham The Nghia
10 | '''
11 |
12 | '''
13 | Created on Jun 12, 2012
14 |
15 | @author: thenghia.pham
16 | '''
17 |
18 |
19 | import sys
20 | import getopt
21 | import os
22 | from ConfigParser import ConfigParser
23 | from composes.utils import scoring_utils
24 | from composes.utils import log_utils
25 | import pipeline_utils as utils
26 |
27 | import logging
28 | logger = logging.getLogger("test vector space construction pipeline")
29 |
30 |
31 |
32 | def usage(errno=0):
33 | print >>sys.stderr,\
34 | """Usage:
35 | python evaluate_similarities.py [options] [config_file]
36 |
37 | Options:
38 | -i --input <file>: input file.
39 | --in_dir <dir>: input directory; all files that pass the --filter are tested.
40 | The -i value is then ignored. Optional.
41 | --filter <string>: when --in_dir is given, acts as a filter on the files to be
42 | tested: only files containing this substring are tested. Optional;
43 | by default all files in in_dir are tested.
44 | -m --correlation_measure <list(string)>: comma-separated correlation measures
45 | -c --columns <(int,int)>: pair of (1-based) columns, indicating which columns
46 | contain the gold scores and the predicted scores
47 | -l --log <file>: log file. Optional.
48 | -h --help : help
49 |
50 | Arguments:
51 | config_file: <file>, used as default values for the configuration options above.
52 | If you don't specify these options in [options] the value from the
53 | config_file will be used.
54 |
55 | Example:
56 | """
57 | sys.exit(errno)
58 |
59 | def evaluate_sim(in_file, columns, corr_measures):
60 |
61 | if not len(columns) == 2:
62 | raise ValueError("Column description unrecognized!")
63 | col0 = int(columns[0]) - 1
64 | col1 = int(columns[1]) - 1
65 |
66 | gold = []
67 | prediction = []
68 | with open(in_file) as in_stream:
69 | for line in in_stream:
70 | if not line.strip() == "":
71 | elems = line.strip().split()
72 | gold.append(float(elems[col0]))
73 | prediction.append(float(elems[col1]))
74 |
75 | for corr_measure in corr_measures:
76 | print "CORRELATION:%s" % corr_measure
77 | corr = scoring_utils.score(gold, prediction, corr_measure)
78 | print "\t%f" % corr
79 |
80 |
81 | def evaluate_sim_batch(in_dir, columns, corr_measures, filter_=""):
82 |
83 | if not os.path.exists(in_dir):
84 | raise ValueError("Input directory not found: %s" % in_dir)
85 |
86 | if not in_dir.endswith("/"):
87 | in_dir = in_dir + "/"
88 |
89 | for file_ in os.listdir(in_dir):
90 | if file_.find(filter_) != -1:
91 | print file_
92 | evaluate_sim(in_dir + file_, columns, corr_measures)
93 |
94 |
95 | def main(sys_argv):
96 | try:
97 | opts, argv = getopt.getopt(sys_argv[1:], "hi:m:c:l:",
98 | ["help", "input=", "correlation_measure=",
99 | "columns=", "log=", "in_dir=", "filter="])
100 |
101 | except getopt.GetoptError, err:
102 | print str(err)
103 | usage()
104 | sys.exit(1)
105 |
106 | in_file = None
107 | in_dir = None
108 | filter_ = ""
109 | corr_measures = None
110 | columns = None
111 | log_file = None
112 |
113 | section = "evaluate_similarities"
114 |
115 | if (len(argv) == 1):
116 | config_file = argv[0]
117 | config = ConfigParser()
118 | config.read(config_file)
119 | in_file = utils.config_get(section, config, "input", None)
120 | in_dir = utils.config_get(section, config, "in_dir", None)
121 | filter_ = utils.config_get(section, config, "filter", filter_)
122 | corr_measures = utils.config_get(section, config, "correlation_measure", None)
123 | if not corr_measures is None:
124 | corr_measures = corr_measures.split(",")
125 | columns = utils.config_get(section, config, "columns", None)
126 | if not columns is None:
127 | columns = columns.split(",")
128 | log_file = utils.config_get(section, config, "log", None)
129 |
130 | for opt, val in opts:
131 | if opt in ("-i", "--input"):
132 | in_file = val
133 | elif opt in ("-m", "--correlation_measure"):
134 | corr_measures = val.split(",")
135 | elif opt in ("-c", "--columns"):
136 | columns = val.split(",")
137 | elif opt == "--in_dir":
138 | in_dir = val
139 | elif opt == "--filter":
140 | filter_ = val
141 | elif opt in ("-l", "--log"):
142 | log_file = val
143 | elif opt in ("-h", "--help"):
144 | usage()
145 | sys.exit(0)
146 | else:
147 | usage(1)
148 |
149 | log_utils.config_logging(log_file)
150 |
151 | utils.assert_option_not_none(corr_measures, "Correlation measures required", usage)
152 | utils.assert_option_not_none(columns, "Columns to be read from input file required", usage)
153 |
154 | if len(columns) != 2:
155 | raise ValueError("Columns (-c) field should contain two comma-separated integers (e.g. -c 3,4)")
156 |
157 | if not in_dir is None:
158 | evaluate_sim_batch(in_dir, columns, corr_measures, filter_)
159 | else:
160 | utils.assert_option_not_none(in_file, "Input file required", usage)
161 | evaluate_sim(in_file, columns, corr_measures)
162 |
163 | if __name__ == '__main__':
164 | main(sys.argv)
165 |
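
Likewise for this pipeline, mirroring step 6 of exercise.sh (hypothetical input file, with gold scores in column 3 and predictions in column 4):

    from pipelines import evaluate_similarities

    evaluate_similarities.main(["evaluate_similarities.py",
                                "-i", "/tmp/SIMS.test.txt.cos",  # hypothetical sims file
                                "-m", "spearman,pearson",
                                "-c", "3,4"])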
--------------------------------------------------------------------------------
/src/pipelines/pipeline_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 20, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 |
7 | def assert_bool(option, message, usage):
8 | if option not in (True, False):
9 | print message
10 | usage(1)
11 |
12 | def assert_option_not_none(option, message, usage):
13 | if option is None:
14 | print message
15 | usage(1)
16 |
17 | def assert_xor_options(option1, option2, message, usage):
18 | if not ((option1 is None) ^ (option2 is None)):
19 | print message
20 | usage(1)
21 |
22 | def config_get(section, config, option, default):
23 | return config.get(section, option) if config.has_option(section, option) else default
24 |
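
config_get is what lets every pipeline fall back on its [section] in the optional config_file argument. A minimal sketch of how it is driven (hypothetical config path and contents):

    from ConfigParser import ConfigParser
    import pipeline_utils as utils

    config = ConfigParser()
    config.read("/tmp/neighbours.cfg")      # hypothetical file with a [compute_neighbours] section

    # returns the configured value, or the default when the option is absent
    print utils.config_get("compute_neighbours", config, "output", None)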
--------------------------------------------------------------------------------
/src/unitest/__init__.py:
--------------------------------------------------------------------------------
1 | current_file = __file__
2 | toolkit_dir = "/".join(current_file.split("/")[0:-3])
3 | data_dir = toolkit_dir + "/resource/unittest/"
--------------------------------------------------------------------------------
/src/unitest/bps_pipeline_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 18, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 |
8 | import numpy as np
9 |
10 | from pipelines import build_peripheral_space as bps
11 | from pipelines import build_core_space as bcs
12 | from composes.semantic_space.space import Space
13 |
14 | from unitest import data_dir
15 | import pytest
16 |
17 |
18 | class Test(unittest.TestCase):
19 |
20 | def setUp(self):
21 | self.dir_ = data_dir + "pipelines_test_resources/"
22 |
23 | def _test_equal_spaces_structs(self, sp, new_sp):
24 | self.assertListEqual(sp.id2row, new_sp.id2row)
25 | self.assertListEqual(sp.id2column, new_sp.id2column)
26 | self.assertDictEqual(sp.row2id, new_sp.row2id)
27 | self.assertDictEqual(sp.column2id, new_sp.column2id)
28 |
29 | def _test_equal_spaces_dense(self, sp, new_sp):
30 |
31 | self._test_equal_spaces_structs(sp, new_sp)
32 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat, new_sp.cooccurrence_matrix.mat, 6)
33 |
34 | def _test_equal_spaces_sparse(self, sp, new_sp):
35 |
36 | self._test_equal_spaces_structs(sp, new_sp)
37 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat.todense(), new_sp.cooccurrence_matrix.mat.todense(), 6)
38 |
39 | def test_raises(self):
40 | with pytest.raises(SystemExit):
41 | bps.main(["build_peripheral_space.py", "-h"])
42 |
43 | with pytest.raises(SystemExit):
44 | bps.main([
45 | "build_peripheral_space.py",
46 | "-l", '/tmp/test_build_peripheral_space.log',
47 | "-h",
48 | ])
49 |
50 | def tttest_simple_sparse_batch(self):
51 |
52 | bps.main(["build_peripheral_space.py",
53 | "-l", self.dir_ + "log1.txt",
54 | "-i", self.dir_ + "mat1",
55 | "-o", self.dir_,
56 | "--core_in_dir", self.dir_,
57 | "--core_filter", "CORE_SS.mat1.pkl",
58 | "--input_format", "sm",
59 | "--output_format", "sm"
60 | ])
61 |
62 | s1 = Space.build(data=self.dir_ + "mat1.sm",
63 | cols=self.dir_ + "mat1.cols",
64 | format="sm")
65 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm",
66 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols",
67 | format="sm")
68 | s3 = Space.build(data=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.sm",
69 | cols=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.cols",
70 | format="sm")
71 |
72 | self._test_equal_spaces_sparse(s1, s2)
73 | self._test_equal_spaces_sparse(s1, s3)
74 |
75 | def test_simple_sparse(self):
76 |
77 | bps.main(["build_peripheral_space.py",
78 | "-l", self.dir_ + "log1.txt",
79 | "-i", self.dir_ + "mat1",
80 | "-o", self.dir_,
81 | "-c", self.dir_ + "CORE_SS.mat1.pkl",
82 | "--input_format", "sm",
83 | "--output_format", "sm"
84 | ])
85 |
86 | s1 = Space.build(data=self.dir_ + "mat1.sm",
87 | cols=self.dir_ + "mat1.cols",
88 | format="sm")
89 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm",
90 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols",
91 | format="sm")
92 |
93 | self._test_equal_spaces_sparse(s1, s2)
94 |
95 | def test_simple_dense(self):
96 | bps.main(["build_peripheral_space.py",
97 | "-l", self.dir_ + "log1.txt",
98 | "-i", self.dir_ + "mat2",
99 | "-o", self.dir_,
100 | "-c", self.dir_ + "CORE_SS.mat2.pkl",
101 | "--input_format", "dm",
102 | "--output_format", "dm"
103 | ])
104 | s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm")
105 | s2 = Space.build(data=self.dir_ + "PER_SS.mat2.CORE_SS.mat2.dm", format="dm")
106 |
107 | self._test_equal_spaces_dense(s1, s2)
108 |
109 | def test_simple_ops(self):
110 |
111 | bcs.main(["build_core_space.py",
112 | "-l", self.dir_ + "log1.txt",
113 | "-i", self.dir_ + "mat3",
114 | "-w", "raw",
115 | "-s", "top_sum_3,top_length_3,top_sum_4",
116 | "-r", "svd_2,svd_1",
117 | "-o", self.dir_,
118 | "--input_format", "dm",
119 | "--output_format", "dm"
120 | ])
121 |
122 | core_mats = ["CORE_SS.mat3.raw.top_sum_3.svd_2",
123 | "CORE_SS.mat3.raw.top_sum_3.svd_1",
124 | "CORE_SS.mat3.raw.top_length_3.svd_2",
125 | "CORE_SS.mat3.raw.top_length_3.svd_1",
126 | "CORE_SS.mat3.raw.top_sum_4.svd_2",
127 | "CORE_SS.mat3.raw.top_sum_4.svd_1"
128 | ]
129 |
130 | core_spaces = [Space.build(data=self.dir_ + suffix + ".dm", format="dm") for suffix in core_mats]
131 |
132 | for i, core_mat in enumerate(core_mats):
133 | bps.main(["build_peripheral_space.py",
134 | "-l", self.dir_ + "log1.txt",
135 | "-i", self.dir_ + "mat3",
136 | "-o", self.dir_,
137 | "-c", self.dir_ + core_mat + ".pkl",
138 | "--input_format", "dm",
139 | "--output_format", "dm"
140 | ])
141 |
142 | s1 = core_spaces[i]
143 | data_file = self.dir_ + "PER_SS.mat3." + core_mats[i] + ".dm"
144 | s2 = Space.build(data=data_file, format="dm")
145 | self._test_equal_spaces_dense(s1, s2)
146 |
147 | bps.main(["build_peripheral_space.py",
148 | "-l", self.dir_ + "log1.txt",
149 | "-i", self.dir_ + "mat3",
150 | "-o", self.dir_,
151 | "-c", self.dir_ + core_mat + ".pkl",
152 | "--input_format", "sm",
153 | "--output_format", "dm"
154 | ])
155 |
156 | s1 = core_spaces[i]
157 | data_file = self.dir_ + "PER_SS.mat3." + core_mats[i] + ".dm"
158 | s2 = Space.build(data=data_file, format="dm")
159 |
160 | self._test_equal_spaces_dense(s1, s2)
161 |
--------------------------------------------------------------------------------
/src/unitest/conftest.py:
--------------------------------------------------------------------------------
1 |
2 | import py
3 | import pytest
4 |
5 |
6 | @pytest.fixture
7 | def toolkit_dir():
8 | return py.path.local(__file__).dirpath().join('..', '..')
9 |
10 |
11 | @pytest.fixture
12 | def data_dir(toolkit_dir):
13 | return toolkit_dir.join('resource', 'unittest')
14 |
15 |
16 | @pytest.fixture
17 | def config_dir(tmpdir):
18 | return tmpdir.mkdir('config')
19 |
20 |
21 | @pytest.fixture
22 | def pipelines_test_resources(data_dir):
23 | return data_dir.join('pipelines_test_resources')
24 |
25 |
26 | @pytest.fixture
27 | def sim_input(pipelines_test_resources):
28 | return str(pipelines_test_resources.join('sim_input.txt'))
29 |
30 |
--------------------------------------------------------------------------------
/src/unitest/crossvalidation_utils_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 9, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | from composes.utils.crossvalidation_utils import get_split_indices
8 |
9 | class Test(unittest.TestCase):
10 |
11 |
12 | def test_get_split_indices(self):
13 |
14 | test_cases = [(10, 3, 4), (9, 10, 1), (10, 10, 1), (109, 10, 11), (1, 1, 1)]
15 |
16 | for range_, fold, max_len in test_cases:
17 |
18 | indices = get_split_indices(range_, fold)
19 | self.assertGreaterEqual(fold, len(indices))
20 |
21 | for i in range(len(indices)):
22 | self.assertTrue(len(indices[i]) >= range_//fold or fold >= range_)
23 | self.assertGreaterEqual(max_len, len(indices[i]))
24 |
25 |
26 | indices = get_split_indices(10, 3)
27 | self.assertEqual(len(indices[0]), 4)
28 | self.assertEqual(len(indices[1]), 3)
29 | self.assertEqual(len(indices[2]), 3)
30 |
31 | if __name__ == "__main__":
32 | #import sys;sys.argv = ['', 'Test.test_get_split_indices']
33 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/dense_matrix_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 17, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | import numpy.testing
9 | from scipy.sparse import csr_matrix
10 | from scipy.sparse import csc_matrix
11 | from composes.matrix.sparse_matrix import SparseMatrix
12 | from composes.matrix.dense_matrix import DenseMatrix
13 |
14 | class TestDenseMatrix(unittest.TestCase):
15 |
16 |
17 | def setUp(self):
18 | self.a = np.array([[1,2,3],[4,0,5]])
19 | self.b = np.array([[0,0,0],[0,0,0]])
20 |
21 | self.c = np.array([[0,0],[0,0],[0,0]])
22 | self.d = np.array([[1,0],[0,1]])
23 | self.e = np.array([1,10])
24 | self.f = np.array([1,10,100])
25 |
26 | self.matrix_a = DenseMatrix(self.a)
27 | self.matrix_b = DenseMatrix(self.b)
28 |
29 | self.matrix_c = DenseMatrix(self.c)
30 | self.matrix_d = DenseMatrix(self.d)
31 |
32 | def tearDown(self):
33 | pass
34 |
35 | def test_init(self):
36 | nparr = self.a
37 | test_cases = [nparr,
38 | np.mat(nparr),
39 | csr_matrix(nparr),
40 | csc_matrix(nparr),
41 | SparseMatrix(nparr)]
42 |
43 | for inmat in test_cases:
44 | outmat = DenseMatrix(inmat)
45 | self.assertIsInstance(outmat.mat, np.matrix)
46 | numpy.testing.assert_array_equal(nparr, np.array(outmat.mat))
47 |
48 |
49 | def test_add(self):
50 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])),
51 | (self.matrix_a, self.matrix_b, self.matrix_a.mat)
52 | ]
53 |
54 |
55 | for (term1, term2, expected) in test_cases:
56 | sum_ = term1 + term2
57 | numpy.testing.assert_array_equal(sum_.mat, expected)
58 | self.assertIsInstance(sum_, type(term1))
59 |
60 | def test_add_raises(self):
61 | test_cases = [(self.matrix_a, self.a),
62 | (self.matrix_a, SparseMatrix(self.a))]
63 |
64 | for (term1, term2) in test_cases:
65 | self.assertRaises(TypeError, term1.__add__, term2)
66 |
67 | def test_div(self):
68 | test_cases = [(self.matrix_a, 2, np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])),
69 | (self.matrix_c, 2, np.mat(self.c))
70 | ]
71 |
72 | for (term1, term2, expected) in test_cases:
73 | sum_ = term1 / term2
74 | numpy.testing.assert_array_equal(sum_.mat, expected)
75 | self.assertIsInstance(sum_, DenseMatrix)
76 |
77 | def test_div_raises(self):
78 | test_cases = [(self.matrix_a, self.a, TypeError),
79 | (self.matrix_a, SparseMatrix(self.a), TypeError),
80 | (self.matrix_a, "3", TypeError),
81 | (self.matrix_a, 0, ZeroDivisionError)
82 | ]
83 |
84 | for (term1, term2, error_type) in test_cases:
85 | self.assertRaises(error_type, term1.__div__, term2)
86 |
87 |
88 | def test_mul(self):
89 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])),
90 | (self.matrix_d, self.matrix_a, self.matrix_a.mat),
91 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])),
92 | (2, self.matrix_a, np.mat([[2,4,6],[8,0,10]])),
93 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]])),
94 | (np.int64(2), self.matrix_a, np.mat([[2,4,6],[8,0,10]]))
95 | ]
96 |
97 | for (term1, term2, expected) in test_cases:
98 |             product = term1 * term2
99 |             numpy.testing.assert_array_equal(product.mat, expected)
100 |             self.assertIsInstance(product, DenseMatrix)
101 |
102 | def test_mul_raises(self):
103 | test_cases = [(self.matrix_a, self.a),
104 | (self.matrix_a, SparseMatrix(self.a)),
105 | (self.matrix_a, "3"),
106 | ("3", self.matrix_a)]
107 |
108 | for (term1, term2) in test_cases:
109 | self.assertRaises(TypeError, term1.__mul__, term2)
110 |
111 | def test_multiply(self):
112 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[1,4,9],[16,0,25]])),
113 | (self.matrix_a, self.matrix_b, np.mat(self.b))
114 | ]
115 |
116 | for (term1, term2, expected) in test_cases:
117 | mult1 = term1.multiply(term2)
118 | mult2 = term2.multiply(term1)
119 |
120 | numpy.testing.assert_array_equal(mult1.mat, expected)
121 | numpy.testing.assert_array_equal(mult2.mat, expected)
122 |
123 | self.assertIsInstance(mult1, DenseMatrix)
124 | self.assertIsInstance(mult2, DenseMatrix)
125 |
126 | def test_multiply_raises(self):
127 |
128 | test_cases = [(self.matrix_a, self.matrix_d, ValueError),
129 | (self.matrix_a, self.a, TypeError),
130 | (self.matrix_a, SparseMatrix(self.a), TypeError),
131 | ]
132 |
133 | for (term1, term2, error_type) in test_cases:
134 | self.assertRaises(error_type, term1.multiply, term2)
135 |
136 | def test_scale_rows(self):
137 | outcome = np.mat([[1,2,3],[40,0,50]])
138 | test_cases = [(self.matrix_a, self.e, outcome),
139 | (self.matrix_a, np.mat(self.e).T, outcome),
140 | ]
141 |
142 | for (term1, term2, expected) in test_cases:
143 | term1 = term1.scale_rows(term2)
144 | numpy.testing.assert_array_equal(term1.mat, expected)
145 |
146 | def test_scale_columns(self):
147 | test_cases = [(self.matrix_a, self.f, np.mat([[1,20,300],[4,0,500]]))]
148 |
149 | for (term1, term2, expected) in test_cases:
150 | term1 = term1.scale_columns(term2)
151 | numpy.testing.assert_array_equal(term1.mat, expected)
152 |
153 |
154 | def test_scale_raises(self):
155 | test_cases = [(self.matrix_a, self.f, ValueError, self.matrix_a.scale_rows),
156 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns),
157 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows),
158 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns),
159 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows),
160 | ]
161 | for (term1, term2, error_type, function) in test_cases:
162 | self.assertRaises(error_type, function, term2)
163 |
164 |
165 | def test_plog(self):
166 | m = DenseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]]))
167 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]])
168 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]])
169 | test_cases = [(self.matrix_a.copy(), a_expected),
170 | (m, m_expected)
171 | ]
172 |
173 | for (term, expected) in test_cases:
174 | term.plog()
175 | numpy.testing.assert_array_almost_equal(term.mat, expected, 3)
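        # A hedged note on plog, inferred from the expected values above rather
        # than from the matrix sources: it applies a "positive log" elementwise
        # and in place, i.e. entries x > 1 become log(x) (log(1.5) ~= 0.4054,
        # log(2) ~= 0.6931) and everything else becomes 0, roughly
        #   mat = np.where(mat > 1, np.log(mat), 0.0)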
176 |
177 | if __name__ == "__main__":
178 | #import sys;sys.argv = ['', 'Test.testName']
179 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/dilation_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 15, 2012
3 |
4 | @author: nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from composes.matrix.dense_matrix import DenseMatrix
9 | #from composes.composition.dilation_1 import DilationModel
10 | from composes.composition.dilation import Dilation
11 |
12 | class Test(unittest.TestCase):
13 |
14 | def setUp(self):
15 | self.m11 = DenseMatrix(np.mat([[4],[2]]))
16 | self.m21 = DenseMatrix(np.mat([[3],[6]]))
17 | #self.ph1 = DenseMatrix(np.mat([[5],[10]]))
18 | self.ph1 = DenseMatrix(np.mat([[80],[40]]))
19 |
20 | self.m12 = DenseMatrix(np.mat([[2,0],[3,0]]))
21 | self.m22 = DenseMatrix(np.mat([[3,3],[6,4]]))
22 | #self.ph2 = DenseMatrix(np.mat([[5,2],[10,5]]))
23 | self.ph2 = DenseMatrix(np.mat([[20,8],[90,45]]))
24 |
25 | self.m13 = DenseMatrix(np.mat([[4,3],[3,4]]))
26 | self.m23 = DenseMatrix(np.mat([[0,5],[0,5]]))
27 | #self.ph3 = DenseMatrix(np.mat([[12,14],[12,21]]))
28 | self.ph3 = DenseMatrix(np.mat([[300,350],[300,525]]))
29 |
30 | self.m14 = DenseMatrix(np.mat([[4,3],[3,4],[0,0]]))
31 | self.m24 = DenseMatrix(np.mat([[0,5],[0,5],[0,0]]))
32 | #self.ph4 = DenseMatrix(np.mat([[12,14],[12,21],[0,0]]))
33 | self.ph4 = DenseMatrix(np.mat([[300,350],[300,525],[0,0]]))
34 |
35 | self.m15 = DenseMatrix(np.mat([[2,0],[0,0],[3,0]]))
36 | self.m25 = DenseMatrix(np.mat([[3,3],[0,0],[6,4]]))
37 | #self.ph5 = DenseMatrix(np.mat([[5,2],[0,0],[10,5]]))
38 | self.ph5 = DenseMatrix(np.mat([[20,8],[0,0],[90,45]]))
39 |
40 | self.m16 = DenseMatrix(np.mat([[0,0],[0,0]]))
41 | self.m26 = DenseMatrix(np.mat([[0,0],[0,0]]))
42 | self.ph6 = DenseMatrix(np.mat([[0,0],[0,0]]))
43 |
44 | self.m17 = DenseMatrix(np.mat([[2,0],[3,0]]))
45 | self.m27 = DenseMatrix(np.mat([[0,1],[0,2]]))
46 | #self.ph7 = DenseMatrix(np.mat([[4,5],[5,4]]))
47 | self.ph7 = DenseMatrix(np.mat([[16,20],[45,36]]))
48 |
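    # A sketch of the model the fixtures above encode (my reading of the test
    # data, not taken from the Dilation source): composing argument rows u and v
    # with parameter lambda gives, per row,
    #   p = (u . u) * v + (lambda - 1) * (u . v) * u
    # For the 1-d pair u = 4, v = 3 above this is 48 + 48 * (lambda - 1) = 48 * lambda,
    # so ph1 = 80 recovers lambda = 5/3, as test_train_exact expects.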
49 | def test_train_exact(self):
50 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)),
51 | (self.m12, self.m22, self.ph2, 5 / (3.0)),
52 | (self.m13, self.m23, self.ph3, 6),
53 | (self.m14, self.m24, self.ph4, 6),
54 | (self.m15, self.m25, self.ph5, 5 / (3.0)),
55 | (self.m16, self.m26, self.ph6, 2),
56 | (self.m17, self.m27, self.ph7, 2)
57 | ]
58 |
59 | for arg1, arg2, phrase, lambda_ in test_cases:
60 | m = Dilation()
61 | m._solve(arg1, arg2, phrase)
62 | self.assertAlmostEqual(m._lambda, lambda_)
63 | #
64 | def test_compose_exact(self):
65 |
66 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)),
67 | (self.m13, self.m23, self.ph3, 6),
68 | (self.m14, self.m24, self.ph4, 6)
69 | ]
70 | for arg1, arg2, phrase, lambda_ in test_cases:
71 |
72 | m = Dilation()
73 | m._solve(arg1, arg2, phrase)
74 | res = m._compose(arg1, arg2)
75 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2)
76 |
77 | m = Dilation(lambda_)
78 | res = m._compose(arg1, arg2)
79 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2)
80 |
81 |
82 | def test_train_random(self):
83 | test_cases = [1.0,2.0,3.0]
84 | rows = 4
85 | cols = 3
86 | m1 = np.random.rand(rows,cols)
87 | m2 = np.random.rand(rows,cols)
88 |
89 |
90 | for lambda_ in test_cases:
91 | m = Dilation(lambda_)
92 | result_p = m._compose(DenseMatrix(m1), DenseMatrix(m2))
93 |
94 | m = Dilation()
95 | m._solve(DenseMatrix(m1),DenseMatrix(m2),result_p)
96 | self.assertAlmostEqual(lambda_, m._lambda)
97 |
98 |
99 | if __name__ == "__main__":
100 | #import sys;sys.argv = ['', 'Test.testName']
101 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/dimensionality_reduction_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 28, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from composes.transformation.dim_reduction.svd import Svd
9 | from composes.transformation.dim_reduction.nmf import Nmf
10 | from composes.matrix.linalg import Linalg
11 | from composes.matrix.dense_matrix import DenseMatrix
12 | from composes.matrix.sparse_matrix import SparseMatrix
13 |
14 | class DimReductionTest(unittest.TestCase):
15 |
16 |
17 | def setUp(self):
18 | pass
19 |
20 |
21 | def tearDown(self):
22 | pass
23 |
24 | def test_nmf(self):
25 | test_cases = [np.mat([[1,2,3],[2,4,6],[4,17,13]], dtype = np.double),
26 | np.mat([[1,0,0]], dtype = np.double)]
27 |
28 | for in_mat in test_cases:
29 | red = Nmf(2)
30 | d_mat = DenseMatrix(in_mat)
31 | #wd_init, hd_init = red.random_init(d_mat)
32 | wd_init, hd_init = red.v_col_init(d_mat)
33 |
34 | s_mat = SparseMatrix(in_mat)
35 | ws_init = SparseMatrix(wd_init)
36 | hs_init = SparseMatrix(hd_init)
37 |
38 | wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init)
39 | ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, hs_init)
40 |
41 | #TESTED IT AGAINST MATLAB IMPLEMENTATION - ALL GOOD
42 | #print wd_mat.mat
43 | #print hd_mat.mat
44 | #print ws_mat.mat.todense()
45 | #print hs_mat.mat.todense()
46 |             #print "V:", in_mat
47 |             #print "WH:", (ws_mat*hs_mat).mat.todense()
48 |
49 | np.testing.assert_array_almost_equal(wd_mat.mat,
50 | ws_mat.mat.todense(), 2)
51 | np.testing.assert_array_almost_equal(hd_mat.mat,
52 | hs_mat.mat.todense(), 2)
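        # In brief: NMF factors a nonnegative matrix V into nonnegative W and H
        # with V ~= W * H. The test seeds the dense and sparse code paths with
        # the same v_col_init initialisation and checks that both converge to
        # (almost) the same factors.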
53 |
54 | def test_svd(self):
55 | test_cases = [(DenseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])),
56 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0],
57 | [ 4.38544220e+00, 6.06349536e+00, 0],
58 | [ 6.76369708e+02, -4.91431927e-02, 0]]),
59 | np.mat([[0.0059,0.9979,0.0636],
60 | [0.3255,-0.0621,0.9434],
61 | [0.945,0.015,-0.325]]).transpose())]
62 |
63 |
64 |
65 | for x, us_expected, v_expected in test_cases:
66 |
67 | svd_red = Svd(2)
68 | us, transmat = svd_red.apply(x)
69 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2)
70 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2)
71 |
72 | svd_red = Svd(3)
73 | us, transmat = svd_red.apply(x)
74 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2)
75 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2)
76 |
77 | svd_red = Svd(6)
78 | us, transmat = svd_red.apply(x)
79 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2)
80 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2)
81 |
82 | svd_red = Svd(1)
83 | us, transmat = svd_red.apply(x)
84 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:1], 2)
85 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:1], 2)
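            # Note: this input has rank 2 (the second row is twice the first),
            # so its third singular value is 0; Svd(3) and Svd(6) can return at
            # most two informative dimensions, which is why they are checked
            # against the first two expected columns only.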
86 |
87 |
88 | test_cases = [(SparseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])),
89 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0],
90 | [ 4.38544220e+00, 6.06349536e+00, 0],
91 | [ 6.76369708e+02, -4.91431927e-02, 0]]),
92 | np.mat([[0.0059,0.9979,0.0636],
93 | [0.3255,-0.0621,0.9434],
94 | [0.945,0.015,-0.325]]).transpose())]
95 |
96 |
97 | for x, us_expected, v_expected in test_cases:
98 | us_expected = np.abs(us_expected)
99 | v_expected = np.abs(v_expected)
100 |
101 | svd_red = Svd(2)
102 | us, transmat = svd_red.apply(x)
103 | np.testing.assert_array_almost_equal(np.abs(us.mat.todense()), us_expected[:,0:2], 2)
104 | np.testing.assert_array_almost_equal(np.abs(transmat.mat.todense()), v_expected[:,0:2], 2)
105 |
106 | svd_red = Svd(3)
107 | us, transmat = svd_red.apply(x)
108 | np.testing.assert_array_almost_equal(np.abs(us.mat.todense()), us_expected[:,0:2], 2)
109 | np.testing.assert_array_almost_equal(np.abs(transmat.mat.todense()), v_expected[:,0:2], 2)
110 |
111 | svd_red = Svd(6)
112 | us, transmat = svd_red.apply(x)
113 | np.testing.assert_array_almost_equal(np.abs(us.mat.todense()), us_expected[:,0:2], 2)
114 | np.testing.assert_array_almost_equal(np.abs(transmat.mat.todense()), v_expected[:,0:2], 2)
115 |
116 | svd_red = Svd(1)
117 | us, transmat = svd_red.apply(x)
118 | np.testing.assert_array_almost_equal(np.abs(us.mat.todense()), us_expected[:,0:1], 2)
119 | np.testing.assert_array_almost_equal(np.abs(transmat.mat.todense()), v_expected[:,0:1], 2)
120 |
121 | if __name__ == "__main__":
122 | #import sys;sys.argv = ['', 'Test.test_svd']
123 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/es_pipeline_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 19, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | from unitest import data_dir
8 | from pipelines import evaluate_similarities as es
9 |
10 | class Test(unittest.TestCase):
11 |
12 |
13 | def setUp(self):
14 | self.dir_ = data_dir + "pipelines_test_resources/"
15 |
16 |
17 | def tearDown(self):
18 | pass
19 |
20 |
21 | def test_simple(self):
22 |
23 | es.main(["evaluate_similarities.py",
24 | "-l", self.dir_ + "log1.txt",
25 | "-i", self.dir_ + "pred1.txt",
26 | "-m", "pearson,spearman",
27 | "-c", "1,3",
28 | ])
29 |
30 | es.main(["evaluate_similarities.py",
31 | "-l", self.dir_ + "log1.txt",
32 | "-i", self.dir_ + "pred1.txt",
33 | "--in_dir", self.dir_,
34 | "--filter", "pred",
35 | "-m", "pearson,spearman",
36 | "-c", "1,3",
37 | ])
38 |
39 | if __name__ == "__main__":
40 | #import sys;sys.argv = ['', 'Test.testName']
41 | unittest.main()
42 |
--------------------------------------------------------------------------------
/src/unitest/matrix_utils_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 12, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from composes.matrix.dense_matrix import DenseMatrix
9 | from composes.matrix.sparse_matrix import SparseMatrix
10 | from composes.utils.matrix_utils import resolve_type_conflict
11 | from scipy.sparse import csr_matrix
12 |
13 | class Test(unittest.TestCase):
14 |
15 |
16 | def test_resolve_type_conflict(self):
17 |
18 | arr = np.mat([1,2])
19 |
20 | a = DenseMatrix(arr)
21 | b = SparseMatrix(arr)
22 |
23 | [c,d] = resolve_type_conflict([a,b], DenseMatrix)
24 | [e,f,g] = resolve_type_conflict([b,a,a], DenseMatrix)
25 | h = resolve_type_conflict([], DenseMatrix)
26 |
27 | [u,v] = resolve_type_conflict([arr, csr_matrix(arr)], DenseMatrix)
28 |
29 | self.assertIsInstance(c, DenseMatrix)
30 | self.assertIsInstance(d, DenseMatrix)
31 | self.assertIsInstance(e, DenseMatrix)
32 | self.assertIsInstance(f, DenseMatrix)
33 | self.assertIsInstance(g, DenseMatrix)
34 | self.assertListEqual([], h)
35 |
36 |
37 |
38 | self.assertIsInstance(u, DenseMatrix)
39 | self.assertIsInstance(v, DenseMatrix)
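        # In short: resolve_type_conflict converts every element of the input
        # list to the requested matrix class, also accepting raw numpy and
        # scipy.sparse inputs, and maps the empty list to an empty list.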
40 |
41 |
42 |
43 | if __name__ == "__main__":
44 | #import sys;sys.argv = ['', 'Test.testName']
45 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/model_export_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 16, 2012
3 |
4 | @author: nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from unitest import data_dir
9 | from composes.matrix.dense_matrix import DenseMatrix
10 | from composes.semantic_space.space import Space
11 |
12 | from composes.composition.weighted_additive import WeightedAdditive
13 | from composes.composition.full_additive import FullAdditive
14 | from composes.composition.dilation import Dilation
15 | from composes.composition.lexical_function import LexicalFunction
16 | from composes.exception.illegal_state_error import IllegalStateError
17 |
18 | class ModelExportingTest(unittest.TestCase):
19 |
20 | def setUp(self):
21 | self.prefix = data_dir + "output/model"
22 | def test_weighted_additive(self):
23 |
24 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]]))
25 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]]))
26 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]]))
27 | self.row = ["a", "b"]
28 | self.ft = ["f1","f2"]
29 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
30 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft)
31 | m = WeightedAdditive()
32 | m.export(self.prefix + ".add1")
33 | m.train([("a","a","a_a")], self.space1, self.space2)
34 | m.export(self.prefix + ".add2")
35 |
36 | def test_full_additive(self):
37 |
38 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]]))
39 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]]))
40 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]]))
41 | self.row = ["a", "b"]
42 | self.ft = ["f1","f2"]
43 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
44 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft)
45 | m = FullAdditive()
46 | self.assertRaises(IllegalStateError, m.export,self.prefix + ".full1")
47 | m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2)
48 |
49 | m.export(self.prefix + ".full2")
50 |
51 | def test_dilation(self):
52 |
53 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]]))
54 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]]))
55 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]]))
56 | self.row = ["a", "b"]
57 | self.ft = ["f1","f2"]
58 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
59 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft)
60 | m = Dilation()
61 | m.export(self.prefix + ".dil1")
62 | m.train([("a","b","a_b")], self.space1, self.space2)
63 | m.export(self.prefix + ".dil2")
64 |
65 | def test_lexical_function(self):
66 |
67 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]]))
68 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]]))
69 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]]))
70 | self.row = ["a", "b"]
71 | self.ft = ["f1","f2"]
72 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
73 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft)
74 | m = LexicalFunction()
75 | m._MIN_SAMPLES = 1
76 | self.assertRaises(IllegalStateError, m.export, self.prefix + ".lf1")
77 | m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2)
78 | m.export(self.prefix + ".lf2")
79 |
80 |
81 |
82 | if __name__ == "__main__":
83 | #import sys;sys.argv = ['', 'Test.test_weighted_additive']
84 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/neighbour_pipeline_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 19, 2012
3 |
4 | @author: thenghia.pham
5 | '''
6 | import unittest
7 | from unitest import data_dir
8 | from unitest import toolkit_dir
9 | import pipelines.compute_neighbours as find_neighbours
10 | from pipelines import build_core_space as bcs
11 |
12 | import pytest
13 |
14 |
15 | def read_neighbours_list(file_name):
16 | result = []
17 | word = None
18 | neighbours = []
19 | with open(file_name) as f:
20 | for line in f:
21 | line = line.strip()
22 | if (line != ""):
23 | elements = line.split()
24 |                 if len(elements) == 1:
25 |                     if word is not None:
26 |                         result.append((word, neighbours))
27 |                         neighbours = []
28 |                     # a bare token always begins a new word block
29 |                     word = elements[0]
30 | else:
31 | neighbours.append((elements[0],elements[1]))
32 |     if word is not None:
33 | result.append((word,neighbours))
34 | return result
35 |
36 |
37 | @pytest.mark.xfail(run=False)
38 | class NeighboursPipelineTest(unittest.TestCase):
39 |
40 |
41 | def setUp(self):
42 | self.dir_ = data_dir
43 | self.log_dir = toolkit_dir + "/log/"
44 |
45 | #create the spaces required in the tests
46 | bcs.main(["build_core_space.py",
47 | "-l", self.dir_ + "pipelines_test_resources/log1.txt",
48 | "-i", self.dir_ + "pipelines_test_resources/mat3",
49 | "-w", "raw",
50 | "-s", "top_sum_3",
51 | "-r", "svd_2",
52 | "-o", self.dir_ + "pipelines_test_resources/",
53 | "--input_format", "dm"
54 | ])
55 |
56 | def test_find_neighbours(self):
57 | """
58 | find_neighbours.main(["compute_neighbours.py",
59 | "-l", self.log_dir + "neighbours_log.txt",
60 | "-i", self.dir_ + "neighbours_input.txt",
61 | "-m", "dot_prod",
62 | "-n", "3",
63 | "-s", self.dir_ + "CORE_SS.mat3.raw.top_sum_3.svd_2.pkl",
64 | "-o", self.dir_
65 | ])
66 |
67 | find_neighbours.main(["compute_neighbours.py",
68 | "%sconfig/neighbours_config.cfg" %self.dir_
69 | ])
70 |
71 | find_neighbours.main(["compute_neighbours.py",
72 | "-m", "lin",
73 | "%sconfig/neighbours_config.cfg" %self.dir_
74 | ])
75 |
76 | find_neighbours.main(["compute_neighbours.py",
77 | "-m", "euclidean",
78 | "%sconfig/neighbours_config.cfg" %self.dir_
79 | ])
80 |
81 | find_neighbours.main(["compute_neighbours.py",
82 | "-m", "euclidean",
83 | "--space", "%sCORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%sCORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_),
84 | "%sconfig/neighbours_config.cfg" %self.dir_
85 | ])
86 | """
87 | find_neighbours.main(["compute_neighbours.py",
88 | "-m", "euclidean",
89 | "-n", "2",
90 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_),
91 | "%sconfig/neighbours_config.cfg" %self.dir_
92 | ])
93 | find_neighbours.main(["compute_neighbours.py",
94 | "-m", "euclidean",
95 | "-i", self.dir_ + "pipelines_test_resources/neighbours_input.txt",
96 | "-n", "2",
97 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_),
98 |                      "-o", "/home/georgianadinu/work/FAKE_PATH"
99 | ])
100 | #neighbours_list = read_neighbours_list(self.dir_ + "NEIGHBOURS.neighbours_input.txt.euclidean")
101 | #print len(neighbours_list)
102 |
103 |
104 | def tearDown(self):
105 | pass
106 |
107 |
108 | def testName(self):
109 | pass
110 |
111 |
112 | if __name__ == "__main__":
113 | #import sys;sys.argv = ['', 'Test.testName']
114 | unittest.main()
115 |
--------------------------------------------------------------------------------
/src/unitest/operation_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 26, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
9 | from composes.transformation.dim_reduction.svd import Svd
10 | from composes.transformation.dim_reduction.nmf import Nmf
11 | from composes.semantic_space.operation import ScalingOperation
12 | from composes.semantic_space.operation import DimensionalityReductionOperation
13 | from composes.matrix.dense_matrix import DenseMatrix
14 | from composes.matrix.sparse_matrix import SparseMatrix
15 | from composes.exception.illegal_state_error import IllegalStateError
16 |
17 |
18 | class Test(unittest.TestCase):
19 |
20 |
21 | def setUp(self):
22 | self.m1 = np.array([[1,2,3]])
23 | self.m2 = np.array([[3]])
24 | self.m3 = np.array([[4,2,6]])
25 | self.m4 = np.array([[2]])
26 |
27 | self.x = np.mat([[1,2,3],[2,4,6],[4,675,43]])
28 | self.us = np.mat([[ 2.19272110e+00, 3.03174768e+00],
29 | [ 4.38544220e+00, 6.06349536e+00],
30 | [ 6.76369708e+02, -4.91431927e-02]])
31 |
32 | self.xnmf = np.mat([[1,2,3],[2,4,6],[4,17,13]])
33 |
34 | def tearDown(self):
35 | pass
36 |
37 |
38 | def test_apply_dimensionality_reduction(self):
39 |
40 | test_cases =[(self.x, self.us)]
41 | red = Svd(2)
42 |
43 | for in_mat, expected_us_mat in test_cases:
44 | op = DimensionalityReductionOperation(red)
45 | tmp_mat = in_mat.copy()
46 |
47 | out_us_mat = op.apply(DenseMatrix(in_mat)).mat
48 | np.testing.assert_array_almost_equal(expected_us_mat, out_us_mat, 2)
49 |
50 | np.testing.assert_array_equal(in_mat, tmp_mat)
51 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat))
52 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat))
53 |
54 |
55 | def test_project_dimensionality_reduction(self):
56 |
57 | test_cases =[(self.x, self.us)]
58 | red = Svd(2)
59 |
60 | for in_mat, expected_us_mat in test_cases:
61 | op = DimensionalityReductionOperation(red)
62 | tmp_mat = in_mat.copy()
63 |
64 | self.assertRaises(IllegalStateError, op.project, DenseMatrix(in_mat))
65 |
66 |             op.apply(DenseMatrix(in_mat))  # apply once so project() becomes available; result unused
67 | out_proj_mat = op.project(DenseMatrix(in_mat)).mat
68 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat, 2)
69 |
70 | np.testing.assert_array_equal(in_mat, tmp_mat)
71 |
72 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat))
73 |
74 | out_proj_mat2 = op.project(DenseMatrix(in_mat)).mat
75 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat2, 2)
76 |
77 | def test_project_dimensionality_reduction_nmf(self):
78 |
79 | test_cases = [self.xnmf]
80 | red = Nmf(2)
81 |
82 | for in_mat in test_cases:
83 | d_in_mat = DenseMatrix(in_mat)
84 | op = DimensionalityReductionOperation(red)
85 | tmp_mat = in_mat.copy()
86 |
87 | self.assertRaises(IllegalStateError, op.project, d_in_mat)
88 |
89 | out_core_mat = op.apply(d_in_mat).mat
90 | out_proj_mat = op.project(d_in_mat).mat
91 | np.testing.assert_array_almost_equal(out_proj_mat, out_core_mat, 5)
92 |
93 | np.testing.assert_array_equal(in_mat, tmp_mat)
94 |
95 | self.assertRaises(IllegalStateError, op.apply, d_in_mat)
96 |
97 | out_proj_mat2 = op.project(d_in_mat).mat
98 | np.testing.assert_array_almost_equal(out_proj_mat2, out_core_mat, 5)
99 |
100 |
101 | def test_apply_weighting_operation(self):
102 | test_cases = [(self.m1, np.array([[0,0,0]])),
103 | (self.m2, np.array([[0]]))]
104 | w = PpmiWeighting()
105 | for in_mat, expected_mat in test_cases:
106 | op = ScalingOperation(w)
107 | tmp_mat = in_mat.copy()
108 | out_mat = op.apply(DenseMatrix(in_mat)).mat
109 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7)
110 | np.testing.assert_array_equal(in_mat, tmp_mat)
111 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat))
112 |
113 | def test_project_weighting_operation(self):
114 | test_cases = [(self.m1, self.m3,
115 | np.array([[0.69314718,0,0]])),
116 | (self.m2, self.m4, np.array([[0]]))]
117 | w = PpmiWeighting()
118 | for (core_mat, per_mat, expected_mat) in test_cases:
119 | op = ScalingOperation(w)
120 | tmp_mat = per_mat.copy()
121 |
122 | self.assertRaises(IllegalStateError, op.project,
123 | DenseMatrix(per_mat))
124 |
125 | op.apply(DenseMatrix(core_mat))
126 | out_mat = op.project(DenseMatrix(per_mat)).mat
127 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7)
128 | np.testing.assert_array_equal(per_mat, tmp_mat)
129 |
130 | out_mat = op.project(DenseMatrix(per_mat)).mat
131 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7)
132 |
133 | if __name__ == "__main__":
134 | #import sys;sys.argv = ['', 'Test.testName']
135 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/regression_learner_utils_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 9, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from composes.utils.regression_learner import RidgeRegressionLearner
9 | from composes.utils.regression_learner import LstsqRegressionLearner
10 | from composes.matrix.dense_matrix import DenseMatrix
11 | from composes.utils.matrix_utils import padd_matrix
12 |
13 | class Test(unittest.TestCase):
14 |
15 |
16 | def test_trivial_crossvalidation(self):
17 |
18 | for i in range(1, 10):
19 | m_a = DenseMatrix(np.mat(np.random.random((i + 1,4))))
20 | m_b = DenseMatrix(np.mat(np.random.random((i + 1,4))))
21 | tmp_a = m_a.mat.copy()
22 | tmp_b = m_b.mat.copy()
23 |
24 | learner = RidgeRegressionLearner(param_range=[3], intercept=False)
25 | solution = learner.train(m_a, m_b)
26 |
27 | learner2 = RidgeRegressionLearner(param = 3, intercept=False)
28 | solution2 = learner2.train(m_a, m_b)
29 |
30 | np.testing.assert_array_equal(tmp_a, m_a.mat)
31 | np.testing.assert_array_equal(tmp_b, m_b.mat)
32 | np.testing.assert_array_equal(solution.mat, solution2.mat)
33 |
34 | learner = RidgeRegressionLearner(param_range=[3], intercept=False)
35 | solution = learner.train(m_a, m_b)
36 |
37 | np.testing.assert_array_equal(tmp_a, m_a.mat)
38 | np.testing.assert_array_equal(tmp_b, m_b.mat)
39 | np.testing.assert_array_equal(solution.mat, solution2.mat)
40 |
41 | learner = RidgeRegressionLearner(param_range=[0], intercept=False)
42 | solution = learner.train(m_a, m_b)
43 |
44 | learner2 = LstsqRegressionLearner(intercept=False)
45 | solution2 = learner2.train(m_a, m_b)
46 |
47 | np.testing.assert_array_almost_equal(solution.mat, solution2.mat, 3)
48 |
49 |
50 | def test_crossvalidation(self):
51 |
52 | a = DenseMatrix(np.matrix([[1, 1],[2, 3],[4, 6]]))
53 | b = DenseMatrix(np.matrix([[12, 15, 18],[21, 27, 33],[35, 46, 57]]))
54 | res = DenseMatrix(np.matrix([[1, 2, 3],[4, 5, 6],[7, 8, 9]]))
55 |
56 | learner = RidgeRegressionLearner(intercept=True, param_range=[0])
57 | learner2 = LstsqRegressionLearner(intercept=False)
58 |
59 | res1 = learner2.train(a, b)
60 | res2 = learner.train(a, b)
61 |
62 | np.testing.assert_array_almost_equal(res2.mat[:-1,:], res[0:2,:].mat, 6)
63 | np.testing.assert_array_almost_equal(res2.mat[-1,:], res[2:3,:].mat, 6)
64 |
65 | new_a = padd_matrix(a, 1)
66 | self.assertGreater(((a * res1) - b).norm(), ((new_a * res2) - b).norm())
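        # My reading of the final check: padd_matrix(a, 1) appears to append a
        # bias column of ones, so new_a * res2 applies the learned weights plus
        # the intercept row of res2; the assertion then says that the intercept
        # model fits b strictly better than the no-intercept solution res1.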
67 |
68 |
69 | if __name__ == "__main__":
70 | #import sys;sys.argv = ['', 'Test.test_trivial_cases']
71 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/sparse_matrix_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 18, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | import numpy.testing
9 | from scipy.sparse import csr_matrix
10 | from scipy.sparse import csc_matrix
11 | from scipy.sparse.sputils import isintlike
12 | from composes.matrix.sparse_matrix import SparseMatrix
13 | from composes.matrix.dense_matrix import DenseMatrix
14 |
15 |
16 |
17 | class TestSparseMatrix(unittest.TestCase):
18 |
19 | def setUp(self):
20 | self.a = np.array([[1,2,3],[4,0,5]])
21 | self.b = np.array([[0,0,0],[0,0,0]])
22 |
23 | self.c = np.array([[0,0],[0,0],[0,0]])
24 | self.d = np.array([[1,0],[0,1]])
25 | self.e = np.array([1,10])
26 | self.f = np.array([1,10,100])
27 |
28 | self.matrix_a = SparseMatrix(self.a)
29 | self.matrix_b = SparseMatrix(self.b)
30 |
31 | self.matrix_c = SparseMatrix(self.c)
32 | self.matrix_d = SparseMatrix(self.d)
33 |
34 |
35 | def tearDown(self):
36 | pass
37 |
38 | def test_reshape(self):
39 |
40 | test_cases = [(self.matrix_a, (1,6), self.a.reshape((1,6))),
41 | (self.matrix_a, (3,2), self.a.reshape((3,2))),
42 | (self.matrix_b, (1,6), self.b.reshape((1,6))),
43 | (self.matrix_b, (6,1), self.b.reshape((6,1))),
44 | (self.matrix_b, (2,3), self.b.reshape((2,3))),
45 | ]
46 |
47 | for mat, shape, expected in test_cases:
48 | mat.reshape(shape)
49 | np.testing.assert_array_equal(mat.mat.todense(), expected)
50 | self.assertTupleEqual(shape, mat.shape)
51 |
52 |
53 | def test_reshape_raises(self):
54 |
55 | test_cases = [(3,0), (3,3), 3, (3,3,3), ("3","5"), (2,"4")]
56 |
57 | for shape in test_cases:
58 | self.assertRaises(ValueError, self.matrix_a.reshape, shape)
59 |
60 |
61 | def test_init(self):
62 | nparr = self.a
63 | test_cases = [nparr,
64 | np.mat(nparr),
65 | csr_matrix(nparr),
66 | csc_matrix(nparr),
67 | DenseMatrix(nparr)]
68 |
69 | for inmat in test_cases:
70 | outmat = SparseMatrix(inmat)
71 | self.assertIsInstance(outmat.mat, csr_matrix)
72 | numpy.testing.assert_array_equal(nparr,
73 | np.array(outmat.mat.todense()))
74 |
75 | def test_add(self):
76 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])),
77 | (self.matrix_a, self.matrix_b, np.mat(self.a))
78 | ]
79 |
80 | for (term1, term2, expected) in test_cases:
81 | sum_ = term1 + term2
82 | numpy.testing.assert_array_equal(sum_.mat.todense(), expected)
83 | self.assertIsInstance(sum_, type(term1))
84 |
85 | def test_add_raises(self):
86 | test_cases = [(self.matrix_a, self.a),
87 | (self.matrix_a, DenseMatrix(self.a))]
88 |
89 | for (term1, term2) in test_cases:
90 | self.assertRaises(TypeError, term1.__add__, term2)
91 |
92 | def test_mul(self):
93 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])),
94 | (self.matrix_d, self.matrix_a, self.matrix_a.mat.todense()),
95 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])),
96 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]]))
97 | ]
98 |
99 | for (term1, term2, expected) in test_cases:
100 |             product = term1 * term2
101 |             numpy.testing.assert_array_equal(product.mat.todense(), expected)
102 |             self.assertIsInstance(product, type(term1))
103 |
104 | def test_mul_raises(self):
105 | test_cases = [(self.matrix_a, self.a),
106 | (self.matrix_a, DenseMatrix(self.a)),
107 | (self.matrix_a, "3")]
108 |
109 | for (term1, term2) in test_cases:
110 | self.assertRaises(TypeError, term1.__mul__, term2)
111 |
112 | def test_get_item(self):
113 |
114 | out_mat = SparseMatrix(self.a)[0,:]
115 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:]))
116 |
117 | out_int = SparseMatrix(self.a)[0,1]
118 | self.assertEqual(out_int, 2)
119 |
120 | out_mat = SparseMatrix(self.a)[0,1:2]
121 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,1:2]))
122 |
123 | out_mat = SparseMatrix(self.a)[0]
124 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:]))
125 |
126 |
127 | def test_scale_rows(self):
128 | outcome = np.mat([[1,2,3],[40,0,50]])
129 | test_cases = [(self.matrix_a.copy(), self.e, outcome),
130 | (self.matrix_a.copy(), np.mat(self.e).T, outcome),
131 | ]
132 |
133 | for (term1, term2, expected) in test_cases:
134 | term1 = term1.scale_rows(term2)
135 | numpy.testing.assert_array_equal(term1.mat.todense(), expected)
136 |
137 | def test_scale_columns(self):
138 | test_cases = [(self.matrix_a.copy(), self.f, np.mat([[1,20,300],[4,0,500]]))]
139 |
140 | for (term1, term2, expected) in test_cases:
141 | term1 = term1.scale_columns(term2)
142 | numpy.testing.assert_array_equal(term1.mat.todense(), expected)
143 | self.assertIsInstance(term1.mat, csr_matrix)
144 |
145 | def test_scale_raises(self):
146 | test_cases = [(self.matrix_a, self.f, ValueError, self.matrix_a.scale_rows),
147 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns),
148 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows),
149 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns),
150 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows),
151 | ]
152 | for (term1, term2, error_type, function) in test_cases:
153 | self.assertRaises(error_type, function, term2)
154 |
155 | def test_plog(self):
156 | m = SparseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]]))
157 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]])
158 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]])
159 | test_cases = [(self.matrix_a.copy(), a_expected),
160 | (m, m_expected)
161 | ]
162 |
163 | for (term, expected) in test_cases:
164 | term.plog()
165 | numpy.testing.assert_array_almost_equal(term.mat.todense(), expected, 3)
166 |
167 | if __name__ == "__main__":
168 | #import sys;sys.argv = ['', 'Test.testName']
169 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/tc_pipeline_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 19, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | from unitest import data_dir
9 | import pipelines.train_composition as tc
10 | import pipelines.build_core_space as bcs
11 | from composes.utils import io_utils
12 | from composes.semantic_space.space import Space
13 |
14 | class Test(unittest.TestCase):
15 |
16 |
17 | def setUp(self):
18 | self.dir_ = data_dir + "pipelines_test_resources/"
19 |
20 | #use as a conversion tool, creates the files we want
21 | bcs.main(["build_core_space.py",
22 | "-l", self.dir_ + "log1.txt",
23 | "-i", self.dir_ + "N_mat",
24 | "-o", self.dir_,
25 | "--input_format", "dm",
26 | ])
27 |
28 | bcs.main(["build_core_space.py",
29 | "-l", self.dir_ + "log1.txt",
30 | "-i", self.dir_ + "AN_mat",
31 | "-o", self.dir_,
32 | "--input_format", "dm",
33 | ])
34 |
35 | def tearDown(self):
36 | pass
37 |
38 | def _test_equal_spaces_structs(self, sp, new_sp):
39 | self.assertListEqual(sp.id2row, new_sp.id2row)
40 | self.assertListEqual(sp.id2column, new_sp.id2column)
41 | self.assertDictEqual(sp.row2id, new_sp.row2id)
42 | self.assertDictEqual(sp.column2id, new_sp.column2id)
43 |
44 | def _test_equal_spaces_dense(self, sp, new_sp):
45 |
46 | self._test_equal_spaces_structs(sp, new_sp)
47 | np.testing.assert_array_equal(sp.cooccurrence_matrix.mat,
48 | new_sp.cooccurrence_matrix.mat)
49 |
50 | def _test_equal_spaces_sparse(self, sp, new_sp):
51 |
52 | self._test_equal_spaces_structs(sp, new_sp)
53 | np.testing.assert_array_equal(sp.cooccurrence_matrix.mat.todense(),
54 | new_sp.cooccurrence_matrix.mat.todense())
55 |
56 | def test_simple_lstsq_inter(self):
57 |
58 | tc.main(["train_composition.py",
59 | "-l", self.dir_ + "log1.txt",
60 | "-i", self.dir_ + "an_train_data.txt",
61 | "-o", self.dir_,
62 | "-m", "lexical_func",
63 | "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
64 | "-a", self.dir_ + "CORE_SS.N_mat.pkl",
65 | "-r", "lstsq",
66 | "--intercept", "True",
67 | "--export_params", "True",
68 | ])
69 |
70 | trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
71 | new_space = trained.function_space
72 |
73 | np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
74 | np.mat([[0.66666667,0.33333333,
75 | -0.33333333,0.33333333,
76 | 0.66666667,0.33333333]]),
77 | 7)
78 |
79 | self.assertTupleEqual(new_space.element_shape, (2,3))
80 | self.assertListEqual(new_space.id2row, ["big"])
81 | self.assertListEqual(new_space.id2column, [])
82 |
83 |
84 | a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
85 | format="dm")
86 |
87 | self._test_equal_spaces_dense(a_space, new_space)
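        # A hedged note on the shape checks above: with an intercept the learned
        # function for "big" is a 2x3 matrix (a 2x2 weight block plus an
        # intercept column) stored flattened as a single row of the function
        # space, with element_shape (2,3) preserving the matrix shape; the
        # no-intercept runs below learn the 2x2 identity, flattened to [1,0,0,1].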
88 |
89 |
90 | def test_simple_lstsq_no_inter(self):
91 | tc.main(["train_composition.py",
92 | "-l", self.dir_ + "log1.txt",
93 | "-i", self.dir_ + "an_train_data.txt",
94 | "-o", self.dir_,
95 | "-m", "lexical_func",
96 | "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
97 | "-a", self.dir_ + "CORE_SS.N_mat.pkl",
98 | "-r", "lstsq",
99 | "--intercept", "False",
100 | "--export_params", "True"
101 | ])
102 |
103 | trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
104 | new_space = trained.function_space
105 | np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
106 | np.mat([1,0,0,1]), 10)
107 | self.assertTupleEqual(new_space.element_shape, (2,2))
108 | self.assertListEqual(new_space.id2row, ["big"])
109 | self.assertListEqual(new_space.id2column, [])
110 |
111 | a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
112 | format="dm")
113 |
114 | self._test_equal_spaces_dense(a_space, new_space)
115 |
116 | tc.main(["train_composition.py",
117 | "-l", self.dir_ + "log1.txt",
118 | "-i", self.dir_ + "an_train_data.txt",
119 | "-o", self.dir_,
120 | "-m", "lexical_func",
121 | "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
122 | "-a", self.dir_ + "CORE_SS.N_mat.pkl",
123 | "-r", "ridge",
124 | "--lambda", "0",
125 | "--crossvalidation", "False",
126 | "--intercept", "False",
127 | "--export_params", "True"
128 | ])
129 |
130 | trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
131 | new_space2 = trained.function_space
132 | np.testing.assert_array_almost_equal(new_space2.cooccurrence_matrix.mat,
133 | np.mat([1,0,0,1]), 10)
134 | self.assertTupleEqual(new_space2.element_shape, (2,2))
135 | self.assertListEqual(new_space2.id2row, ["big"])
136 | self.assertListEqual(new_space2.id2column, [])
137 |
138 | a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
139 | format="dm")
140 |
141 | self._test_equal_spaces_dense(a_space, new_space2)
142 |
143 |
144 |
145 | if __name__ == "__main__":
146 | #import sys;sys.argv = ['', 'Test.testName']
147 | unittest.main()
148 |
--------------------------------------------------------------------------------
/src/unitest/utils_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 26, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | from composes.utils.space_utils import list2dict
8 | #from composes.utils.py_matrix_utils import coo
9 |
10 | class UtilsTest(unittest.TestCase):
11 |
12 |
13 | def test_list2dict(self):
14 | test_cases = [(["a","v","d"], {"a":0, "v":1, "d":2}), ([], {})]
15 |
16 | for list_, expected in test_cases:
17 | outcome = list2dict(list_)
18 | self.assertDictEqual(outcome, expected)
19 |
20 | self.assertRaises(ValueError, list2dict, ["a","v","a"])
21 |
22 | #def test_coo(self):
23 | # coo()
24 |
25 | if __name__ == "__main__":
26 | #import sys;sys.argv = ['', 'Test.testName']
27 | unittest.main()
--------------------------------------------------------------------------------
/src/unitest/weighting_test.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Sep 20, 2012
3 |
4 | @author: Georgiana Dinu, Pham The Nghia
5 | '''
6 | import unittest
7 | import numpy as np
8 | import numpy.testing
9 | from composes.matrix.dense_matrix import DenseMatrix
10 | from composes.matrix.sparse_matrix import SparseMatrix
11 | from composes.transformation.scaling.epmi_weighting import EpmiWeighting
12 | from composes.transformation.scaling.plog_weighting import PlogWeighting
13 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
14 | from composes.transformation.scaling.plmi_weighting import PlmiWeighting
15 | from composes.transformation.scaling.row_normalization import RowNormalization
16 | from composes.transformation.scaling.normalization import Normalization
17 |
18 | class Test(unittest.TestCase):
19 |
20 |
21 | def setUp(self):
22 | self.a = np.array([[1,2,3],[4,0,5]])
23 | self.b = np.array([[1,2,3]])
24 |
25 | self.c = np.array([[0,0],[0,0],[0,0]])
26 | self.d = np.array([[1,-1],[0,1]])
27 |
28 | self.e = np.array([[1,2,3],[1,0,0]])
29 | self.f = np.array([1,10,100])
30 |
31 |
32 |
33 | def tearDown(self):
34 | pass
35 |
36 |
37 | def single_case_test(self, matrix_, expected, w):
38 |
39 | matrix_copy = matrix_.copy()
40 | dm = DenseMatrix(matrix_)
41 | sm = SparseMatrix(matrix_)
42 |
43 | out1 = w.apply(dm)
44 | out2 = w.apply(sm)
45 |
46 | numpy.testing.assert_array_almost_equal(out1.mat, expected, 7)
47 | numpy.testing.assert_array_almost_equal(out2.mat.todense(), expected, 7)
48 |
49 | numpy.testing.assert_array_equal(dm.mat, matrix_copy)
50 | numpy.testing.assert_array_equal(matrix_, matrix_copy)
51 | numpy.testing.assert_array_equal(sm.mat.todense(), matrix_copy)
52 |
53 | def single_case_raises_test(self, matrix_, error_type, w):
54 | dm = DenseMatrix(matrix_)
55 | sm = SparseMatrix(matrix_)
56 |
57 | self.assertRaises(error_type, w.apply, dm)
58 | self.assertRaises(error_type, w.apply, sm)
59 |
60 |
61 | def test_epmi(self):
62 | w = EpmiWeighting()
63 | test_cases = [(self.b, np.mat([[1,1,1]])),
64 | (self.c, self.c)
65 | ]
66 | for matrix_, expected in test_cases:
67 | self.single_case_test(matrix_, expected, w)
68 |
69 | def test_plog(self):
70 | w = PlogWeighting()
71 | test_cases = [(np.mat([[1,1,1]]), np.mat([[0,0,0]])),
72 | (self.c, self.c)
73 | ]
74 | for matrix_, expected in test_cases:
75 | self.single_case_test(matrix_, expected, w)
76 |
77 | def test_ppmi(self):
78 | w = PpmiWeighting()
79 | test_cases = [(self.b, np.mat([[0,0,0]])),
80 | (self.c, self.c)
81 | ]
82 |
83 | for matrix_, expected in test_cases:
84 | self.single_case_test(matrix_, expected, w)
85 |
86 |
87 | def test_plmi(self):
88 | w = PlmiWeighting()
89 | test_cases = [(self.b, np.mat([[0,0,0]])),
90 | (self.c, self.c),
91 | (self.e, np.mat([[0.,0.30830136,0.46245204],
92 | [1.25276297,0.,0.]]))
93 | ]
94 |
95 | for matrix_, expected in test_cases:
96 | self.single_case_test(matrix_, expected, w)
97 |
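    # A worked reading of the weighting schemes above, derived from the expected
    # values rather than the scaling sources: with N the grand total, r_i the
    # row sums and c_j the column sums,
    #   epmi(i,j) = x_ij * N / (r_i * c_j)
    #   ppmi(i,j) = max(0, log(epmi(i,j)))
    #   plmi(i,j) = x_ij * ppmi(i,j)
    # e.g. for e = [[1,2,3],[1,0,0]]: x_01 = 2, N = 7, r_0 = 6, c_1 = 2, so
    # plmi = 2 * log(7/6) ~= 0.3083, matching the expected matrix in test_plmi.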
98 | def test_row_norm(self):
99 | w = RowNormalization()
100 | test_cases = [(self.b, np.mat([[0.26726124,0.53452248,0.80178373]])),
101 | (self.c, self.c),
102 | (self.e, np.mat([[0.26726124,0.53452248,0.80178373],
103 | [1.,0.,0.]]))
104 | ]
105 |
106 | for matrix_, expected in test_cases:
107 | self.single_case_test(matrix_, expected, w)
108 |
109 | w = RowNormalization(criterion = "length")
110 | test_cases = [(self.b, np.mat([[0.26726124,0.53452248,0.80178373]])),
111 | (self.c, self.c),
112 | (self.e, np.mat([[0.26726124,0.53452248,0.80178373],
113 | [1.,0.,0.]]))
114 | ]
115 |
116 | for matrix_, expected in test_cases:
117 | self.single_case_test(matrix_, expected, w)
118 |
119 | w = RowNormalization(criterion = "sum")
120 | test_cases = [(self.b, np.mat([[0.16666667,0.33333333,0.5]])),
121 | (self.c, self.c),
122 | (self.e, np.mat([[0.16666667,0.33333333,0.5],
123 | [1.,0.,0.]]))
124 | ]
125 |
126 | for matrix_, expected in test_cases:
127 | self.single_case_test(matrix_, expected, w)
128 |
129 | def test_norm(self):
130 | w = Normalization()
131 | test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
132 | (self.c, self.c),
133 | (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
134 | [1./7.0,0.,0.]]))
135 | ]
136 |
137 | for matrix_, expected in test_cases:
138 | self.single_case_test(matrix_, expected, w)
139 |
140 | w = Normalization(criterion = "length")
141 | test_cases = [(self.b, np.mat([[0.26726124,0.53452248,0.80178373]])),
142 | (self.c, self.c),
143 | (self.e, np.mat([[0.25819889,0.51639778,0.77459667],
144 | [0.25819889,0. ,0. ]]))
145 | ]
146 |
147 | for matrix_, expected in test_cases:
148 | self.single_case_test(matrix_, expected, w)
149 |
150 |
151 | w = Normalization(criterion = "sum")
152 | test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
153 | (self.c, self.c),
154 | (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
155 | [1./7.0,0.,0.]]))
156 | ]
157 |
158 | for matrix_, expected in test_cases:
159 | self.single_case_test(matrix_, expected, w)
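        # Hedged summary of the criteria exercised here: Normalization scales
        # the whole matrix at once ("sum", the default behaviour seen above,
        # divides by the grand total; "length" by the Frobenius norm), while
        # RowNormalization applies the same criteria one row at a time and
        # defaults to "length".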
160 |
161 | def test_epmi_raises(self):
162 | w = EpmiWeighting()
163 | test_cases = [(self.d, ValueError)]
164 |
165 | for matrix_, error_type in test_cases:
166 | self.single_case_raises_test(matrix_, error_type, w)
167 |
168 | def test_ppmi_raises(self):
169 | w = PpmiWeighting()
170 | test_cases = [(self.d, ValueError)]
171 |
172 | for matrix_, error_type in test_cases:
173 | self.single_case_raises_test(matrix_, error_type, w)
174 |
175 | def test_plmi_raises(self):
176 | w = PlmiWeighting()
177 | test_cases = [(self.d, ValueError)]
178 |
179 | for matrix_, error_type in test_cases:
180 | self.single_case_raises_test(matrix_, error_type, w)
181 |
182 |
183 | if __name__ == "__main__":
184 | #import sys;sys.argv = ['', 'Test.test_epmi']
185 | unittest.main()
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist = py27
8 |
9 | [testenv]
10 | sitepackages = True
11 | commands = python setup.py test
12 | deps =
13 | numpy
14 | Cython
15 |
--------------------------------------------------------------------------------