├── .gitignore ├── .project ├── .pydevproject ├── .travis.yml ├── README.rst ├── resource └── unittest │ ├── config │ ├── neighbours_config.cfg │ └── sim_config.cfg │ ├── pipelines_test_resources │ ├── AN_mat.dm │ ├── A_mat.dm │ ├── N_mat.dm │ ├── aan_train_data.txt │ ├── an_train_data.txt │ ├── config1.txt │ ├── mat1.col │ ├── mat1.cols │ ├── mat1.pickle │ ├── mat1.row │ ├── mat1.sm │ ├── mat1.sm.gz │ ├── mat2.dm │ ├── mat2.dm.gz │ ├── mat3.cols │ ├── mat3.dm │ ├── mat3.sm │ ├── na_train_data.txt │ ├── neighbours_input.txt │ ├── pred1.txt │ ├── pred2.txt │ └── sim_input.txt │ └── space_test_resources │ ├── col1.col │ ├── col2.col │ ├── col3.col │ ├── col4.col │ ├── col5.col │ ├── data1.cols │ ├── data1.dense │ ├── data1.sparse │ ├── data10.dense │ ├── data10.sparse │ ├── data2.cols │ ├── data2.dense │ ├── data2.sparse │ ├── data3.cols │ ├── data3.dense │ ├── data3.sparse │ ├── data4.cols │ ├── data4.dense │ ├── data4.sparse │ ├── data5.cols │ ├── data7.cols │ ├── data7.dense │ ├── data7.sparse │ ├── data8.dense │ ├── data8.sparse │ ├── data9.cols │ ├── data9.dense │ ├── data9.sparse │ ├── row1.row │ ├── row2.row │ ├── row3.row │ ├── tmp.col │ ├── tmp.cols │ ├── tmp.dm │ ├── tmp.row │ ├── tmp.rows │ └── tmp.sm ├── setup.py ├── src ├── composes │ ├── __init__.py │ ├── composition │ │ ├── __init__.py │ │ ├── composition_model.py │ │ ├── dilation.py │ │ ├── full_additive.py │ │ ├── lexical_function.py │ │ ├── multiplicative.py │ │ └── weighted_additive.py │ ├── exception │ │ ├── __init__.py │ │ ├── illegal_state_error.py │ │ └── invalid_argument_error.py │ ├── matrix │ │ ├── __init__.py │ │ ├── dense_matrix.py │ │ ├── linalg.py │ │ ├── matrix.py │ │ └── sparse_matrix.py │ ├── semantic_space │ │ ├── __init__.py │ │ ├── operation.py │ │ ├── peripheral_space.py │ │ └── space.py │ ├── similarity │ │ ├── __init__.py │ │ ├── cos.py │ │ ├── dot_prod.py │ │ ├── euclidean.py │ │ ├── lin.py │ │ └── similarity.py │ ├── transformation │ │ ├── __init__.py │ │ ├── dim_reduction │ │ 
│ ├── __init__.py │ │ │ ├── dimensionality_reduction.py │ │ │ ├── nmf.py │ │ │ └── svd.py │ │ ├── feature_selection │ │ │ ├── __init__.py │ │ │ ├── feature_selection.py │ │ │ └── top_feature_selection.py │ │ └── scaling │ │ │ ├── __init__.py │ │ │ ├── epmi_weighting.py │ │ │ ├── normalization.py │ │ │ ├── plmi_weighting.py │ │ │ ├── plog_weighting.py │ │ │ ├── ppmi_weighting.py │ │ │ ├── row_normalization.py │ │ │ └── scaling.py │ └── utils │ │ ├── __init__.py │ │ ├── crossvalidation_utils.py │ │ ├── gen_utils.py │ │ ├── io_utils.py │ │ ├── log_utils.py │ │ ├── matrix_utils.py │ │ ├── mem_utils.py │ │ ├── num_utils.py │ │ ├── py_matrix_utils.py │ │ ├── regression_learner.py │ │ ├── scoring_utils.py │ │ └── space_utils.py ├── examples │ ├── __init__.py │ ├── cmd_ex01.sh │ ├── cmd_ex02.sh │ ├── cmd_ex03.sh │ ├── cmd_ex04.sh │ ├── cmd_ex05.sh │ ├── cmd_ex06.sh │ ├── cmd_ex07.sh │ ├── data │ │ ├── in │ │ │ ├── config1.cfg │ │ │ ├── config2.cfg │ │ │ ├── data_to_comp.txt │ │ │ ├── data_to_comp2.txt │ │ │ ├── ex01.cols │ │ │ ├── ex01.rows │ │ │ ├── ex01.sm │ │ │ ├── ex05.cols │ │ │ ├── ex05.sm │ │ │ ├── ex10.cols │ │ │ ├── ex10.rows │ │ │ ├── ex10.sm │ │ │ ├── ex19-n.cols │ │ │ ├── ex19-n.sm │ │ │ ├── ex19-svo.cols │ │ │ ├── ex19-svo.sm │ │ │ ├── sim_data.txt │ │ │ ├── sim_data2.txt │ │ │ ├── sim_data3.txt │ │ │ ├── train_data.txt │ │ │ ├── word_list.txt │ │ │ ├── word_pairs1.txt │ │ │ ├── word_pairs2.txt │ │ │ └── word_sims.txt │ │ └── out │ │ │ ├── COMPOSED_SS.ex10.pkl │ │ │ ├── PER_SS.ex05.pkl │ │ │ ├── PHRASE_SS.ex10.pkl │ │ │ ├── ex01.cols │ │ │ ├── ex01.dm │ │ │ ├── ex01.pkl │ │ │ ├── ex01.rows │ │ │ ├── ex01.sm │ │ │ ├── ex10.pkl │ │ │ ├── model01.params │ │ │ └── model01.pkl │ ├── ex01.py │ ├── ex02.py │ ├── ex03.py │ ├── ex04.py │ ├── ex05.py │ ├── ex06.py │ ├── ex07.py │ ├── ex08.py │ ├── ex09.py │ ├── ex10.py │ ├── ex11.py │ ├── ex12.py │ ├── ex13.py │ ├── ex14.py │ ├── ex15.py │ ├── ex16.py │ ├── ex17.py │ ├── ex18.py │ ├── ex19.py │ ├── ex20.py │ ├── 
exercise.sh │ └── full_example.py ├── pipelines │ ├── __init__.py │ ├── apply_composition.py │ ├── build_core_space.py │ ├── build_peripheral_space.py │ ├── compute_neighbours.py │ ├── compute_similarities.py │ ├── evaluate_similarities.py │ ├── pipeline_utils.py │ └── train_composition.py └── unitest │ ├── __init__.py │ ├── ac_pipeline_test.py │ ├── bcs_pipeline_test.py │ ├── bps_pipeline_test.py │ ├── conftest.py │ ├── crossvalidation_utils_test.py │ ├── dense_matrix_test.py │ ├── dilation_test.py │ ├── dimensionality_reduction_test.py │ ├── es_pipeline_test.py │ ├── feat_selection_test.py │ ├── full_aditive_test.py │ ├── lexical_function_test.py │ ├── linalg_test.py │ ├── matrix_utils_test.py │ ├── model_export_test.py │ ├── neighbour_pipeline_test.py │ ├── operation_test.py │ ├── peripheral_space_test.py │ ├── regression_learner_utils_test.py │ ├── sim_pipeline_test.py │ ├── similarity_test.py │ ├── space_test.py │ ├── sparse_matrix_test.py │ ├── tc_pipeline_test.py │ ├── utils_test.py │ ├── weighted_additive_test.py │ └── weighting_test.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | resource/unittest/pipelines_test_resources/ 4 | 5 | .Python 6 | .coverage 7 | 8 | .tox/ 9 | 10 | *.egg/ 11 | src/dissect.egg-info/ 12 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | gittoolkit 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | Default 4 | python 2.7 5 | 6 | /gittoolkit/src 7 | /gittoolkit/src 8 | 9 | 10 | -------------------------------------------------------------------------------- 
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | before_install: 5 | - sudo apt-get update -qq 6 | - sudo apt-get install -qq python-numpy python-scipy python-matplotlib 7 | - rm /home/travis/virtualenv/python2.7/lib/python2.7/no-global-site-packages.txt 8 | # command to install dependencies 9 | install: 10 | - pip install cython --use-mirrors 11 | - pip install . --use-mirrors 12 | # command to run tests 13 | script: python setup.py test 14 | after_success: 15 | - sudo apt-get install python-yaml 16 | - pip install coveralls pytest-cov . --use-mirrors 17 | - py.test --cov=composes --cov=pipelines --cov-report=term-missing src/unitest 18 | - coveralls 19 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DIStributional SEmantics Composition Toolkit 2 | ============================================ 3 | 4 | 5 | For documentation, please, refer to http://clic.cimec.unitn.it/composes/toolkit/ 6 | -------------------------------------------------------------------------------- /resource/unittest/config/neighbours_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration file for similarity pipeline 2 | [compute_neighbours] 3 | 4 | #input file 5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/neighbours_input.txt 6 | 7 | # similarity measure 8 | sim_measure=cos 9 | 10 | # output directory 11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ 12 | 13 | # space file(s) 14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle,/home/thenghia.pham/git/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.pickle 15 | 16 | # number of neighbours 17 | no_neighbours=3 18 | 19 | # log file 20 | 
log=/home/georgianadinu/work/localtoolkit/toolkit/log/sim_log.txt -------------------------------------------------------------------------------- /resource/unittest/config/sim_config.cfg: -------------------------------------------------------------------------------- 1 | # configuration file for similarity pipeline 2 | [compute_similarities] 3 | 4 | #input file 5 | input=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/sim_input.txt 6 | 7 | # similarity measure 8 | sim_measure=cos,dot_prod,lin 9 | 10 | # output directory 11 | output=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ 12 | 13 | # space file(s) 14 | space=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl,/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/CORE_SS.mat3.raw.top_sum_3.svd_2.all.pkl 15 | # columns 16 | columns=0,1 17 | 18 | # log file 19 | log=/home/georgianadinu/work/localtoolkit/toolkit/resource/unittest/ -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/AN_mat.dm: -------------------------------------------------------------------------------- 1 | big_car 3 4 2 | big_man 5 6 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/A_mat.dm: -------------------------------------------------------------------------------- 1 | big 3 4 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/N_mat.dm: -------------------------------------------------------------------------------- 1 | car 3 4 2 | man 5 6 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/aan_train_data.txt: -------------------------------------------------------------------------------- 1 | big big_car big_big_car 2 | big big_man 
big_big_man -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/an_train_data.txt: -------------------------------------------------------------------------------- 1 | big car big_car 2 | big man big_man 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/config1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/config1.txt -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.col: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.pickle: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'red' 17 | p8 18 | asS'_column2id' 19 | p9 20 | (dp10 21 | S'car' 22 | p11 23 | I0 24 | ssS'_operations' 25 | p12 26 | (lp13 27 | sS'_id2column' 28 | p14 29 | (lp15 30 | g11 31 | asS'_element_shape' 32 | p16 33 | (I1 34 | tp17 35 | sS'_cooccurrence_matrix' 36 | p18 37 | g0 38 | (ccomposes.matrix.sparse_matrix 39 | SparseMatrix 40 | p19 41 | g2 42 | Ntp20 43 | Rp21 44 | (dp22 45 | S'_mat' 46 | p23 47 | g0 48 | (cscipy.sparse.csr 49 | csr_matrix 50 | p24 51 | g2 52 | 
Ntp25 53 | Rp26 54 | (dp27 55 | S'format' 56 | p28 57 | S'csr' 58 | p29 59 | sS'_shape' 60 | p30 61 | (I1 62 | I1 63 | tp31 64 | sS'indptr' 65 | p32 66 | cnumpy.core.multiarray 67 | _reconstruct 68 | p33 69 | (cnumpy 70 | ndarray 71 | p34 72 | (I0 73 | tp35 74 | S'b' 75 | p36 76 | tp37 77 | Rp38 78 | (I1 79 | (I2 80 | tp39 81 | cnumpy 82 | dtype 83 | p40 84 | (S'i4' 85 | p41 86 | I0 87 | I1 88 | tp42 89 | Rp43 90 | (I3 91 | S'<' 92 | p44 93 | NNNI-1 94 | I-1 95 | I0 96 | tp45 97 | bI00 98 | S'\x00\x00\x00\x00\x01\x00\x00\x00' 99 | p46 100 | tp47 101 | bsS'indices' 102 | p48 103 | g33 104 | (g34 105 | (I0 106 | tp49 107 | g36 108 | tp50 109 | Rp51 110 | (I1 111 | (I1 112 | tp52 113 | g43 114 | I00 115 | S'\x00\x00\x00\x00' 116 | p53 117 | tp54 118 | bsS'maxprint' 119 | p55 120 | I50 121 | sS'data' 122 | p56 123 | g33 124 | (g34 125 | (I0 126 | tp57 127 | g36 128 | tp58 129 | Rp59 130 | (I1 131 | (I1 132 | tp60 133 | g40 134 | (S'f8' 135 | p61 136 | I0 137 | I1 138 | tp62 139 | Rp63 140 | (I3 141 | S'<' 142 | p64 143 | NNNI-1 144 | I-1 145 | I0 146 | tp65 147 | bI00 148 | S'\x00\x00\x00\x00\x00\x00\x08@' 149 | p66 150 | tp67 151 | bsbsbsS'_row2id' 152 | p68 153 | (dp69 154 | g8 155 | I0 156 | ssb. 
-------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.row: -------------------------------------------------------------------------------- 1 | red 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.sm: -------------------------------------------------------------------------------- 1 | red car 3.000000 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat1.sm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat1.sm.gz -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat2.dm: -------------------------------------------------------------------------------- 1 | car 3 4 5 2 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat2.dm.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/pipelines_test_resources/mat2.dm.gz -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.cols: -------------------------------------------------------------------------------- 1 | f1 2 | f2 3 | f3 4 | f4 5 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.dm: -------------------------------------------------------------------------------- 1 | a 1 2 3 1 2 | b 2 4 6 1 3 | c 4 675 43 1 4 | 
-------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/mat3.sm: -------------------------------------------------------------------------------- 1 | a f1 1 2 | a f2 2 3 | a f3 3 4 | a f4 1 5 | b f1 2 6 | b f2 4 7 | b f3 6 8 | b f4 1 9 | c f1 4 10 | c f2 675 11 | c f3 43 12 | c f4 1 13 | 14 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/na_train_data.txt: -------------------------------------------------------------------------------- 1 | car big big_car 2 | man big big_man 3 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/neighbours_input.txt: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | c -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/pred1.txt: -------------------------------------------------------------------------------- 1 | 23 car 23 sdrs 2 | 4 man 4 sdfs 3 | 13 cad 13 sfd 4 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/pred2.txt: -------------------------------------------------------------------------------- 1 | 23 car 23 sdrs 2 | 4 man 4 sdfs 3 | 13 cad 13 sfd 4 | -------------------------------------------------------------------------------- /resource/unittest/pipelines_test_resources/sim_input.txt: -------------------------------------------------------------------------------- 1 | a b 1 2 | a c 0 3 | a a 1 4 | b c 1 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col1.col: -------------------------------------------------------------------------------- 1 | man 2 | car -------------------------------------------------------------------------------- 
/resource/unittest/space_test_resources/col2.col: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col3.col: -------------------------------------------------------------------------------- 1 | man 2 | car 3 | man 4 | car 5 | car 6 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col4.col: -------------------------------------------------------------------------------- 1 | airplane 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/col5.col: -------------------------------------------------------------------------------- 1 | man sdrf 2 | car 3 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.dense: -------------------------------------------------------------------------------- 1 | red 3 5 2 | blue 0 10 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data1.sparse: -------------------------------------------------------------------------------- 1 | red car 3 2 | red man 5 3 | blue man 10 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data10.dense: -------------------------------------------------------------------------------- 1 | car man 3 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data10.sparse: 
-------------------------------------------------------------------------------- 1 | man car car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.dense: -------------------------------------------------------------------------------- 1 | red 3 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data2.sparse: -------------------------------------------------------------------------------- 1 | red car 3 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.dense: -------------------------------------------------------------------------------- 1 | red 5 0 2 | red 10 0 3 | blue 0 6 4 | 5 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data3.sparse: -------------------------------------------------------------------------------- 1 | red car 5 2 | red car 10 3 | blue man 6 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.cols: -------------------------------------------------------------------------------- 1 | car 2 | man 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.dense: -------------------------------------------------------------------------------- 1 
| red 5 0 2 | blue 0 6 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data4.sparse: -------------------------------------------------------------------------------- 1 | red car 5 2 | blue man 6 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data5.cols: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data5.cols -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.dense: -------------------------------------------------------------------------------- 1 | red 0 -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data7.sparse: -------------------------------------------------------------------------------- 1 | red car 0 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data8.dense: -------------------------------------------------------------------------------- 1 | car 3 5 6 2 | man 3 5 3 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data8.sparse: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/data8.sparse 
-------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.cols: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.dense: -------------------------------------------------------------------------------- 1 | car 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/data9.sparse: -------------------------------------------------------------------------------- 1 | man car 4 5 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row1.row: -------------------------------------------------------------------------------- 1 | red 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row2.row: -------------------------------------------------------------------------------- 1 | blue 2 | red -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/row3.row: -------------------------------------------------------------------------------- 1 | blue 2 | red 3 | blue 4 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.col: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.col -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.cols: -------------------------------------------------------------------------------- 1 | f1 2 | f2 3 | 
-------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.dm: -------------------------------------------------------------------------------- 1 | a 0 0 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.row: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.row -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.rows: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /resource/unittest/space_test_resources/tmp.sm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/resource/unittest/space_test_resources/tmp.sm -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | from setuptools import setup 5 | from setuptools.command.test import test as TestCommand 6 | 7 | 8 | class PyTest(TestCommand): 9 | def finalize_options(self): 10 | TestCommand.finalize_options(self) 11 | self.test_args = 'src/unitest' 12 | self.test_suite = True 13 | 14 | def run_tests(self): 15 | #import here, cause outside the eggs aren't loaded 16 | import pytest 17 | errno = pytest.main(self.test_args) 18 | sys.exit(errno) 19 | 20 | 21 | setup( 22 | name='dissect', 23 | version='0.1.0', 24 | description='COMPOSES DISSECT TOOLKIT', 25 | author='Georgiana Dinu, The Nghia Pham, Marco Baroni', 26 | 
author_email='georgiana.dinu@unitn.it,thenghia.pham@unitn.it', 27 | url='http://http://clic.cimec.unitn.it/composes/toolkit/', 28 | install_requires=['numpy', 'scipy', 'sparsesvd'], 29 | tests_require=['pytest>=2.4.2'], 30 | cmdclass={'test': PyTest}, 31 | package_dir={'': 'src'}, 32 | packages=[ 33 | 'composes', 34 | 'composes.composition', 35 | 'composes.matrix', 36 | 'composes.semantic_space', 37 | 'composes.exception', 38 | 'composes.similarity', 39 | 'composes.transformation', 40 | 'composes.utils', 41 | 'composes.transformation.dim_reduction', 42 | 'composes.transformation.feature_selection', 43 | 'composes.transformation.scaling', 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /src/composes/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class NullHandler(logging.Handler): 4 | """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" 5 | def emit(self, record): 6 | pass 7 | 8 | logger = logging.getLogger(__name__) 9 | if len(logger.handlers) == 0: # To ensure reload() doesn't add another one 10 | logger.addHandler(NullHandler()) 11 | 12 | #logging.basicConfig(filename='composes.log', filemode='w+',level=logging.DEBUG, format = "") 13 | -------------------------------------------------------------------------------- /src/composes/composition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/composition/__init__.py -------------------------------------------------------------------------------- /src/composes/composition/dilation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import numpy as np 7 | from composition_model import 
CompositionModel 8 | from composes.utils.num_utils import is_numeric 9 | from composes.utils.py_matrix_utils import nonzero_invert 10 | 11 | 12 | class Dilation(CompositionModel): 13 | """ 14 | Implements the dilation compositional model: 15 | 16 | :math:`\\vec{p} = (\\vec{u} \\cdot \\vec{u}) \\vec{v} + (\\lambda - 1) (\\vec{u} \\cdot \\vec{v}) \\vec{u}` 17 | 18 | where :math:`\\vec{p}` is the vector of the composed phrase, :math:`\\vec{u}, \\vec{v}` the vectors of the components 19 | and :math:`\\lambda` is a scalar. 20 | 21 | """ 22 | 23 | 24 | _name = "dilation" 25 | 26 | _lambda = 2 27 | 28 | 29 | def __init__(self, lambda_=None): 30 | """ 31 | Constructor. 32 | 33 | Args: 34 | lambda_ : numeric, value of the lambda parameter. Optional. 35 | """ 36 | 37 | if not lambda_ is None: 38 | if not is_numeric(lambda_): 39 | raise ValueError("Parameter not numeric: %s " %(type(lambda_))) 40 | else: 41 | self._lambda = lambda_ 42 | 43 | def _solve(self, arg1_mat, arg2_mat, phrase_mat): 44 | 45 | v1_row_norms = arg1_mat.norm(1) 46 | v1_row_sqr_norms = np.multiply(v1_row_norms, v1_row_norms) 47 | 48 | v2_minus_p = arg2_mat.scale_rows(v1_row_sqr_norms) - phrase_mat 49 | v1_dot_prod_v2_minus_p = arg1_mat.multiply(v2_minus_p).sum(1) 50 | 51 | v1_v2 = arg1_mat.multiply(arg2_mat).sum(1) 52 | v1_v2_sqr = np.multiply(v1_v2, v1_v2) 53 | 54 | nom = np.multiply(v1_v2_sqr, v1_row_sqr_norms).sum() 55 | denom = np.multiply(v1_v2, v1_dot_prod_v2_minus_p).sum() 56 | 57 | if nom != 0: 58 | self._lambda = 1 - denom/nom 59 | else: 60 | self._lambda = 2 61 | 62 | 63 | def _compose(self, arg1_mat, arg2_mat): 64 | # TO DO: this is inefficient here, we do 2 for s instead of one 65 | # we do a for in get_rows in parent.compose() and a for here 66 | # comp = ((self._lambda -1) * v1.multiply(v2).sum()/pow(v1.norm(),2)) * v1 + v2 67 | 68 | v1_row_norms = arg1_mat.norm(1) 69 | scale_factors1 = arg1_mat.multiply(arg2_mat).sum(1) 70 | scale_factors2 = np.multiply(v1_row_norms, v1_row_norms) 71 | 72 | 
arg1_mat_scaled = arg1_mat.scale_rows(scale_factors1) 73 | arg2_mat_scaled = arg2_mat.scale_rows(scale_factors2) 74 | 75 | #print "FACTORS u:", ((self._lambda -1)*scale_factors1).sum()/float(len(scale_factors1)) 76 | #print "FACTORS v:", (scale_factors2).sum()/float(len(scale_factors2)) 77 | 78 | result = (self._lambda - 1) * arg1_mat_scaled + arg2_mat_scaled 79 | 80 | return result 81 | 82 | def get_lambda(self): 83 | return self._lambda 84 | """ 85 | Lambda parameter. Default, set to lambda=2. 86 | """ 87 | 88 | 89 | def _export(self, filename): 90 | with open(filename, "w") as output_stream: 91 | output_stream.write("lambda\t%f" % self._lambda) 92 | -------------------------------------------------------------------------------- /src/composes/composition/full_additive.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from composition_model import CompositionModel 8 | from composes.utils.gen_utils import assert_is_instance 9 | from composes.utils.matrix_utils import is_array_or_matrix 10 | from composes.utils.matrix_utils import padd_matrix 11 | from composes.utils.matrix_utils import to_compatible_matrix_types 12 | from composes.utils.regression_learner import LstsqRegressionLearner 13 | from composes.utils.regression_learner import RegressionLearner 14 | from composes.utils.matrix_utils import resolve_type_conflict 15 | from composes.matrix.dense_matrix import DenseMatrix 16 | from composes.exception.illegal_state_error import IllegalStateError 17 | 18 | 19 | class FullAdditive(CompositionModel): 20 | """ 21 | Implements the full additive compositional model: 22 | 23 | :math:`\\vec{p} = A \\vec{u} + B \\vec{v}` 24 | 25 | where :math:`\\vec{p}` is the vector of the composed phrase, 26 | :math:`\\vec{u}, \\vec{v}`, the vectors of the components 27 | and :math:`A`, :math:`B` are two matrices. 
28 | 29 | """ 30 | _name = "full_additive" 31 | _mat_a_t = None 32 | _mat_b_t = None 33 | 34 | 35 | def __init__(self, A=None, B=None, learner=LstsqRegressionLearner()): 36 | #TODO here; very important, should be able to set the intercept 37 | #when mat a and mat b are given , to true or false. now by default is 38 | #is false 39 | """ 40 | Constructor. 41 | 42 | Args: 43 | A= : matrix A, of matrix-like type (Matrix, ndarray, 44 | numpy matrix, scipy matrix). Optional (parameters can be set 45 | through training.) 46 | 47 | B= : matrix B, matrix-like type. Optional. 48 | 49 | learner= : regression learner object, of type RegressionLearner. 50 | Optional, default LstsqRegressionLearner. 51 | """ 52 | if A is not None and B is not None: 53 | mat_a = A 54 | mat_b = B 55 | if not is_array_or_matrix(mat_a): 56 | raise TypeError("expected matrix type, received: %s" 57 | % type(mat_a)) 58 | 59 | if not is_array_or_matrix(mat_b): 60 | raise TypeError("expected matrix type, received: %s" 61 | % type(mat_b)) 62 | 63 | mat_a, mat_b = to_compatible_matrix_types(mat_a, mat_b) 64 | self._mat_a_t = mat_a.transpose() 65 | self._mat_b_t = mat_b.transpose() 66 | self._has_intercept = False 67 | 68 | else: 69 | self._regression_learner = learner 70 | self._has_intercept = self._regression_learner.has_intercept() 71 | 72 | 73 | def _solve(self, arg1_mat, arg2_mat, phrase_mat): 74 | 75 | self._has_intercept = self._regression_learner.has_intercept() 76 | 77 | result = self._regression_learner.train(arg1_mat.hstack(arg2_mat), phrase_mat) 78 | 79 | self._mat_a_t = result[0:arg1_mat.shape[1], :] 80 | self._mat_b_t = result[arg1_mat.shape[1]:, :] 81 | 82 | 83 | def _compose(self, arg1_mat, arg2_mat): 84 | #NOTE when we get in this compose arg1 mat and arg2 mat have the same type 85 | [mat_a_t, mat_b_t, arg1_mat] = resolve_type_conflict([self._mat_a_t, 86 | self._mat_b_t, 87 | arg1_mat], 88 | type(arg1_mat)) 89 | if self._has_intercept: 90 | return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 
1) * mat_b_t 91 | else: 92 | return arg1_mat * mat_a_t + arg2_mat * mat_b_t 93 | 94 | def set_regression_learner(self, regression_learner): 95 | assert_is_instance(regression_learner, RegressionLearner) 96 | self._regression_learner = regression_learner 97 | 98 | def get_regression_learner(self): 99 | return self._regression_learner 100 | 101 | regression_learner = property(get_regression_learner, set_regression_learner) 102 | """ 103 | Regression method to be used in training, of type RegressionLearner. 104 | Default is LstsqRegressionLearner. 105 | """ 106 | 107 | def _build_id2column(self, arg1_space, arg2_space): 108 | return [] 109 | 110 | def _export(self, filename): 111 | if self._mat_a_t is None or self._mat_b_t is None: 112 | raise IllegalStateError("cannot export an untrained FullAdditive model.") 113 | 114 | with open(filename, "w") as output_stream: 115 | output_stream.write("A\n") 116 | output_stream.write(str(DenseMatrix(self._mat_a_t).mat.T)) 117 | output_stream.write("\nB\n") 118 | 119 | if self._has_intercept: 120 | output_stream.write(str(DenseMatrix(self._mat_b_t[:-1,]).mat.T)) 121 | output_stream.write("\nIntercept\n") 122 | output_stream.write(str(DenseMatrix(self._mat_b_t[-1,]).mat.T)) 123 | else: 124 | output_stream.write(str(DenseMatrix(self._mat_b_t).mat.T)) 125 | 126 | 127 | def get_mat_a_t(self): 128 | return self._mat_a_t 129 | mat_a_t = property(get_mat_a_t) 130 | """ 131 | Transpose of matrix A parameter, of type Matrix. 132 | """ 133 | 134 | def get_mat_b_t(self): 135 | return self._mat_b_t 136 | mat_b_t = property(get_mat_b_t) 137 | """ 138 | Transpose of matrix B parameter, of type Matrix. 
class Multiplicative(CompositionModel):
    """
    Component-wise multiplicative composition:

    :math:`\\vec{p} = \\vec{u} \\cdot \\vec{v}`

    The phrase vector is obtained by multiplying the two constituent
    vectors element by element:

    :math:`\\vec{u} \\cdot \\vec{v} = (u_1v_1,...,u_nv_n)`
    """

    _name = "multiplicative"

    def __init__(self):
        """Constructor. The model is parameter-free; nothing to initialize."""

    def train(self):
        """
        Unsupported: the multiplicative model has no parameters to fit.
        """
        raise IllegalOperationError("Cannot train multiplicative model!")

    def _compose(self, arg1_mat, arg2_mat):
        # element-wise product of the two argument matrices, row by row
        product = arg1_mat.multiply(arg2_mat)
        return product

    def export(self, filename):
        """
        Unsupported: the multiplicative model has no parameters to write out.
        """
        raise IllegalOperationError("cannot export a Multiplicative model.")
        """
        # defaults perform simple (unweighted) vector addition
        self._alpha = 0.5
        self._beta = 0.5
        if not alpha is None:
            if not is_numeric(alpha):
                raise TypeError("Parameter not numeric: %s " % (type(alpha)))
            else:
                self._alpha = alpha

        if not beta is None:
            if not is_numeric(beta):
                raise TypeError("Parameter not numeric: %s " % (type(beta)))
            else:
                self._beta = beta

        # when only alpha is given, choose beta so the two weights sum to 1
        if not alpha is None and beta is None:
            self._beta = 1 - self._alpha


    def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list):
        """Estimate alpha and beta by least squares over the training triples.

        Accumulates the dot products / squared norms needed by the 2x2
        normal equations chunk by chunk (to bound memory), then delegates
        the actual solve to _solve.
        """
        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the train data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] * self.MAX_MEM_OVERHEAD / 3.0) + 1

        arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (0, 0, 0, 0, 0)

        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])
            phrase_mat = phrase_space.get_rows(phrase_list[beg:end])

            # statistics are accumulated on dense matrices
            [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat,
                                                                      arg2_mat,
                                                                      phrase_mat],
                                                                     DenseMatrix)

            res = self._process(arg1_mat, arg2_mat, phrase_mat)
            arg1_arg2_dot += res[0]
            arg1_phrase_dot += res[1]
            arg2_phrase_dot += res[2]
            arg1_norm_sqr += res[3]
            arg2_norm_sqr += res[4]


        self._solve(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr)


    def _process(self, arg1_mat, arg2_mat, phrase_mat):
        """Return the five sufficient statistics for one chunk of rows."""
        # debug here
        # remove when done
        # print "Using %s MB " % (get_mem_usage())

        arg1_arg2_dot = arg1_mat.multiply(arg2_mat).sum()
        arg1_phrase_dot = arg1_mat.multiply(phrase_mat).sum()
        arg2_phrase_dot = arg2_mat.multiply(phrase_mat).sum()

        arg1_norm_sqr = pow(arg1_mat.norm(), 2)
        arg2_norm_sqr = pow(arg2_mat.norm(), 2)

        return arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr

    def _solve(self, arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr):
        """Solve the 2x2 least-squares system for (alpha, beta).

        Uses the pseudo-inverse, so a singular Gram matrix (e.g. identical
        argument vectors) still yields a minimum-norm solution.
        """
        a = np.linalg.pinv(np.mat([[arg1_norm_sqr,arg1_arg2_dot],
                                   [arg1_arg2_dot,arg2_norm_sqr]]))
        a = a * np.mat([[arg1_phrase_dot],[arg2_phrase_dot]])
        self._alpha = a[0, 0]
        self._beta = a[1, 0]


    def _compose(self, arg1_mat, arg2_mat):
        # row-wise: p = alpha * u + beta * v
        return self._alpha * arg1_mat + self._beta * arg2_mat

    def _export(self, filename):
        # two tab-separated "name<TAB>value" lines
        with open(filename, "w") as output_stream:
            output_stream.write("alpha\t%f\n" % self._alpha)
            output_stream.write("beta\t%f" % self._beta)

    def get_alpha(self):
        return self._alpha
    alpha = property(get_alpha)
    """
    Alpha parameter, default 0.5.
    """

    def get_beta(self):
        return self._beta
    beta = property(get_beta)
    """
    Beta parameter, default 0.5.
class IllegalStateError(Exception):
    """Raised when an operation is attempted on an object whose current
    state does not allow it (e.g. exporting an untrained model)."""

    def __init__(self, msg):
        # forward the message to Exception so that str(e), repr(e) and
        # traceback output actually show it; the original stored it only in
        # a private attribute, making the raised error message invisible
        super(IllegalStateError, self).__init__(msg)
        self.__msg = msg


class IllegalOperationError(Exception):
    """Raised when an operation is not supported at all by an object
    (e.g. training a parameter-free composition model)."""

    def __init__(self, msg):
        super(IllegalOperationError, self).__init__(msg)
        self.__msg = msg


class InvalidArgumentError(Exception):
    """Raised when an argument value is of an acceptable type but is not
    valid in the current context."""

    def __init__(self, msg):
        super(InvalidArgumentError, self).__init__(msg)
        self.__msg = msg
for matrix implementations.

    Provides a common interface for different matrix implementations
    (sparse/dense). In vector space models, a matrix is used to encode
    a set of entities such as words or phrases (rows) described in terms
    of contextual features (columns).
    """

    def __init__(self, *args, **kwargs):
        # abstract base class: instantiate DenseMatrix or SparseMatrix instead
        raise NotImplementedError()


    def __add__(self, matrix_):
        ''' + operation'''
        # element-wise sum; operand must be the same concrete matrix type
        self._assert_same_type(matrix_)
        return type(self)(self.mat + matrix_.mat)

    def __sub__(self, matrix_):
        ''' - operation'''
        # element-wise difference; operand must be the same concrete type
        self._assert_same_type(matrix_)
        return type(self)(self.mat - matrix_.mat)

    def __neg__(self):
        ''' - operation'''
        # unary minus: negate every entry
        return type(self)(-self.mat)

    def __mul__(self, factor):
        ''' * operation'''
        # scalar multiplication when factor is a number, otherwise
        # matrix multiplication with a matrix of the same concrete type
        if is_numeric(factor):
            return type(self)(self.mat * factor)
        else:
            self._assert_same_type(factor)
            return type(self)(self.mat * factor.mat)

    def __div__(self, factor):
        ''' / operation'''
        # division is defined for non-zero scalars only
        if is_numeric(factor):
            if factor == 0:
                raise ZeroDivisionError("Division by zero")
        else:
            raise TypeError("expected numeric type, received %s" % (type(factor)))
        # float() avoids integer truncation under Python 2 division
        return type(self)(self.mat / float(factor))

    def __rmul__(self, factor):
        ''' * operation'''
        # right scalar multiplication (e.g. 2 * matrix) delegates to __mul__
        if is_numeric(factor):
            return self.__mul__(factor)
        raise TypeError("expected numeric type, received %s" % (type(factor)))


    #TODO move all these asserts somewhere else
    def _assert_same_type(self, operand):
        if type(self) != type(operand):
            raise TypeError("expected matrix of type %s, received %s" %
                            (type(self), type(operand)))

    def assert_same_shape(self, matrix_):
        """
        Asserts that the matrix has the same shape as a second matrix.

        Args:
            matrix_: A second matrix of type Matrix.

        Raises:
            ValueError: If the current matrix and the argument matrix
                do not have the same shape.
        """

        if self.mat.shape != matrix_.mat.shape:
            raise ValueError("inconsistent shapes: %s %s"
                             % (str(self.mat.shape), str(matrix_.mat.shape) ))

    #TODO move all these asserts somewhere else
    def _assert_array(self, operand):
        if not is_array(operand):
            raise TypeError("expected array, received %s" % (type(operand)))


    def sum(self, axis=None):
        #return type is dense matrix of shape (1, dimy) or (dimx,1)
        #or a number if **kwargs is None
        return self.mat.sum(axis)

    def sorted_permutation(self, norm_function, axis_):
        """
        Computes the permutation resulted when sorting the matrix
        on an axis, according to a function, in descending order.

        Sorts the rows or the columns (as given by axis)
        of a matrix according to a norm_function and returns
        the permutation of this as a np.array

        Args:
            norm_function: One of sum/length. A function that
                takes an axis as an argument (i.e. 0 or 1) and
                returns an array of values (i.e. sum of all rows
                if axis = 0 and norm_function = sum).

            axis_: axis value, one of 0/1

        Returns:
            perm_srtd: np.array containing the permutation of the
                sorting
        """

        #norms = norm_function(axis=axis_)

        # flatten the (1, n) / (n, 1) matrix result into a plain 1-d array
        norms = norm_function(axis_).getA().flatten()
        # argsort descending: indices ordered from largest norm down
        perm_srtd = sorted(range(len(norms)), key = norms.__getitem__,
                           reverse=True)

        return perm_srtd

    def get_mat(self):
        return self._mat

    def set_mat(self, mat_):
        self._mat = mat_

    mat = property(get_mat, set_mat)
    """
    Stores the actual matrix structure of the Matrix object.
    Of type numpy.matrix for DenseMatrix, and scipy.sparse.csr_matrix
    for SparseMatrix.
    """

    def get_shape(self):
        # delegates to the underlying numpy/scipy structure
        return self.mat.shape

    shape = property(get_shape)
    """
    Shape of the matrix, tuple with two elements.
    """

    def copy(self):
        # deep copy of the underlying data, preserving the concrete type
        return type(self)(self.mat.copy())


'''
Created on Sep 26, 2012

@author: Georgiana Dinu, Pham The Nghia
'''

from space import Space
from numpy import array
from composes.utils.space_utils import list2dict
from composes.utils.space_utils import assert_dict_match_list
from composes.utils.space_utils import assert_shape_consistent
from composes.utils.space_utils import add_items_to_dict
from composes.semantic_space.operation import FeatureSelectionOperation
from composes.semantic_space.operation import DimensionalityReductionOperation
from composes.utils.gen_utils import assert_is_instance
from composes.matrix.matrix import Matrix

class PeripheralSpace(Space):
    '''
    A semantic space built relative to a core space: every operation already
    applied to the core space is re-applied ("projected") onto the peripheral
    data, and the column indexing structures are taken over from the core.
    '''


    def __init__(self, core_space, matrix_, id2row, row2id=None):
        """
        Constructor.

        Args:
            core_space: Space type, the core space that this is peripheral to.
            matrix_: Matrix type, the data matrix of the space
            id2row: list, the row elements
            row2id: dictionary, maps row strings to ids. Optional, built from
                id2row by default.

        Returns:
            A peripheral semantic space (type PeripheralSpace) on which the
            core space operations have been projected. Column indexing structures
            and operations are taken over from the core space.

        Raises:
            TypeError: if matrix_ or core_space are not of the correct type
            ValueError: if element shape is not consistent with
                the size of matrix rows
                if the matrix and the provided row and column
                indexing structures are not of consistent shapes.
        """
        assert_is_instance(matrix_, Matrix)
        assert_is_instance(core_space, Space)
        assert_is_instance(id2row, list)
        # TODO: assert it is not a peripheral space here!

        if row2id is None:
            row2id = list2dict(id2row)
        else:
            assert_dict_match_list(row2id, id2row)

        # column structures are inherited from the core space
        column2id = core_space.column2id
        id2column = core_space.id2column

        # copy the operation list so later changes to the core do not leak in
        self._operations = list(core_space.operations)
        self._row2id = row2id
        self._id2row = id2row
        self._column2id = column2id
        self._id2column = id2column

        # replay every core-space operation on the new data
        self._cooccurrence_matrix = self._project_core_operations(matrix_)
        assert_shape_consistent(self.cooccurrence_matrix, self._id2row,
                                self._id2column, self._row2id, self._column2id)

        self._element_shape = (self._cooccurrence_matrix.shape[1],)


    def _project_core_operations(self, matrix_):
        # apply each recorded operation in order, keeping the column
        # indexing structures in sync when an operation changes the columns
        for operation in self._operations:
            if isinstance(operation, DimensionalityReductionOperation):
                # reduced dimensions carry no column labels
                self._id2column, self._column2id = [], {}

            if isinstance(operation, FeatureSelectionOperation):
                if operation.original_columns:
                    # keep only the labels of the selected columns
                    self._id2column = list(array(operation.original_columns)[operation.selected_columns])
                    self._column2id = list2dict(self._id2column)
                else:
                    self._id2column, self._column2id = [],{}

            matrix_ = operation.project(matrix_)
        return matrix_


    def add_rows(self, matrix_, id2row):
        """
        Adds rows to a peripheral space.

        Args:
            matrix_: Matrix type, the matrix of the elements to be added.
            id2row: list, string identifiers of the rows to be added.

        Modifies the current space by appending the new rows.
        All operations of the core space are projected to the new rows.

        Raises:
            ValueError: if attempting to add row strings which are already
                in the space.
                matrix of the new data is not consistent in shape
                with the current data matrix.
        """

        try:
            self._row2id = add_items_to_dict(self.row2id, id2row)
        except ValueError:
            raise ValueError("Found duplicate keys when appending rows to\
                            peripheral space.")

        if matrix_.mat.shape[0] != len(id2row):
            raise ValueError("Matrix shape inconsistent with no. of rows:%s %s"
                             % (matrix_.mat.shape, len(id2row)))

        self._id2row = self.id2row + id2row
        # new rows must undergo the same transformations as the existing data
        matrix_ = self._project_core_operations(matrix_)

        self._cooccurrence_matrix = self._cooccurrence_matrix.vstack(matrix_)
        assert_shape_consistent(self.cooccurrence_matrix, self.id2row,
                                self.id2column, self.row2id, self.column2id)

    @classmethod
    def build(cls, core_space, **kwargs):
        """
        Reads in data files and extracts the data to construct a semantic space.

        If the data is read in dense format and no columns are provided,
        the column indexing structures are set to empty.

        Args:
            data: file containing the counts
            format: format on the input data file: one of sm/dm
            rows: file containing the row elements. Optional, if not provided,
                extracted from the data file.
            cols: file containing the column elements

        Returns:
            A semantic space build from the input data files.

        Raises:
            ValueError: if one of data/format arguments is missing.
            if cols is missing and format is "sm"
            if the input columns provided are not consistent with
            the shape of the matrix (for "dm" format)

        """

        # build a regular Space from the input files, then wrap its data as
        # a peripheral space of core_space
        sp = Space.build(**kwargs)

        mat = sp._cooccurrence_matrix
        id2row = sp.id2row
        row2id = sp.row2id
        return PeripheralSpace(core_space, mat, id2row, row2id)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.utils.py_matrix_utils import nonzero_invert

from composes.similarity.similarity import Similarity
from composes.similarity.dot_prod import DotProdSimilarity


class CosSimilarity(Similarity):
    """
    Computes the cosine similarity of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{<\\vec{u},\\vec{v}>}{\\sqrt{||\\vec{u}||||\\vec{v}||}}`

    """

    def _sim(self, v1, v2):
        # the cosine of a zero vector is undefined; define it as 0 here
        if v1.norm() == 0 or v2.norm() == 0:
            return 0.0
        s = DotProdSimilarity()._sim(v1, v2) / np.double(v1.norm() * v2.norm())
        return s

    def _sims_to_matrix(self, vector, matrix_):
        # dot products of vector with every row, then per-row normalization
        sims = DotProdSimilarity()._sims_to_matrix(vector, matrix_)

        vector_norm = vector.norm()
        row_norms = vector_norm * matrix_.norm(1)
        # invert only the non-zero norms, leaving zero-norm rows at 0
        row_norms = nonzero_invert(row_norms)

        return sims.scale_rows(row_norms)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
from composes.similarity.similarity import Similarity


class DotProdSimilarity(Similarity):
    """
    Computes the scalar product (dot product) of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = <\\vec{u},\\vec{v}> = \\sum_iu_iv_i`

    """
    def _sim(self, v1, v2):
        # element-wise product summed over all components
        return v1.multiply(v2).sum()

    def _sims_to_matrix(self, vector, matrix_):
        # one matrix-vector product yields the dot product with every row
        return matrix_ * vector.transpose()
class EuclideanSimilarity(Similarity):
    """
    Computes the euclidean similarity of two vectors as the inverse of their
    euclidean distance.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{1}{||\\vec{u}-\\vec{v}|| + 1}`
    """

    def _sim(self, v1, v2):
        # the +1 keeps the result finite (and equal to 1) for identical vectors
        return 1 / (1 + (v1 - v2).norm())


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.similarity.similarity import Similarity


class LinSimilarity(Similarity):
    """
    Computes the Lin similarity of two vectors.

    :math:`sim(\\vec{u},\\vec{v}) = \\frac{\\sum_{i \\in I}(u_i+v_i)}{\\sum_iu_i + \\sum_iv_i}`

    Where :math:`I=\\{i | u_i > 0 \\text{ and } v_i > 0\\}`, the set of components
    on which both vectors are strictly positive.

    """

    def _sim(self, v1, v2):

        # NOTE(review): to_ones presumably maps the non-zero entries of the
        # element-wise product to 1, giving an indicator of shared components
        # - verify against the Matrix implementation
        common = v1.multiply(v2)
        common.to_ones()
        denom = v1.sum() + v2.sum()

        if denom == 0:
            # both vectors sum to zero: similarity is defined as 0
            return 0
        else:
            return common.multiply(v1 + v2).sum() / np.double(denom)


"""
Created on Oct 2, 2012

@author: Georgiana Dinu, Pham The Nghia
"""
import numpy as np

from composes.utils.matrix_utils import (
    assert_is_array_or_matrix,
    to_compatible_matrix_types,
)


class Similarity(object):
    """Base class for vector similarity measures.

    Subclasses implement _sim (pairwise similarity) and may override
    _sims_to_matrix (similarity of one vector to every row of a matrix).
    """

    def get_sim(self, v1, v2):
        # validate and normalize argument types before delegating
        assert_is_array_or_matrix(v1)
        assert_is_array_or_matrix(v2)

        # TODO: figure out where these asserts belong!!
        v1, v2 = to_compatible_matrix_types(v1, v2)
        v1.assert_same_shape(v2)

        return self._sim(v1, v2)

    def get_sims_to_matrix(self, vector, matrix_):
        # returns a column of similarities, one per row of matrix_
        assert_is_array_or_matrix(vector)
        assert_is_array_or_matrix(matrix_)

        vector, matrix_ = to_compatible_matrix_types(vector, matrix_)

        # vector must be a single row with the same number of columns
        if vector.shape[1] != matrix_.shape[1] or vector.shape[0] != 1:
            raise ValueError(
                'Inconsistent shapes {0} and {1}'.format(vector.shape, matrix_.shape)
            )

        return self._sims_to_matrix(vector, matrix_)

    def _sims_to_matrix(self, vector, matrix_):
        # generic fallback: one _sim call per row of matrix_
        result = np.zeros(shape=(matrix_.shape[0], 1))
        for i in range(matrix_.shape[0]):
            result[i] = self._sim(vector, matrix_[i, :])
        return type(matrix_)(result)
class DimensionalityReduction(object):
    """
    Base class for dimensionality reduction methods (e.g. Svd, Nmf).

    Stores the target (reduced) dimension; concrete subclasses implement
    the actual reduction and set a meaningful _name.
    """

    # default identifier; concrete subclasses override it ("svd", "nmf").
    # Replaces the original joke placeholder, which leaked into get_name()
    # and __str__() output for any direct use of the base class.
    _name = "dimensionality_reduction"

    def __init__(self, reduced_dimension):
        """
        Constructor.

        Args:
            reduced_dimension: int, strictly positive target dimension.

        Raises:
            ValueError: if reduced_dimension is not strictly positive.
        """
        if reduced_dimension <= 0:
            raise ValueError("Cannot reduce to non-positive dimensionality: %d"
                             % reduced_dimension)
        self._reduced_dimension = reduced_dimension

    def create_operation(self):
        """Wrap this reduction in a DimensionalityReductionOperation."""
        return DimensionalityReductionOperation(self)

    def get_reduced_dimension(self):
        return self._reduced_dimension

    def get_name(self):
        return self._name

    def __str__(self):
        return self._name

    name = property(get_name)
    reduced_dimension = property(get_reduced_dimension)
class Nmf(DimensionalityReduction):
    """
    Performs Non-negative Matrix Factorization to reduced dimension :math:`k`.

    Given an input non-negative matrix :math:`X`, it computes the decomposition:

    :math:`X \\approx WH` where W and H are non-negative matrices which minimize
    :math:`||X-WH||_{2}`

    It returns the matrix W.
    """

    _name = "nmf"

    def __init__(self, reduced_dimension):
        """
        Constructor.

        Args:
            reduced_dimension: int, strictly positive target dimension.
        """
        super(Nmf, self).__init__(reduced_dimension)

    def apply(self, matrix_):
        """Factor matrix_ ~ W * H and return (W, pinv(H)).

        The pseudo-inverse of H is returned so that new (peripheral) data
        can be projected into the same reduced space.
        """
        matrix_.assert_positive()
        # v_col initialization is the active strategy; the alternatives are
        # kept for experimentation
        #w_init, h_init = self.nndsvd_init(matrix_)
        w_init, h_init = self.v_col_init(matrix_)
        #w_init, h_init = self.random_init(matrix_)
        w, h = Linalg.nmf(matrix_, w_init, h_init)
        return w, Linalg.pinv(h)

    def random_init(self, matrix_):
        """Initialize W and H from randomly sampled columns/rows of the input."""
        # np.random.randint(low, high) excludes high, matching the former
        # random_integers(0, n - 1); random_integers was deprecated and has
        # been removed from recent NumPy releases
        rndcol = np.random.randint(0, matrix_.shape[1],
                                   self._reduced_dimension)

        rndrow = np.random.randint(0, matrix_.shape[0],
                                   self._reduced_dimension)

        #otherwise we would have had to convert to DenseMatrix/SparseMatrix
        #type(matrix_)(result)
        w = matrix_[:, rndcol]
        h = matrix_[rndrow, :]

        return w, h

    def v_col_init(self, matrix_):
        """Initialize each column of W (row of H) as the mean of ~1/5 of the
        input's columns (rows), sampled at random."""
        w = np.zeros((matrix_.shape[0], self._reduced_dimension))
        h = np.zeros((self._reduced_dimension, matrix_.shape[1]))

        #in case there are less than 5 rows or columns
        p_col = matrix_.shape[1]//5 + 1
        p_row = matrix_.shape[0]//5 + 1
        for i in range(self._reduced_dimension):

            rndcol = np.random.randint(0, matrix_.shape[1], p_col)
            rndrow = np.random.randint(0, matrix_.shape[0], p_row)

            w[:, i] = (matrix_[:, rndcol].sum(1)/float(p_col)).flatten()
            h[i, :] = (matrix_[rndrow, :].sum(0)/float(p_row)).flatten()

        # convert back to the concrete matrix type of the input
        w = type(matrix_)(w)
        h = type(matrix_)(h)

        return w, h

    def nndsvd_init(self, matrix_):
        """NNDSVD-style initialization: build W and H from the non-negative
        parts of the leading singular vectors of the input."""
        def matrix_abs(mat_):
            # |M| computed as M+ + (M+ - M), using only Matrix primitives
            mat_p = mat_.get_non_negative()
            mat_n_abs = mat_p - mat_
            return mat_p + mat_n_abs

        def padd_zeros(matrix_, axis, thickness):
            # extend with zero rows (axis=0) or zero columns (axis=1)
            matrix_type = type(matrix_)
            if axis == 0:
                append_mat = matrix_type(np.zeros((thickness, matrix_.shape[1])))
                return matrix_.vstack(append_mat)
            elif axis == 1:
                append_mat = matrix_type(np.zeros((matrix_.shape[0], thickness)))
                return matrix_.hstack(append_mat)

        u, s, v = Linalg.svd(matrix_, self._reduced_dimension)

        rank = u.shape[1]
        w = [[]] * rank
        h = [[]] * rank

        vt = v.transpose()

        # the leading singular pair is non-negative up to a sign flip
        w[0] = sqrt(s[0]) * matrix_abs(u[:, 0])
        h[0] = sqrt(s[0]) * matrix_abs(vt[0, :])

        for i in range(1, rank):
            uu = u[:, i]
            vv = vt[i, :]
            uup = uu.get_non_negative()
            uun = uup - uu
            vvp = vv.get_non_negative()
            vvn = vvp - vv

            n_uup = uup.norm()
            n_uun = uun.norm()
            n_vvp = vvp.norm()
            n_vvn = vvn.norm()

            # keep whichever sign pattern (positive or negative parts)
            # carries more of the singular pair's energy
            termp = n_uup * n_vvp
            termn = n_uun * n_vvn
            if termp >= termn:
                w[i] = sqrt(s[i] * termp) * uup / n_uup
                h[i] = sqrt(s[i] * termp) * vvp / n_vvp
            else:
                w[i] = sqrt(s[i] * termn) * uun / n_uun
                h[i] = sqrt(s[i] * termn) * vvn / n_vvn

        w = matrix_.nary_hstack(w)
        h = matrix_.nary_vstack(h)

        # clip numerical noise so the factors stay strictly non-negative
        w.remove_small_values(0.0000000001)
        h.remove_small_values(0.0000000001)

        if rank < self._reduced_dimension:
            # SVD returned fewer than k factors: pad with zeros
            w = padd_zeros(w, 1, self._reduced_dimension - rank)
            h = padd_zeros(h, 0, self._reduced_dimension - rank)
        return w, h
13 | 14 | Given an input matrix :math:`X`, it computes the decomposition: 15 | 16 | :math:`X = U \\Sigma V^{T}` 17 | 18 | It returns :math:`U \\Sigma` truncated to dimension :math:`min(k,rank(X))` 19 | """ 20 | 21 | _name = "svd" 22 | 23 | def __init__(self, reduced_dimension): 24 | ''' 25 | Constructor 26 | ''' 27 | super(Svd, self).__init__(reduced_dimension) 28 | 29 | def apply(self, matrix_): 30 | 31 | u, s, v = Linalg.svd(matrix_, self._reduced_dimension) 32 | return u.scale_columns(s), v 33 | 34 | -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/feature_selection/__init__.py -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from composes.semantic_space.operation import FeatureSelectionOperation 7 | 8 | class FeatureSelection(object): 9 | ''' 10 | classdocs 11 | ''' 12 | 13 | 14 | def __init__(self, reduced_dimension): 15 | 16 | if reduced_dimension <= 0: 17 | raise ValueError("Cannot reduce to non-positive dimensionality: %d" 18 | % reduced_dimension) 19 | self._reduced_dimension = reduced_dimension 20 | 21 | def create_operation(self): 22 | return FeatureSelectionOperation(self) 23 | 24 | def get_reduced_dimension(self): 25 | return self._reduced_dimension 26 | 27 | reduced_dimension = property(get_reduced_dimension) -------------------------------------------------------------------------------- /src/composes/transformation/feature_selection/top_feature_selection.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 5, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from warnings import warn 7 | from feature_selection import FeatureSelection 8 | 9 | class TopFeatureSelection(FeatureSelection): 10 | """ 11 | Sorts the columns of a space according to some criterion and returns a space 12 | containing only the top :math:`k` ones. 13 | 14 | Available criteria: 15 | 16 | sum: Default. Ranks columns according to the sum on their elements. 17 | 18 | length: Ranks columns according to their vector length. 19 | 20 | """ 21 | 22 | _name = "top_feature_selection" 23 | _valid_criteria = {"sum", "length"} 24 | 25 | def __init__(self, reduced_dimension, criterion='sum'): 26 | ''' 27 | Constructor 28 | ''' 29 | super(TopFeatureSelection, self).__init__(reduced_dimension) 30 | 31 | if criterion: 32 | if criterion not in self._valid_criteria: 33 | raise ValueError("Unrecognized criterion: %s" % criterion) 34 | self.criterion = criterion 35 | 36 | def apply(self, matrix_): 37 | 38 | if self.criterion == "sum": 39 | norm_function = matrix_.sum 40 | else: 41 | norm_function = matrix_.norm 42 | 43 | if self._reduced_dimension >= matrix_.shape[1]: 44 | warn("Reduced dimension larger than number of columns!") 45 | 46 | no_columns = min(self._reduced_dimension, matrix_.shape[1]) 47 | sorted_perm = matrix_.sorted_permutation(norm_function, 0) 48 | 49 | sorted_perm = sorted_perm[0:no_columns] 50 | matrix_ = matrix_[:, sorted_perm] 51 | 52 | return matrix_, sorted_perm 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/transformation/scaling/__init__.py 
class Normalization(Scaling):
    """
    Normalizes a space according to some criterion.

    Available criteria:

    sum: Default. The result matrix :math:`X` will satisfy: :math:`\\sum_{i,j} X_{ij}=1`

    length: The result matrix :math:`X` will satisfy: :math:`\\sqrt{\\sum_{i,j} X_{ij}^2}=1`

    """
    # BUGFIX: this was "row_normalization" — a copy-paste duplicate of
    # RowNormalization's name, which made the two transformations
    # indistinguishable in str()/logging output.
    _name = "normalization"
    _valid_criteria = ["sum", "length"]
    _uses_column_stats = True

    def __init__(self, criterion='sum'):
        '''
        Constructor.

        Args:
            criterion: "sum" (default) or "length".

        Raises:
            ValueError: on an unrecognized criterion.
        '''
        # BUGFIX: validate unconditionally; the old `if criterion:` guard let
        # falsy values ("" / None) through with self.criterion never assigned,
        # deferring the failure to an AttributeError inside apply().
        if criterion not in self._valid_criteria:
            raise ValueError("Unrecognized criterion: %s" % criterion)
        self.criterion = criterion

    def apply(self, matrix_, total=None):
        """
        Scale matrix_ so that its global sum (or length) becomes 1.

        Args:
            matrix_ (Matrix): input matrix.
            total: precomputed sum/length of the core matrix, supplied when
                matrix_ is a peripheral matrix; computed from matrix_ if None.

        Returns:
            Matrix: the normalized matrix (returned unchanged, with a
            warning, when the normalizer is 0).
        """
        if total is None:
            if self.criterion == "length":
                total = matrix_.norm()
            else:
                total = matrix_.sum()

        if total == 0:
            warn("Could not normalize: sum/length of matrix is 0.")
            return matrix_

        matrix_ = (1 / double(total)) * matrix_
        return matrix_

    def get_column_stats(self, matrix_):
        """Return the statistic peripheral spaces reuse: the global sum/length."""
        if self.criterion == "length":
            return matrix_.norm()
        else:
            return matrix_.sum()
8 | 9 | :math:`plmi(r,c)=ppmi(r,c)count(r,c)` 10 | 11 | """ 12 | 13 | _name = "plmi" 14 | _uses_column_stats = True 15 | 16 | def apply(self, matrix_, column_marginal=None): 17 | return matrix_.multiply(PpmiWeighting().apply(matrix_, 18 | column_marginal)) 19 | 20 | 21 | def get_column_stats(self, matrix_): 22 | return matrix_.sum(0) -------------------------------------------------------------------------------- /src/composes/transformation/scaling/plog_weighting.py: -------------------------------------------------------------------------------- 1 | 2 | from scaling import Scaling 3 | 4 | class PlogWeighting(Scaling): 5 | """ 6 | Positive Log Weighting 7 | 8 | :math:`plog(r,c)= log(r,c) \\text{ if } log(r,c) \\geq 0 \\text{ else } 0` 9 | """ 10 | 11 | _name = "plog" 12 | 13 | def apply(self, matrix_): 14 | ''' 15 | Performs positive log weighting. 16 | 17 | Args: 18 | matrix_ (Matrix): Input matrix 19 | column_marginal (array): column marginals of the core matrix if the matrix is a peripheral matrix 20 | 21 | Returns: 22 | Matrix: the matrix after applying plog 23 | 24 | ''' 25 | matrix_ = matrix_.copy() 26 | matrix_.plog() 27 | return matrix_ 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/ppmi_weighting.py: -------------------------------------------------------------------------------- 1 | 2 | from scaling import Scaling 3 | from epmi_weighting import EpmiWeighting 4 | 5 | class PpmiWeighting(Scaling): 6 | """ 7 | Positive Point-wise Mutual Information. 
8 | 9 | 10 | :math:`pmi(r,c) = log\\frac{P(r,c)}{P(r)P(c)}` 11 | 12 | :math:`ppmi(r,c)= pmi(r,c) \\text{ if } pmi(r,c)\\geq 0 \\text{ else } 0` 13 | """ 14 | 15 | _name = "ppmi" 16 | _uses_column_stats = True 17 | 18 | def apply(self, matrix_, column_marginal=None): 19 | 20 | matrix_ = EpmiWeighting().apply(matrix_, column_marginal) 21 | matrix_.plog() 22 | return matrix_ 23 | 24 | def get_column_stats(self, matrix_): 25 | return matrix_.sum(0) 26 | 27 | """ 28 | :math:`ppmi(r,c)=\\begin{cases}pmi(rc) & \\text{if }pmi(r,c)\\geq0 29 | 0 & \\text{otherwise}\\end{cases}` 30 | """ -------------------------------------------------------------------------------- /src/composes/transformation/scaling/row_normalization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 4, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from scaling import Scaling 8 | from composes.utils.py_matrix_utils import nonzero_invert 9 | 10 | class RowNormalization(Scaling): 11 | """ 12 | Normalizes the rows of a space according to a some criterion. 13 | 14 | Available criteria: 15 | 16 | length: Default. 
Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sqrt{\\sum_j X_{ij}^2}=1` 17 | 18 | 19 | sum: Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sum_j X_{ij}=1` 20 | 21 | """ 22 | _name = "row_normalization" 23 | _valid_criteria = ["sum", "length"] 24 | 25 | def __init__(self, criterion='length'): 26 | ''' 27 | Constructor 28 | ''' 29 | if criterion: 30 | if criterion not in self._valid_criteria: 31 | raise ValueError("Unrecognized criterion: %s" % criterion) 32 | self.criterion = criterion 33 | 34 | 35 | def apply(self, matrix_): 36 | 37 | if self.criterion == "length": 38 | row_norms = matrix_.norm(axis=1) 39 | else: 40 | row_norms = matrix_.sum(axis=1) 41 | 42 | inv_row_norm = nonzero_invert(row_norms) 43 | matrix_ = matrix_.scale_rows(inv_row_norm) 44 | return matrix_ 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/composes/transformation/scaling/scaling.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from composes.semantic_space.operation import ScalingOperation 8 | 9 | class Scaling(object): 10 | ''' 11 | classdocs 12 | ''' 13 | _name = "we are NOT stupid" 14 | _uses_column_stats = False 15 | 16 | def get_name(self): 17 | return self._name 18 | 19 | def get_uses_column_stats(self): 20 | return self._uses_column_stats 21 | 22 | def create_operation(self): 23 | return ScalingOperation(self) 24 | 25 | def __str__(self): 26 | return self._name 27 | 28 | name = property(get_name) 29 | uses_column_stats = property(get_uses_column_stats) -------------------------------------------------------------------------------- /src/composes/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/composes/utils/__init__.py -------------------------------------------------------------------------------- /src/composes/utils/crossvalidation_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 9, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from random import shuffle 8 | 9 | def get_split_indices(range_len, fold): 10 | 11 | if fold <= 0: 12 | raise ValueError("Strictly positive number of folds required, received %s:" 13 | % fold) 14 | 15 | indices_list = [] 16 | if range_len < fold: 17 | return get_split_indices(range_len, range_len) 18 | 19 | range_ = range(range_len) 20 | shuffle(range_) 21 | current_index = 0 22 | for i in range(fold): 23 | if i < len(range_)%fold: 24 | slice_length = range_len // fold + 1 25 | else: 26 | slice_length = range_len // fold 27 | 28 | indices_list.append(range_[current_index:current_index + slice_length]) 29 | current_index += slice_length 30 | 31 | return indices_list 32 | 33 | def get_submatrix_list(matrix_, indices_list): 34 | return [matrix_[indices, :] for indices in indices_list] 35 | 36 | -------------------------------------------------------------------------------- /src/composes/utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 21, 2013 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | from composes.exception.invalid_argument_error import InvalidArgumentError 7 | 8 | 9 | def assert_is_instance(object_, class_): 10 | if not isinstance(object_, class_): 11 | raise TypeError("expected %s, received %s" % (class_, type(object_))) 12 | 13 | 14 | def get_partitions(sorted_list, min_samples): 15 | prev_idx = 0 16 | range_list = [] 17 | for i in range(1, len(sorted_list)): 18 | if sorted_list[i] != sorted_list[i - 1]: 19 | if i - prev_idx >= 
min_samples: 20 | range_list.append((prev_idx, i)) 21 | 22 | prev_idx = i 23 | 24 | if len(sorted_list) - prev_idx >= min_samples: 25 | range_list.append((prev_idx, len(sorted_list))) 26 | 27 | keys = [sorted_list[range_list[i][0]] for i in xrange(len(range_list))] 28 | 29 | return keys, range_list -------------------------------------------------------------------------------- /src/composes/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from numpy import double 8 | import logging 9 | from composes.utils.io_utils import create_parent_directories 10 | 11 | def config_logging(file_name, level = logging.INFO, format_ =""): 12 | if not file_name is None: 13 | create_parent_directories(file_name) 14 | logging.basicConfig(filename=file_name, level=level, format=format_) 15 | logging.debug("start logging") 16 | 17 | 18 | def get_ident(delim, ident_level): 19 | return delim * ident_level 20 | 21 | def print_matrix_info(logger_, matrix_, ident_level, intro_string): 22 | delim = " " 23 | ident = get_ident(delim, ident_level) 24 | logger_string = ident + intro_string 25 | ident = ident + delim 26 | 27 | logger_string += ("\n%sMatrix type:%s" % (ident, type(matrix_).__name__)) 28 | logger_string += ("\n%sMatrix shape:%sx%s" % (ident, matrix_.shape[0], 29 | matrix_.shape[1])) 30 | 31 | if type(matrix_).__name__ == "SparseMatrix": 32 | perc_nnz = 100 * matrix_.mat.nnz/double(matrix_.shape[0]*matrix_.shape[1]) 33 | logger_string += ("\n%sPerc. 
non-zero entries:%d" % (ident, perc_nnz)) 34 | 35 | logger_.info(logger_string) 36 | 37 | 38 | def get_learner_info(learner, ident): 39 | logger_string = "" 40 | 41 | if hasattr(learner, '_intercept'): 42 | logger_string += ("\n%sUsing intercept:%s" % (ident, learner._intercept)) 43 | 44 | if hasattr(learner, '_crossvalidation'): 45 | logger_string += ("\n%sUsing crossvalidation:%s" % (ident, learner._crossvalidation)) 46 | 47 | if learner._crossvalidation and hasattr(learner, '_folds'): 48 | logger_string += ("\n%sUsing number of folds:%s" % (ident, learner._folds)) 49 | 50 | return logger_string 51 | 52 | def print_composition_model_info(logger_, model, ident_level, intro_string): 53 | 54 | delim = " " 55 | ident = get_ident(delim, ident_level) 56 | logger_string = ident + intro_string 57 | ident = ident + delim 58 | 59 | logger_.info(logger_string) 60 | 61 | print_name(logger_, model, ident_level, "Composition model type:") 62 | 63 | logger_string = "" 64 | if hasattr(model, '_regression_learner'): 65 | logger_string += ("\n%sUsing regression:%s" % (ident, 66 | type(model.regression_learner).__name__)) 67 | logger_string += get_learner_info(model.regression_learner, ident + delim) 68 | 69 | logger_.info(logger_string) 70 | 71 | def print_transformation_info(logger_, trans, ident_level, intro_string): 72 | delim = " " 73 | ident = get_ident(delim, ident_level) 74 | logger_string = ident + intro_string 75 | ident = ident + delim 76 | 77 | logger_string += ("\n%sTransformation type:%s" % (ident, type(trans).__name__)) 78 | 79 | if hasattr(trans, '_reduced_dimension'): 80 | logger_string += ("\n%sReduced dimension:%s" % (ident, trans.reduced_dimension)) 81 | 82 | 83 | logger_.info(logger_string) 84 | 85 | def print_info(logger_, ident_level, text): 86 | delim = " " 87 | ident = get_ident(delim, ident_level) 88 | logger_string = ident + "" 89 | 90 | logger_string += "\n%s%s" % (ident, text) 91 | logger_.info(logger_string) 92 | 93 | def print_name(logger_, object_, 
ident_level, intro_string): 94 | delim = " " 95 | ident = get_ident(delim, ident_level) 96 | logger_string = ident + intro_string 97 | ident = ident + delim 98 | 99 | logger_string += ("\n%s%s" % (ident, type(object_).__name__)) 100 | 101 | logger_.info(logger_string) 102 | 103 | def print_time_info(logger_, end, beg, ident_level): 104 | delim = " " 105 | ident = get_ident(delim, ident_level) 106 | logger_string = ident 107 | logger_string += ("\n%sTiming:%s seconds" % (ident, end - beg)) 108 | 109 | logger_.info(logger_string) 110 | 111 | -------------------------------------------------------------------------------- /src/composes/utils/matrix_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from composes.matrix.sparse_matrix import SparseMatrix 4 | from composes.matrix.dense_matrix import DenseMatrix 5 | from composes.matrix.matrix import Matrix 6 | from scipy.sparse import issparse 7 | from py_matrix_utils import is_array 8 | from warnings import warn 9 | 10 | def to_matrix(matrix_): 11 | """ 12 | Converts an array-like structure to a DenseMatrix/SparseMatrix 13 | """ 14 | if issparse(matrix_): 15 | return SparseMatrix(matrix_) 16 | else: 17 | return DenseMatrix(matrix_) 18 | 19 | def is_array_or_matrix(data): 20 | return is_array(data) or isinstance(data, Matrix) 21 | 22 | 23 | def assert_is_array_or_matrix(data): 24 | if not is_array_or_matrix(data): 25 | raise TypeError("expected array-like or matrix, received %s" 26 | % (type(data))) 27 | 28 | def padd_matrix(matrix_, axis, value=1): 29 | matrix_type = type(matrix_) 30 | if axis == 0: 31 | append_mat = matrix_type(np.ones((1, matrix_.shape[1]))*value) 32 | return matrix_.vstack(append_mat) 33 | elif axis == 1: 34 | append_mat = matrix_type(np.ones((matrix_.shape[0], 1))*value) 35 | return matrix_.hstack(append_mat) 36 | else: 37 | raise ValueError("Invalid axis value:%s" % axis) 38 | 39 | 40 | def assert_same_shape(matrix1, matrix2, 
axis=None): 41 | 42 | if axis is None: 43 | if matrix1.shape != matrix2.shape: 44 | raise ValueError("Inconsistent shapes") 45 | else: 46 | if not axis in [0, 1]: 47 | raise ValueError("Invalid axis value: %s, expected 0 or 1." % axis) 48 | if matrix1.shape[axis] != matrix2.shape[axis]: 49 | raise ValueError("Inconsistent shapes") 50 | 51 | 52 | def to_compatible_matrix_types(v1, v2): 53 | 54 | if isinstance(v1, Matrix) and isinstance(v2, Matrix): 55 | v2 = type(v1)(v2) 56 | elif not isinstance(v1, Matrix) and isinstance(v2, Matrix): 57 | v1 = type(v2)(v1) 58 | elif not isinstance(v2, Matrix) and isinstance(v1, Matrix): 59 | v2 = type(v1)(v2) 60 | else: 61 | v1 = to_matrix(v1) 62 | v2 = type(v1)(v2) 63 | 64 | return v1, v2 65 | 66 | 67 | 68 | def get_type_of_largest(matrix_list): 69 | max_dim = 0 70 | max_type = None 71 | for matrix_ in matrix_list: 72 | if matrix_.shape[0] * matrix_.shape[1] > max_dim: 73 | max_type = type(matrix_) 74 | max_dim = matrix_.shape[0] * matrix_.shape[1] 75 | 76 | return max_type 77 | 78 | def resolve_type_conflict(matrix_list, matrix_type): 79 | new_matrix_list = [] 80 | 81 | if matrix_type_conflict(matrix_list): 82 | warn("Efficiency warning: matrices should have the same dense/sparse type!") 83 | for matrix_ in matrix_list: 84 | new_matrix_list.append(matrix_type(matrix_)) 85 | return new_matrix_list 86 | 87 | return list(matrix_list) 88 | 89 | 90 | def matrix_type_conflict(matrix_list): 91 | 92 | if not matrix_list: 93 | return False 94 | 95 | matrix_type = type(matrix_list[0]) 96 | for matrix_ in matrix_list: 97 | if not isinstance(matrix_, matrix_type): 98 | return True 99 | 100 | return False 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/composes/utils/mem_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 21, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | """ 8 | Wrappers around 
psutil functions that display memory usage information. 9 | """ 10 | import numpy as np 11 | from os import getpid 12 | import psutil 13 | 14 | def get_mem_usage(): 15 | p = psutil.Process(getpid()) 16 | return p.get_memory_info()[0]/np.double(1024*1024) -------------------------------------------------------------------------------- /src/composes/utils/num_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | from numbers import Number 8 | from numbers import Integral 9 | import numpy as np 10 | 11 | def is_numeric(operand): 12 | return isinstance(operand, (Number, np.number)) 13 | 14 | def is_integer(operand): 15 | return isinstance(operand, Integral) 16 | -------------------------------------------------------------------------------- /src/composes/utils/py_matrix_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 19, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import numpy as np 7 | from scipy.sparse import spdiags 8 | 9 | 10 | def array_to_csr_diagonal(array_): 11 | #array_ can't be a sparse matrix, if it is dense, it has to be a row matrix 12 | #(i.e. shape = (1, x)) 13 | 14 | flat_array = array_.flatten() 15 | array_size = flat_array.size 16 | csr_diag = spdiags(flat_array, [0], array_size, array_size, format = 'csr') 17 | return csr_diag 18 | 19 | def is_array(operand): 20 | return hasattr(operand, 'dtype') and hasattr(operand, 'shape') 21 | 22 | 23 | def nonzero_invert(matrix_): 24 | ''' 25 | Performs 1/x for all x, non-zero elements of the matrix. 
26 | 27 | Params: 28 | matrix_: np.matrix 29 | ''' 30 | 31 | matrix_ = matrix_.astype(np.double) 32 | matrix_[matrix_ != 0] = np.array(1.0/matrix_[matrix_ != 0]).flatten() 33 | return matrix_ 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/composes/utils/regression_learner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from composes.matrix.linalg import Linalg 3 | 4 | 5 | class RegressionLearner(object): 6 | """ 7 | Implements a set of regression methods. 8 | 9 | Supported regression methods are least squares regression and 10 | ridge regression. Ridge regression can be used with generalized 11 | cross validation. (Hastie, Tibshirani and Friedman, Second edition, 12 | page 244) 13 | """ 14 | 15 | 16 | def __init__(self): 17 | ''' 18 | Constructor 19 | ''' 20 | 21 | def has_intercept(self): 22 | return self._intercept 23 | 24 | 25 | class LstsqRegressionLearner(RegressionLearner): 26 | """ 27 | This class performs Least Squares Regression. 28 | 29 | It finds the matrix X which solves: 30 | 31 | :math:`X = argmin(||AX - B||_2)` 32 | 33 | It can be used with intercept or without (by default intercept=True). 34 | 35 | """ 36 | 37 | def __init__(self, intercept=True): 38 | self._intercept = intercept 39 | 40 | def train(self, matrix_a, matrix_b): 41 | return Linalg.lstsq_regression(matrix_a, matrix_b, self._intercept) 42 | 43 | 44 | class RidgeRegressionLearner(RegressionLearner): 45 | """ 46 | This class performs Ridge Regression. 47 | 48 | It finds the matrix X which solves: 49 | 50 | :math:`X = argmin(||AX - B||_2 + \\lambda||X||_2)` 51 | 52 | It can be used with intercept or without (by default intercept=True). 53 | Cross validation can be used with default :math:`\\lambda` range of 54 | :math:`linspace(0, 5, 11)`. By default Generalized cross validation is performed. 
55 | If cross validation is set False it requires the input of a :math:`\\lambda` value. 56 | 57 | """ 58 | 59 | def __init__(self, intercept=True, param_range=None, crossvalidation=True, param=None): 60 | self._intercept = intercept 61 | self._param_range = param_range if param_range is not None else np.linspace(0.0, 5, 11) 62 | 63 | self._param = param 64 | self._crossvalidation = crossvalidation 65 | 66 | if param: 67 | self._crossvalidation = False 68 | self._param = param 69 | 70 | if not self._crossvalidation and self._param is None: 71 | raise ValueError("Cannot run (no-crossvalidation) RidgeRegression with no lambda value!") 72 | 73 | 74 | def train(self, matrix_a, matrix_b): 75 | """ 76 | If cross validation is set to True, it performs generalized 77 | cross validation. (Hastie, Tibshirani and Friedman, Second edition, 78 | page 244). 79 | """ 80 | 81 | if not self._crossvalidation: 82 | return Linalg.ridge_regression(matrix_a, matrix_b, self._param, 83 | self._intercept)[0] 84 | 85 | else: 86 | min_err_param = 0 87 | min_err = np.Inf 88 | gcv_err = np.Inf 89 | 90 | N = matrix_a.shape[0] 91 | for param in self._param_range: 92 | 93 | mat_x, S_trace, err1 = Linalg.ridge_regression(matrix_a, matrix_b, param, 94 | self._intercept) 95 | 96 | nom = pow(1 - S_trace / N, 2) * N 97 | if nom != 0: 98 | gcv_err = (err1 * err1) / nom 99 | 100 | if gcv_err < min_err: 101 | min_err = gcv_err 102 | min_err_param = param 103 | 104 | #print "lambda:", min_err_param 105 | return Linalg.ridge_regression(matrix_a, matrix_b, min_err_param, 106 | self._intercept)[0] 107 | -------------------------------------------------------------------------------- /src/composes/utils/scoring_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | import numpy as np 8 | from scipy import stats 9 | 10 | 11 | def score(gold, prediction, method): 12 | if 
len(gold) != len(prediction): 13 | raise ValueError("The two arrays must have the same length!") 14 | 15 | gold = np.array(gold, dtype=np.double) 16 | prediction = np.array(prediction, dtype=np.double) 17 | 18 | if method == "pearson": 19 | return pearson(gold, prediction)[0] 20 | elif method == "spearman": 21 | return spearman(gold, prediction)[0] 22 | elif method == "auc": 23 | return auc(gold, prediction) 24 | else: 25 | raise NotImplementedError("Unknown scoring measure:%s" % method) 26 | 27 | def pearson(gold, prediction): 28 | return stats.pearsonr(gold, prediction) 29 | 30 | def spearman(gold, prediction): 31 | return stats.spearmanr(gold, prediction, None) 32 | 33 | def auc(gold, prediction): 34 | 35 | positive = float(gold[gold == 1].size) 36 | negative = float(gold.size - positive) 37 | 38 | total_count = gold.size 39 | point_set = np.empty(total_count, dtype = [('gold',float),('score',float)]) 40 | for i in range(total_count): 41 | if not gold[i] in (0,1): 42 | raise ValueError("For evaluating AUC, gold scores are required to be 0 or 1.") 43 | point_set[i]=(gold[i], prediction[i]) 44 | 45 | point_set.sort(order = 'score') 46 | 47 | xi = 1.0 48 | yi = 1.0 49 | xi_old = 1.0 50 | true_positive = positive 51 | false_positive = negative 52 | auc = 0 53 | 54 | for i in range(total_count): 55 | if (point_set[i][0] == 1): 56 | true_positive -= 1 57 | yi = true_positive / positive 58 | else: 59 | false_positive -= 1 60 | xi = false_positive / negative 61 | auc += (xi_old - xi) * yi 62 | xi_old = xi 63 | 64 | return auc 65 | -------------------------------------------------------------------------------- /src/composes/utils/space_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | 8 | def list2dict(list_): 9 | return_dict = {} 10 | 11 | for idx, word in enumerate(list_): 12 | if word in return_dict: 13 | raise 
ValueError("duplicate string found in list: %s" % (word)) 14 | return_dict[word] = idx 15 | 16 | return return_dict 17 | 18 | def add_items_to_dict(dict_, list_): 19 | 20 | no_els = len(dict_) 21 | for idx, el in enumerate(list_): 22 | if el in dict_: 23 | raise ValueError("Found duplicate keys when appending elements to\ 24 | dictionary.") 25 | dict_[el] = no_els + idx 26 | return dict_ 27 | 28 | def assert_dict_match_list(dict_, list_): 29 | 30 | match_err = ValueError("expected matching dictionary and list structures.") 31 | 32 | if not len(list_) == len(dict_): 33 | raise match_err 34 | for (k, v) in dict_.iteritems(): 35 | if not list_[v] == k: 36 | raise match_err 37 | 38 | 39 | def assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id): 40 | 41 | no_rows = matrix_.mat.shape[0] 42 | no_cols = matrix_.mat.shape[1] 43 | 44 | has_column_maps = column2id or id2column 45 | 46 | if not no_rows == len(id2row) or not no_rows == len(row2id): 47 | raise ValueError("expected consistent shapes: %d %d %d" 48 | % (no_rows, len(id2row), len(row2id))) 49 | 50 | if (has_column_maps and 51 | (not no_cols == len(id2column) or not no_cols == len(column2id))): 52 | raise ValueError("expected consistent shapes: %d %d %d" 53 | % (no_cols, len(id2column), len(column2id))) 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/__init__.py -------------------------------------------------------------------------------- /src/examples/cmd_ex01.sh: -------------------------------------------------------------------------------- 1 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm -o ../examples/data/out/ 2 | python2.7 build_core_space.py -i ../examples/data/in/ex01 --input_format sm 
--output_format dm -w ppmi,plog -r svd_2 -n none,row -o ../examples/data/out/ -l ../examples/data/out/ex01.log 3 | #or 4 | python2.7 build_core_space.py ../examples/data/in/config1.cfg 5 | python2.7 build_core_space.py ../examples/data/in/config2.cfg 6 | -------------------------------------------------------------------------------- /src/examples/cmd_ex02.sh: -------------------------------------------------------------------------------- 1 | python2.7 build_peripheral_space.py -i ../examples/data/in/ex05 --input_format sm -o ../examples/data/out/ -c ../examples/data/out/CORE_SS.ex01.ppmi.svd_2.pkl 2 | -------------------------------------------------------------------------------- /src/examples/cmd_ex03.sh: -------------------------------------------------------------------------------- 1 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs1.txt -c 1,2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos,euclidean 2 | python2.7 compute_similarities.py -i ../examples/data/in/word_pairs2.txt -c 1,2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos,euclidean 3 | -------------------------------------------------------------------------------- /src/examples/cmd_ex04.sh: -------------------------------------------------------------------------------- 1 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl -o ../examples/data/out/ -m cos 2 | python2.7 compute_neighbours.py -i ../examples/data/in/word_list.txt -n 2 -s ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ -m cos -------------------------------------------------------------------------------- /src/examples/cmd_ex05.sh: -------------------------------------------------------------------------------- 1 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m dilation --lambda 2 -a ../examples/data/out/ex01.pkl -o 
../examples/data/out/ --output_format dm 2 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt -m mult -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm 3 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl -o ../examples/data/out/ --output_format dm 4 | python2.7 apply_composition.py -i ../examples/data/in/data_to_comp2.txt --load_model ../examples/data/out/model01.pkl -a ../examples/data/out/ex01.pkl,../examples/data/out/PER_SS.ex05.pkl -o ../examples/data/out/ --output_format dm 5 | -------------------------------------------------------------------------------- /src/examples/cmd_ex06.sh: -------------------------------------------------------------------------------- 1 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True 2 | python2.7 train_composition.py -i ../examples/data/in/train_data.txt -m lexical_func -r ridge --lambda 0.0 -a ../examples/data/out/ex01.pkl -p ../examples/data/out/PHRASE_SS.ex10.pkl -o ../examples/data/out/ --export_params True -------------------------------------------------------------------------------- /src/examples/cmd_ex07.sh: -------------------------------------------------------------------------------- 1 | python2.7 evaluate_similarities.py -i ../examples/data/in/sim_data.txt -c 3,5 -m pearson,spearman 2 | python2.7 evaluate_similarities.py --in_dir ../examples/data/in/ --filter sim_data -c 3,5 -m pearson,spearman 3 | -------------------------------------------------------------------------------- /src/examples/data/in/config1.cfg: -------------------------------------------------------------------------------- 1 | [build_core_space] 2 | 3 | #input file 4 | input=../examples/data/in/ex01 5 | 6 | # output directory 7 | 
output=../examples/data/out/ 8 | 9 | # input format 10 | input_format=sm 11 | 12 | -------------------------------------------------------------------------------- /src/examples/data/in/config2.cfg: -------------------------------------------------------------------------------- 1 | [build_core_space] 2 | 3 | #input file 4 | input=../examples/data/in/ex01 5 | 6 | # output directory 7 | output=../examples/out/ 8 | 9 | # input format 10 | input_format=sm 11 | 12 | # weighing schemes 13 | weighting=ppmi,plog 14 | 15 | # reductions 16 | reduction=svd_2 17 | 18 | # normalizations 19 | normalization=none,row 20 | 21 | # additional output format 22 | output_format=dm 23 | 24 | # log file 25 | log=../examples/data/out/ex01.log -------------------------------------------------------------------------------- /src/examples/data/in/data_to_comp.txt: -------------------------------------------------------------------------------- 1 | book book book__book 2 | car book car__book 3 | car car car__car 4 | -------------------------------------------------------------------------------- /src/examples/data/in/data_to_comp2.txt: -------------------------------------------------------------------------------- 1 | book history_book book__history_book 2 | car sports_car car__sports_car 3 | book sports_car book__sports_book 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.rows: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex01.sm: -------------------------------------------------------------------------------- 1 | car red 5 2 | book 
readable 6 3 | car blue 1 4 | book red 3 5 | -------------------------------------------------------------------------------- /src/examples/data/in/ex05.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex05.sm: -------------------------------------------------------------------------------- 1 | sports_car red 5 2 | history_book readable 1 3 | history_book red 1 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex10.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/in/ex10.rows: -------------------------------------------------------------------------------- 1 | book 2 | car 3 | bike 4 | good -------------------------------------------------------------------------------- /src/examples/data/in/ex10.sm: -------------------------------------------------------------------------------- 1 | car red 5 2 | book readable 6 3 | car blue 1 4 | book red 3 5 | bike blue 4 6 | bike red 4 7 | good readable 3 8 | good blue 2 9 | good red 6 -------------------------------------------------------------------------------- /src/examples/data/in/ex19-n.cols: -------------------------------------------------------------------------------- 1 | book 2 | car -------------------------------------------------------------------------------- /src/examples/data/in/ex19-n.sm: -------------------------------------------------------------------------------- 1 | man book 5 2 | man car 2 3 | boy book 7 4 | boy car 1 5 | woman book 5 6 | woman car 2 7 | -------------------------------------------------------------------------------- /src/examples/data/in/ex19-svo.cols: 
-------------------------------------------------------------------------------- 1 | book 2 | car -------------------------------------------------------------------------------- /src/examples/data/in/ex19-svo.sm: -------------------------------------------------------------------------------- 1 | man_hate_boy car 4 2 | man_hate_boy book 3 3 | man_hate_man car 10 4 | boy_hate_boy book 2 5 | boy_hate_man car 6 6 | boy_hate_boy car 11 7 | -------------------------------------------------------------------------------- /src/examples/data/in/sim_data.txt: -------------------------------------------------------------------------------- 1 | book history_book 0.894427191 other_field 4 2 | car sports_car 0.980580675691 other_field 4 3 | book sports_car 0.4472135955 other_field 6 -------------------------------------------------------------------------------- /src/examples/data/in/sim_data2.txt: -------------------------------------------------------------------------------- 1 | book history_book 0.894427191 other_field 6 2 | car sports_car 0.980580675691 other_field 4 3 | book sports_car 0.4472135955 other_field 5 -------------------------------------------------------------------------------- /src/examples/data/in/sim_data3.txt: -------------------------------------------------------------------------------- 1 | book book 0.894427191 other_field 4 2 | car car 0.980580675691 other_field 4 3 | book car 0.4472135955 other_field 6 -------------------------------------------------------------------------------- /src/examples/data/in/train_data.txt: -------------------------------------------------------------------------------- 1 | book_function car my_car_book 2 | book_function book 2x_book 3 | -------------------------------------------------------------------------------- /src/examples/data/in/word_list.txt: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | 
-------------------------------------------------------------------------------- /src/examples/data/in/word_pairs1.txt: -------------------------------------------------------------------------------- 1 | book book 2 | car book 3 | car car 4 | -------------------------------------------------------------------------------- /src/examples/data/in/word_pairs2.txt: -------------------------------------------------------------------------------- 1 | book history_book 2 | car sports_car 3 | book sports_car 4 | 5 | -------------------------------------------------------------------------------- /src/examples/data/in/word_sims.txt: -------------------------------------------------------------------------------- 1 | book book 7 2 | car car 7 3 | book car 2 -------------------------------------------------------------------------------- /src/examples/data/out/COMPOSED_SS.ex10.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'my_car_book' 17 | p8 18 | aS'my_special_book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | sS'_id2column' 36 | p17 37 | (lp18 38 | g14 39 | ag12 40 | ag13 41 | asS'_element_shape' 42 | p19 43 | (I3 44 | tp20 45 | sS'_cooccurrence_matrix' 46 | p21 47 | g0 48 | (ccomposes.matrix.sparse_matrix 49 | SparseMatrix 50 | p22 51 | g2 52 | Ntp23 53 | Rp24 54 | (dp25 55 | S'_mat' 56 | p26 57 | g0 58 | (cscipy.sparse.csr 59 | csr_matrix 60 | p27 61 | g2 62 | Ntp28 63 | Rp29 64 | (dp30 65 | S'format' 66 | p31 67 | S'csr' 68 | p32 69 | sS'_shape' 70 | p33 71 | (I2 72 | I3 73 | tp34 74 | sS'indptr' 75 | p35 76 | cnumpy.core.multiarray 77 | _reconstruct 78 | p36 79 | (cnumpy 80 | ndarray 81 | p37 
82 | (I0 83 | tp38 84 | S'b' 85 | p39 86 | tp40 87 | Rp41 88 | (I1 89 | (I3 90 | tp42 91 | cnumpy 92 | dtype 93 | p43 94 | (S'i4' 95 | p44 96 | I0 97 | I1 98 | tp45 99 | Rp46 100 | (I3 101 | S'<' 102 | p47 103 | NNNI-1 104 | I-1 105 | I0 106 | tp48 107 | bI00 108 | S'\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00' 109 | p49 110 | tp50 111 | bsS'indices' 112 | p51 113 | g36 114 | (g37 115 | (I0 116 | tp52 117 | g39 118 | tp53 119 | Rp54 120 | (I1 121 | (I5 122 | tp55 123 | g46 124 | I00 125 | S'\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00' 126 | p56 127 | tp57 128 | bsS'maxprint' 129 | p58 130 | I50 131 | sS'data' 132 | p59 133 | g36 134 | (g37 135 | (I0 136 | tp60 137 | g39 138 | tp61 139 | Rp62 140 | (I1 141 | (I5 142 | tp63 143 | g43 144 | (S'f8' 145 | p64 146 | I0 147 | I1 148 | tp65 149 | Rp66 150 | (I3 151 | S'<' 152 | p67 153 | NNNI-1 154 | I-1 155 | I0 156 | tp68 157 | bI00 158 | S'\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00(@' 159 | p69 160 | tp70 161 | bsbsbsS'_row2id' 162 | p71 163 | (dp72 164 | g8 165 | I0 166 | sg9 167 | I1 168 | ssb. 
-------------------------------------------------------------------------------- /src/examples/data/out/PER_SS.ex05.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | _reconstructor 3 | p0 4 | (ccomposes.semantic_space.peripheral_space 5 | PeripheralSpace 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'sports_car' 17 | p8 18 | aS'history_book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | g0 36 | (ccomposes.semantic_space.operation 37 | ScalingOperation 38 | p17 39 | g2 40 | Ntp18 41 | Rp19 42 | (dp20 43 | S'_ScalingOperation__scaling' 44 | p21 45 | g0 46 | (ccomposes.transformation.scaling.ppmi_weighting 47 | PpmiWeighting 48 | p22 49 | g2 50 | Ntp23 51 | Rp24 52 | sS'_ScalingOperation__column_stats' 53 | p25 54 | cnumpy.core.multiarray 55 | _reconstruct 56 | p26 57 | (cnumpy.matrixlib.defmatrix 58 | matrix 59 | p27 60 | (I0 61 | tp28 62 | S'b' 63 | p29 64 | tp30 65 | Rp31 66 | (I1 67 | (I1 68 | I3 69 | tp32 70 | cnumpy 71 | dtype 72 | p33 73 | (S'f8' 74 | p34 75 | I0 76 | I1 77 | tp35 78 | Rp36 79 | (I3 80 | S'<' 81 | p37 82 | NNNI-1 83 | I-1 84 | I0 85 | tp38 86 | bI01 87 | S'\x00\x00\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x18@' 88 | p39 89 | tp40 90 | bsbasS'_id2column' 91 | p41 92 | (lp42 93 | g14 94 | ag12 95 | ag13 96 | asS'_element_shape' 97 | p43 98 | (I3 99 | tp44 100 | sS'_cooccurrence_matrix' 101 | p45 102 | g0 103 | (ccomposes.matrix.sparse_matrix 104 | SparseMatrix 105 | p46 106 | g2 107 | Ntp47 108 | Rp48 109 | (dp49 110 | S'_mat' 111 | p50 112 | g0 113 | (cscipy.sparse.csr 114 | csr_matrix 115 | p51 116 | g2 117 | Ntp52 118 | Rp53 119 | (dp54 120 | S'format' 121 | p55 122 | S'csr' 123 | p56 124 | sS'_shape' 125 | p57 126 | (I2 127 | I3 128 | tp58 129 | 
sS'indptr' 130 | p59 131 | g26 132 | (cnumpy 133 | ndarray 134 | p60 135 | (I0 136 | tp61 137 | g29 138 | tp62 139 | Rp63 140 | (I1 141 | (I3 142 | tp64 143 | g33 144 | (S'i4' 145 | p65 146 | I0 147 | I1 148 | tp66 149 | Rp67 150 | (I3 151 | S'<' 152 | p68 153 | NNNI-1 154 | I-1 155 | I0 156 | tp69 157 | bI00 158 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00' 159 | p70 160 | tp71 161 | bsS'indices' 162 | p72 163 | g26 164 | (g60 165 | (I0 166 | tp73 167 | g29 168 | tp74 169 | Rp75 170 | (I1 171 | (I2 172 | tp76 173 | g67 174 | I00 175 | S'\x00\x00\x00\x00\x02\x00\x00\x00' 176 | p77 177 | tp78 178 | bsS'maxprint' 179 | p79 180 | I50 181 | sS'data' 182 | p80 183 | g26 184 | (g60 185 | (I0 186 | tp81 187 | g29 188 | tp82 189 | Rp83 190 | (I1 191 | (I2 192 | tp84 193 | g36 194 | I00 195 | S'\xaerF\xe8\x8f\x1d\xe4?"\x9a\x9a\xc7\xf7\x8f\xcc?' 196 | p85 197 | tp86 198 | bsbsbsS'_row2id' 199 | p87 200 | (dp88 201 | g9 202 | I1 203 | sg8 204 | I0 205 | ssb. -------------------------------------------------------------------------------- /src/examples/data/out/PHRASE_SS.ex10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/PHRASE_SS.ex10.pkl -------------------------------------------------------------------------------- /src/examples/data/out/ex01.cols: -------------------------------------------------------------------------------- 1 | red 2 | blue 3 | readable 4 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.dm: -------------------------------------------------------------------------------- 1 | car 5.0 1.0 0.0 2 | book 3.0 0.0 6.0 3 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.pkl: -------------------------------------------------------------------------------- 1 | ccopy_reg 2 | 
_reconstructor 3 | p0 4 | (ccomposes.semantic_space.space 5 | Space 6 | p1 7 | c__builtin__ 8 | object 9 | p2 10 | Ntp3 11 | Rp4 12 | (dp5 13 | S'_id2row' 14 | p6 15 | (lp7 16 | S'car' 17 | p8 18 | aS'book' 19 | p9 20 | asS'_column2id' 21 | p10 22 | (dp11 23 | S'blue' 24 | p12 25 | I1 26 | sS'readable' 27 | p13 28 | I2 29 | sS'red' 30 | p14 31 | I0 32 | ssS'_operations' 33 | p15 34 | (lp16 35 | sS'_id2column' 36 | p17 37 | (lp18 38 | g14 39 | ag12 40 | ag13 41 | asS'_element_shape' 42 | p19 43 | (I3 44 | tp20 45 | sS'_cooccurrence_matrix' 46 | p21 47 | g0 48 | (ccomposes.matrix.sparse_matrix 49 | SparseMatrix 50 | p22 51 | g2 52 | Ntp23 53 | Rp24 54 | (dp25 55 | S'_mat' 56 | p26 57 | g0 58 | (cscipy.sparse.csr 59 | csr_matrix 60 | p27 61 | g2 62 | Ntp28 63 | Rp29 64 | (dp30 65 | S'format' 66 | p31 67 | S'csr' 68 | p32 69 | sS'_shape' 70 | p33 71 | (I2 72 | I3 73 | tp34 74 | sS'indptr' 75 | p35 76 | cnumpy.core.multiarray 77 | _reconstruct 78 | p36 79 | (cnumpy 80 | ndarray 81 | p37 82 | (I0 83 | tp38 84 | S'b' 85 | p39 86 | tp40 87 | Rp41 88 | (I1 89 | (I3 90 | tp42 91 | cnumpy 92 | dtype 93 | p43 94 | (S'i4' 95 | p44 96 | I0 97 | I1 98 | tp45 99 | Rp46 100 | (I3 101 | S'<' 102 | p47 103 | NNNI-1 104 | I-1 105 | I0 106 | tp48 107 | bI00 108 | S'\x00\x00\x00\x00\x02\x00\x00\x00\x04\x00\x00\x00' 109 | p49 110 | tp50 111 | bsS'indices' 112 | p51 113 | g36 114 | (g37 115 | (I0 116 | tp52 117 | g39 118 | tp53 119 | Rp54 120 | (I1 121 | (I4 122 | tp55 123 | g46 124 | I00 125 | S'\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00' 126 | p56 127 | tp57 128 | bsS'maxprint' 129 | p58 130 | I50 131 | sS'data' 132 | p59 133 | g36 134 | (g37 135 | (I0 136 | tp60 137 | g39 138 | tp61 139 | Rp62 140 | (I1 141 | (I4 142 | tp63 143 | g43 144 | (S'f8' 145 | p64 146 | I0 147 | I1 148 | tp65 149 | Rp66 150 | (I3 151 | S'<' 152 | p67 153 | NNNI-1 154 | I-1 155 | I0 156 | tp68 157 | bI00 158 | 
S'\x00\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x18@' 159 | p69 160 | tp70 161 | bsbsbsS'_row2id' 162 | p71 163 | (dp72 164 | g8 165 | I0 166 | sg9 167 | I1 168 | ssb. -------------------------------------------------------------------------------- /src/examples/data/out/ex01.rows: -------------------------------------------------------------------------------- 1 | car 2 | book 3 | -------------------------------------------------------------------------------- /src/examples/data/out/ex01.sm: -------------------------------------------------------------------------------- 1 | car red 5.000000 2 | car blue 1.000000 3 | book red 3.000000 4 | book readable 6.000000 5 | -------------------------------------------------------------------------------- /src/examples/data/out/ex10.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/ex10.pkl -------------------------------------------------------------------------------- /src/examples/data/out/model01.params: -------------------------------------------------------------------------------- 1 | alpha 1.000000 2 | beta 1.000000 -------------------------------------------------------------------------------- /src/examples/data/out/model01.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/examples/data/out/model01.pkl -------------------------------------------------------------------------------- /src/examples/ex01.py: -------------------------------------------------------------------------------- 1 | #ex01.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | 5 | #create a space from co-occurrence counts in sparse format 6 | my_space = Space.build(data 
= "./data/in/ex01.sm", 7 | rows = "./data/in/ex01.rows", 8 | cols = "./data/in/ex01.cols", 9 | format = "sm") 10 | 11 | #export the space in sparse format 12 | my_space.export("./data/out/ex01", format = "sm") 13 | 14 | #export the space in dense format 15 | my_space.export("./data/out/ex01", format = "dm") 16 | -------------------------------------------------------------------------------- /src/examples/ex02.py: -------------------------------------------------------------------------------- 1 | #ex02.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | from composes.utils import io_utils 5 | 6 | #create a space from co-occurrence counts in sparse format 7 | my_space = Space.build(data = "./data/in/ex01.sm", 8 | rows = "./data/in/ex01.rows", 9 | cols = "./data/in/ex01.cols", 10 | format = "sm") 11 | 12 | #print the co-occurrence matrix of the space 13 | print my_space.cooccurrence_matrix 14 | 15 | #save the Space object in pickle format 16 | io_utils.save(my_space, "./data/out/ex01.pkl") 17 | 18 | #load the saved object 19 | my_space2 = io_utils.load("./data/out/ex01.pkl") 20 | 21 | #print the co-occurrence matrix of the loaded space 22 | print my_space2.cooccurrence_matrix 23 | 24 | -------------------------------------------------------------------------------- /src/examples/ex03.py: -------------------------------------------------------------------------------- 1 | #ex03.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 5 | 6 | #create a space from co-occurrence counts in sparse format 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #print the co-occurrence matrix of the space 10 | print my_space.cooccurrence_matrix 11 | 12 | #apply ppmi weighting 13 | my_space = my_space.apply(PpmiWeighting()) 14 | 15 | #print the co-occurrence matrix of the transformed space 16 | print my_space.cooccurrence_matrix 17 | 18 | 
-------------------------------------------------------------------------------- /src/examples/ex04.py: -------------------------------------------------------------------------------- 1 | #ex04.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.transformation.dim_reduction.svd import Svd 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #print the co-occurrence matrix and the columns of the space 10 | print my_space.cooccurrence_matrix 11 | print my_space.id2column 12 | 13 | #apply svd reduction 14 | my_space = my_space.apply(Svd(2)) 15 | 16 | #print the transformed space 17 | print my_space.cooccurrence_matrix 18 | print my_space.id2column 19 | -------------------------------------------------------------------------------- /src/examples/ex05.py: -------------------------------------------------------------------------------- 1 | #ex05.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.semantic_space.peripheral_space import PeripheralSpace 5 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 6 | 7 | 8 | #load a space and apply ppmi on it 9 | my_space = io_utils.load("./data/out/ex01.pkl") 10 | my_space = my_space.apply(PpmiWeighting()) 11 | 12 | print my_space.cooccurrence_matrix 13 | print my_space.id2row 14 | 15 | #create a peripheral space 16 | my_per_space = PeripheralSpace.build(my_space, 17 | data="./data/in/ex05.sm", 18 | cols="./data/in/ex05.cols", 19 | format="sm") 20 | 21 | print my_per_space.cooccurrence_matrix 22 | print my_per_space.id2row 23 | 24 | #save the space 25 | io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl") 26 | 27 | -------------------------------------------------------------------------------- /src/examples/ex06.py: -------------------------------------------------------------------------------- 1 | #ex06.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load 
a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | print my_space.cooccurrence_matrix 10 | print my_space.id2row 11 | 12 | #compute similarity between two words in the space 13 | print my_space.get_sim("car", "car", CosSimilarity()) 14 | print my_space.get_sim("car", "book", CosSimilarity()) 15 | -------------------------------------------------------------------------------- /src/examples/ex07.py: -------------------------------------------------------------------------------- 1 | #ex07.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load two spaces 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 9 | 10 | print my_space.id2row 11 | print my_per_space.id2row 12 | 13 | #compute similarity between a word and a phrase in the two spaces 14 | print my_space.get_sim("car", "sports_car", CosSimilarity(), 15 | space2 = my_per_space) 16 | -------------------------------------------------------------------------------- /src/examples/ex08.py: -------------------------------------------------------------------------------- 1 | #ex08.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | 9 | #get the top 2 neighbours of "car" 10 | print my_space.get_neighbours("car", 2, CosSimilarity()) 11 | -------------------------------------------------------------------------------- /src/examples/ex09.py: -------------------------------------------------------------------------------- 1 | #ex09.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.similarity.cos import CosSimilarity 5 | 6 | #load two spaces 7 | my_space = io_utils.load("./data/out/ex01.pkl") 8 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 9 | 10 | print my_space.id2row 11 | print 
my_space.cooccurrence_matrix 12 | print my_per_space.id2row 13 | print my_per_space.cooccurrence_matrix 14 | 15 | #get the top two neighbours of "car" in a peripheral space 16 | print my_space.get_neighbours("car", 2, CosSimilarity(), 17 | space2 = my_per_space) 18 | 19 | -------------------------------------------------------------------------------- /src/examples/ex10.py: -------------------------------------------------------------------------------- 1 | #ex10.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | #load a space 7 | my_space = io_utils.load("./data/out/ex10.pkl") 8 | 9 | print my_space.id2row 10 | print my_space.cooccurrence_matrix 11 | 12 | # instantiate a weighted additive model 13 | my_comp = WeightedAdditive(alpha = 1, beta = 1) 14 | 15 | # use the model to compose words in my_space 16 | composed_space = my_comp.compose([("good", "book", "good_book"), 17 | ("good", "car", "good_car")], 18 | my_space) 19 | 20 | print composed_space.id2row 21 | print composed_space.cooccurrence_matrix 22 | 23 | #save the composed space 24 | io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl") 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/examples/ex11.py: -------------------------------------------------------------------------------- 1 | #ex11.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | # instantiate a weighted additive model 7 | my_comp = WeightedAdditive(alpha = 1, beta = 1) 8 | 9 | #save it to pickle 10 | io_utils.save(my_comp, "./data/out/model01.pkl") 11 | 12 | #print its parameters 13 | my_comp.export("./data/out/model01.params") 14 | 15 | -------------------------------------------------------------------------------- /src/examples/ex12.py: -------------------------------------------------------------------------------- 1 
| #ex12.py 2 | #------- 3 | from composes.utils import io_utils 4 | 5 | #load a previously saved weighted additive model 6 | my_comp = io_utils.load("./data/out/model01.pkl") 7 | 8 | #print its parameters 9 | print "alpha:", my_comp.alpha 10 | print "beta:", my_comp.beta 11 | 12 | #load two spaces 13 | my_space = io_utils.load("./data/out/ex10.pkl") 14 | my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl") 15 | 16 | #apply the composition model to them 17 | composed_space = my_comp.compose([("good", "history_book", "good_history_book")], 18 | (my_space, my_per_space)) 19 | 20 | print composed_space.id2row 21 | print composed_space.cooccurrence_matrix 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/examples/ex13.py: -------------------------------------------------------------------------------- 1 | #ex13.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.weighted_additive import WeightedAdditive 5 | 6 | 7 | #training data 8 | train_data = [("good", "car", "good_car"), 9 | ("good", "book", "good_book") 10 | ] 11 | 12 | #load an argument space 13 | arg_space = io_utils.load("./data/out/ex10.pkl") 14 | print arg_space.id2row 15 | print arg_space.cooccurrence_matrix 16 | 17 | #load a phrase space 18 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 19 | print phrase_space.id2row 20 | print phrase_space.cooccurrence_matrix 21 | 22 | #train a weighted additive model on the data 23 | my_comp = WeightedAdditive() 24 | my_comp.train(train_data, arg_space, phrase_space) 25 | 26 | #print its parameters 27 | print "alpha:", my_comp.alpha 28 | print "beta:", my_comp.beta 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/examples/ex14.py: -------------------------------------------------------------------------------- 1 | #ex14.py 2 | #------- 3 | from composes.utils import io_utils 4 | from 
composes.composition.dilation import Dilation 5 | 6 | #training data 7 | train_data = [("good", "car", "good_car"), 8 | ("good", "book", "good_book") 9 | ] 10 | 11 | #load an argument space 12 | arg_space = io_utils.load("./data/out/ex10.pkl") 13 | 14 | #load a phrase space 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | print "Training phrase space" 17 | print phrase_space.id2row 18 | print phrase_space.cooccurrence_matrix 19 | 20 | #train a Dilation model on the data 21 | my_comp = Dilation() 22 | my_comp.train(train_data, arg_space, phrase_space) 23 | 24 | #print its parameters 25 | print "\nlambda:", my_comp._lambda 26 | 27 | #use the model to compose the train data 28 | composed_space = my_comp.compose([("good", "bike", "good_bike")], 29 | arg_space) 30 | print "\nComposed space:" 31 | print composed_space.id2row 32 | print composed_space.cooccurrence_matrix -------------------------------------------------------------------------------- /src/examples/ex15.py: -------------------------------------------------------------------------------- 1 | #ex15.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.full_additive import FullAdditive 5 | 6 | #training data 7 | train_data = [("good", "car", "good_car"), 8 | ("good", "book", "good_book") 9 | ] 10 | 11 | #load an argument space 12 | arg_space = io_utils.load("./data/out/ex10.pkl") 13 | 14 | #load a phrase space 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | print "Training phrase space" 17 | print phrase_space.id2row 18 | print phrase_space.cooccurrence_matrix 19 | 20 | #train a FullAdditive model on the data 21 | my_comp = FullAdditive() 22 | my_comp.train(train_data, arg_space, phrase_space) 23 | 24 | #print its parameters 25 | print "\nA:", my_comp._mat_a_t.transpose() 26 | print "B:", my_comp._mat_b_t.transpose() 27 | 28 | #use the model to compose the train data 29 | composed_space = my_comp.compose([("good", "bike", 
"good_bike")], 30 | arg_space) 31 | print "\nComposed space:" 32 | print composed_space.id2row 33 | print composed_space.cooccurrence_matrix 34 | -------------------------------------------------------------------------------- /src/examples/ex16.py: -------------------------------------------------------------------------------- 1 | #ex16.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.similarity.cos import CosSimilarity 6 | 7 | #training data 8 | #trying to learn a "good" function 9 | train_data = [("good_function", "car", "good_car"), 10 | ("good_function", "book", "good_book") 11 | ] 12 | 13 | #load argument and phrase space 14 | arg_space = io_utils.load("./data/out/ex10.pkl") 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | 17 | #train a lexical function model on the data 18 | my_comp = LexicalFunction() 19 | my_comp.train(train_data, arg_space, phrase_space) 20 | 21 | #print its parameters 22 | print "\nLexical function space:" 23 | print my_comp.function_space.id2row 24 | cooc_mat = my_comp.function_space.cooccurrence_matrix 25 | cooc_mat.reshape(my_comp.function_space.element_shape) 26 | print cooc_mat 27 | 28 | #similarity within the learned functional space 29 | print "\nSimilarity between good and good in the function space:" 30 | print my_comp.function_space.get_sim("good_function", "good_function", 31 | CosSimilarity()) -------------------------------------------------------------------------------- /src/examples/ex17.py: -------------------------------------------------------------------------------- 1 | #ex17.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.utils.regression_learner import RidgeRegressionLearner 6 | 7 | #training data 8 | #trying to learn a "good" function 9 | train_data = [("good_function", "car", "good_car"), 10 | 
("good_function", "book", "good_book") 11 | ] 12 | 13 | #load argument and phrase space 14 | arg_space = io_utils.load("./data/out/ex10.pkl") 15 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 16 | 17 | print "\nDefault regression:" 18 | my_comp = LexicalFunction() 19 | print type(my_comp.regression_learner).__name__ 20 | my_comp.train(train_data, arg_space, phrase_space) 21 | 22 | #print its parameters 23 | print "Lexical function space:" 24 | print my_comp.function_space.id2row 25 | cooc_mat = my_comp.function_space.cooccurrence_matrix 26 | cooc_mat.reshape(my_comp.function_space.element_shape) 27 | print cooc_mat 28 | 29 | print "\nRidge Regression with lambda = 2" 30 | rr_learner=RidgeRegressionLearner(param = 2, 31 | intercept = False, 32 | crossvalidation=False) 33 | my_comp = LexicalFunction(learner = rr_learner) 34 | my_comp.train(train_data, arg_space, phrase_space) 35 | 36 | #print its parameters 37 | print "Lexical function space:" 38 | print my_comp.function_space.id2row 39 | cooc_mat = my_comp.function_space.cooccurrence_matrix 40 | cooc_mat.reshape(my_comp.function_space.element_shape) 41 | print cooc_mat 42 | -------------------------------------------------------------------------------- /src/examples/ex18.py: -------------------------------------------------------------------------------- 1 | #ex18.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.composition.lexical_function import LexicalFunction 5 | 6 | #training data 7 | #trying to learn a "book" function 8 | train_data = [("good_function", "car", "good_car"), 9 | ("good_function", "book", "good_book") 10 | ] 11 | 12 | #load argument and phrase space 13 | arg_space = io_utils.load("./data/out/ex10.pkl") 14 | phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl") 15 | 16 | #train a lexical function model on the data 17 | my_comp = LexicalFunction() 18 | my_comp.train(train_data, arg_space, phrase_space) 19 | 20 | #apply the trained model 21 | comp_sp1 = 
my_comp.compose([("good_function", "car", 22 | "good_car")], 23 | arg_space) 24 | 25 | #apply the trained model a second time 26 | comp_sp2 = my_comp.compose([("good_function", "good_car", 27 | "good_good_car")], 28 | comp_sp1) 29 | 30 | 31 | #print the composed spaces: 32 | print "\nComposed space 1:" 33 | print comp_sp1.id2row 34 | print comp_sp1.cooccurrence_matrix 35 | 36 | print "\nComposed space 2:" 37 | print comp_sp2.id2row 38 | print comp_sp2.cooccurrence_matrix 39 | -------------------------------------------------------------------------------- /src/examples/ex19.py: -------------------------------------------------------------------------------- 1 | #ex19.py 2 | #------- 3 | from composes.semantic_space.space import Space 4 | from composes.composition.lexical_function import LexicalFunction 5 | from composes.utils.regression_learner import LstsqRegressionLearner 6 | 7 | #training data1: VO N -> SVO 8 | train_vo_data = [("hate_boy", "man", "man_hate_boy"), 9 | ("hate_man", "man", "man_hate_man"), 10 | ("hate_boy", "boy", "boy_hate_boy"), 11 | ("hate_man", "boy", "boy_hate_man") 12 | ] 13 | 14 | #training data2: V N -> VO 15 | train_v_data = [("hate", "man", "hate_man"), 16 | ("hate", "boy", "hate_boy") 17 | ] 18 | 19 | #load N and SVO spaces 20 | n_space = Space.build(data = "./data/in/ex19-n.sm", 21 | cols = "./data/in/ex19-n.cols", 22 | format = "sm") 23 | 24 | svo_space = Space.build(data = "./data/in/ex19-svo.sm", 25 | cols = "./data/in/ex19-svo.cols", 26 | format = "sm") 27 | 28 | print "\nInput SVO training space:" 29 | print svo_space.id2row 30 | print svo_space.cooccurrence_matrix 31 | 32 | #1. train a model to learn VO functions on train data: VO N -> SVO 33 | print "\nStep 1 training" 34 | vo_model = LexicalFunction(learner=LstsqRegressionLearner()) 35 | vo_model.train(train_vo_data, n_space, svo_space) 36 | 37 | #2. 
train a model to learn V functions on train data: V N -> VO 38 | # where VO space: function space learned in step 1 39 | print "\nStep 2 training" 40 | vo_space = vo_model.function_space 41 | v_model = LexicalFunction(learner=LstsqRegressionLearner()) 42 | v_model.train(train_v_data, n_space, vo_space) 43 | 44 | #print the learned model 45 | print "\n3D Verb space" 46 | print v_model.function_space.id2row 47 | print v_model.function_space.cooccurrence_matrix 48 | 49 | 50 | #3. use the trained models to compose new SVO sentences 51 | 52 | #3.1 use the V model to create new VO combinations 53 | vo_composed_space = v_model.compose([("hate", "woman", "hate_woman"), 54 | ("hate", "man", "hate_man")], 55 | n_space) 56 | 57 | #3.2 the new VO combinations will be used as functions: 58 | # load the new VO combinations obtained through composition into 59 | # a new composition model 60 | expanded_vo_model = LexicalFunction(function_space=vo_composed_space, 61 | intercept=v_model._has_intercept) 62 | 63 | #3.3 use the new VO combinations by composing them with subject nouns 64 | # in order to obtain new SVO sentences 65 | svo_composed_space = expanded_vo_model.compose([("hate_woman", "woman", "woman_hates_woman"), 66 | ("hate_man", "man", "man_hates_man")], 67 | n_space) 68 | 69 | #print the composed spaces: 70 | print "\nVO composed space:" 71 | print vo_composed_space.id2row 72 | print vo_composed_space.cooccurrence_matrix 73 | 74 | #print the composed spaces: 75 | print "\nSVO composed space:" 76 | print svo_composed_space.id2row 77 | print svo_composed_space.cooccurrence_matrix 78 | 79 | -------------------------------------------------------------------------------- /src/examples/ex20.py: -------------------------------------------------------------------------------- 1 | #ex20.py 2 | #------- 3 | from composes.utils import io_utils 4 | from composes.utils import scoring_utils 5 | from composes.similarity.cos import CosSimilarity 6 | 7 | #read in a space 8 | my_space = 
io_utils.load("data/out/ex01.pkl") 9 | 10 | #compute similarities of a list of word pairs 11 | fname = "data/in/word_sims.txt" 12 | word_pairs = io_utils.read_tuple_list(fname, fields=[0,1]) 13 | predicted = my_space.get_sims(word_pairs, CosSimilarity()) 14 | 15 | #compute correlations 16 | gold = io_utils.read_list(fname, field=2) 17 | print "Spearman" 18 | print scoring_utils.score(gold, predicted, "spearman") 19 | print "Pearson" 20 | print scoring_utils.score(gold, predicted, "pearson") -------------------------------------------------------------------------------- /src/examples/exercise.sh: -------------------------------------------------------------------------------- 1 | # set pythonpath 2 | export PYTHONPATH=/home/thenghia.pham/git/toolkit/src:$PYTHONPATH 3 | export TOOLKIT_DIR=/home/thenghia.pham/git/toolkit 4 | export OUT_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/data/tutorial 5 | export DATA_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial 6 | export LOG_FILE=$OUT_DIR/log/exercises.log 7 | 8 | #************************************************************************************** 9 | echo Step1 10 | echo STARTING BUILDING CORE 11 | export CORE_IN_FILE_PREFIX=CORE_SS.verbnoun.core 12 | export CORE_OUT_DIR=$OUT_DIR/core 13 | 14 | # run build core space pipeline 15 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_core_space.py -i $DATA_DIR/$CORE_IN_FILE_PREFIX --input_format=pkl -o $CORE_OUT_DIR -w ppmi -s top_sum_2000 -r svd_100 --output_format=dm -l $LOG_FILE 16 | 17 | echo FINISHED BUILDING CORE 18 | 19 | #************************************************************************************** 20 | echo Step2 21 | echo STARTING PERIPHERAL PIPELINE 22 | export CORE_SPC=CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl 23 | 24 | export PER_RAW_FILE=$DATA_DIR/per.raw.SV 25 | export PER_OUT_DIR=$OUT_DIR/per 26 | 27 | # run build peripheral space pipeline 28 | /opt/python/bin/python2.7 
# src/examples/exercise.sh
# End-to-end DISSECT tutorial pipeline: build core space, build peripheral
# space, train a composition model, compose phrases, compute and evaluate
# similarities.

# set pythonpath
export PYTHONPATH=/home/thenghia.pham/git/toolkit/src:$PYTHONPATH
export TOOLKIT_DIR=/home/thenghia.pham/git/toolkit
export OUT_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/data/tutorial
export DATA_DIR=/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial
export LOG_FILE=$OUT_DIR/log/exercises.log

#**************************************************************************************
echo Step1
echo STARTING BUILDING CORE
export CORE_IN_FILE_PREFIX=CORE_SS.verbnoun.core
export CORE_OUT_DIR=$OUT_DIR/core

# run build core space pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_core_space.py -i $DATA_DIR/$CORE_IN_FILE_PREFIX --input_format=pkl -o $CORE_OUT_DIR -w ppmi -s top_sum_2000 -r svd_100 --output_format=dm -l $LOG_FILE

echo FINISHED BUILDING CORE

#**************************************************************************************
echo Step2
echo STARTING PERIPHERAL PIPELINE
export CORE_SPC=CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl

export PER_RAW_FILE=$DATA_DIR/per.raw.SV
export PER_OUT_DIR=$OUT_DIR/per

# run build peripheral space pipeline
# BUGFIX: "dm" was passed as a stray positional argument; it is the value
# of --output_format (cf. the build_core_space invocation above).
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/build_peripheral_space.py -i $PER_RAW_FILE --input_format sm -c $CORE_OUT_DIR/$CORE_SPC -o $PER_OUT_DIR --output_format dm -l $LOG_FILE

echo FINISHED PERIPHERAL PIPELINE

#**************************************************************************************
echo step3
echo STARTING TRAINING

export MODEL_DIR=$OUT_DIR/trained
export TRAIN_FILE=$DATA_DIR/ML08_SV_train.txt
export PER_SPC=PER_SS.per.raw.SV.CORE_SS.CORE_SS.verbnoun.core.ppmi.top_sum_2000.svd_100.pkl
export MODEL=lexical_func

# run training pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/train_composition.py -i $TRAIN_FILE -m $MODEL -o $MODEL_DIR -a $CORE_OUT_DIR/$CORE_SPC -p $PER_OUT_DIR/$PER_SPC --regression ridge --intercept True --crossvalidation False --lambda 2.0 -l $LOG_FILE

echo FINISHED TRAINING
#**************************************************************************************
echo step 4
echo STARTING COMPOSING SPACE

export TRNED_MODEL=TRAINED_COMP_MODEL.lexical_func.ML08_SV_train.txt.pkl
export COMP_DIR=$OUT_DIR/composed
export COMP_FILE=$DATA_DIR/ML08nvs_test.txt

# run apply composition pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/apply_composition.py -i $COMP_FILE --load_model $MODEL_DIR/$TRNED_MODEL -o $COMP_DIR -a $CORE_OUT_DIR/$CORE_SPC -l $LOG_FILE

echo FINISHED COMPOSING SPACE
#**************************************************************************************
echo step 5
echo STARTING COMPUTING SIMS

export COMP_SPC=COMPOSED_SS.LexicalFunction.ML08nvs_test.txt.pkl
export SIM_DIR=$OUT_DIR/similarity
export TEST_FILE=$DATA_DIR/ML08data_new.txt

# create output directory for similarity if the directory doesn't exist
if [ ! -d "$SIM_DIR" ]; then
    mkdir $SIM_DIR
fi

# run sim pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/compute_similarities.py -i $TEST_FILE -s $COMP_DIR/$COMP_SPC -o $SIM_DIR -m cos,lin,dot_prod,euclidean -c 1,2 -l $LOG_FILE

echo FINISH COMPUTE SIMS
#**************************************************************************************
echo step 6
echo STARTING EVAL SIMS

# run evaluation pipeline
/opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/evaluate_similarities.py --in_dir $SIM_DIR -m spearman,pearson -c 3,4 -l $LOG_FILE
echo FINISH EVAL SIMS
-d "$SIM_DIR" ]; then 67 | mkdir $SIM_DIR 68 | fi 69 | 70 | # run sim pipeline 71 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/compute_similarities.py -i $TEST_FILE -s $COMP_DIR/$COMP_SPC -o $SIM_DIR -m cos,lin,dot_prod,euclidean -c 1,2 -l $LOG_FILE 72 | 73 | echo FINISH COMPUTE SIMS 74 | #************************************************************************************** 75 | echo step 6 76 | echo STARTING EVAL SIMS 77 | 78 | # run evaluation pipeline 79 | /opt/python/bin/python2.7 $TOOLKIT_DIR/src/pipelines/evaluate_similarities.py --in_dir $SIM_DIR -m spearman,pearson -c 3,4 -l $LOG_FILE 80 | echo FINISH EVAL SIMS 81 | -------------------------------------------------------------------------------- /src/examples/full_example.py: -------------------------------------------------------------------------------- 1 | from composes.similarity.cos import CosSimilarity 2 | from composes.semantic_space.peripheral_space import PeripheralSpace 3 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 4 | from composes.transformation.dim_reduction.svd import Svd 5 | from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection 6 | from composes.composition.lexical_function import LexicalFunction 7 | from composes.composition.full_additive import FullAdditive 8 | from composes.composition.weighted_additive import WeightedAdditive 9 | from composes.composition.multiplicative import Multiplicative 10 | from composes.composition.dilation import Dilation 11 | from composes.utils.regression_learner import RidgeRegressionLearner 12 | 13 | import composes.utils.io_utils as io_utils 14 | import composes.utils.scoring_utils as scoring_utils 15 | 16 | #load a core space 17 | print "Loading the data..." 18 | data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/" 19 | 20 | space_file = data_path + "CORE_SS.verbnoun.core.pkl" 21 | space = io_utils.load(space_file) 22 | 23 | print "Applying PPMI..." 
24 | space = space.apply(PpmiWeighting()) 25 | 26 | print "Applying feature selection..." 27 | space = space.apply(TopFeatureSelection(2000)) 28 | 29 | print "Applying SVD..." 30 | space = space.apply(Svd(100)) 31 | 32 | print "Creating peripheral space.." 33 | per_space = PeripheralSpace.build(space, 34 | data = data_path + "per.raw.SV.sm", 35 | cols = data_path + "per.raw.SV.cols", 36 | format = "sm" 37 | ) 38 | 39 | #reading in train data 40 | train_data_file = data_path + "ML08_SV_train.txt" 41 | train_data = io_utils.read_tuple_list(train_data_file, fields=[0,1,2]) 42 | 43 | print "Training Lexical Function composition model..." 44 | comp_model = LexicalFunction(learner = RidgeRegressionLearner(param=2)) 45 | comp_model.train(train_data, space, per_space) 46 | 47 | print "Composing phrases..." 48 | test_phrases_file = data_path + "ML08nvs_test.txt" 49 | test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2]) 50 | composed_space = comp_model.compose(test_phrases, space) 51 | 52 | print "Reading similarity test data..." 53 | test_similarity_file = data_path + "ML08data_new.txt" 54 | test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1]) 55 | gold = io_utils.read_list(test_similarity_file, field=2) 56 | 57 | print "Computing similarity with lexical function..." 58 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 59 | 60 | #use this composed space to assign similarities 61 | print "Scoring lexical function..." 62 | print scoring_utils.score(gold, pred, "spearman") 63 | 64 | 65 | print "Training Full Additive composition model..." 66 | comp_model = FullAdditive(learner = RidgeRegressionLearner(param=2)) 67 | comp_model.train(train_data, space, per_space) 68 | composed_space = comp_model.compose(test_phrases, space) 69 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 70 | print scoring_utils.score(gold, pred, "spearman") 71 | 72 | print "Training Weighted Additive composition model..." 
73 | comp_model = WeightedAdditive() 74 | comp_model.train(train_data, space, per_space) 75 | print "alpha, beta:", comp_model.alpha, comp_model.beta 76 | composed_space = comp_model.compose(test_phrases, space) 77 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 78 | print scoring_utils.score(gold, pred, "spearman") 79 | 80 | print "Training Dilation composition model..." 81 | comp_model = Dilation() 82 | comp_model.train(train_data, space, per_space) 83 | print "lambda:", comp_model._lambda 84 | composed_space = comp_model.compose(test_phrases, space) 85 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 86 | print scoring_utils.score(gold, pred, "spearman") 87 | 88 | print "Multiplicative composition model..." 89 | comp_model = Multiplicative() 90 | composed_space = comp_model.compose(test_phrases, space) 91 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 92 | print scoring_utils.score(gold, pred, "spearman") 93 | 94 | print "Simple additive composition model..." 95 | comp_model = WeightedAdditive(1,1) 96 | composed_space = comp_model.compose(test_phrases, space) 97 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 98 | print scoring_utils.score(gold, pred, "spearman") 99 | 100 | print "Simple dilation composition model..." 
101 | comp_model = Dilation() 102 | composed_space = comp_model.compose(test_phrases, space) 103 | pred = composed_space.get_sims(test_pairs, CosSimilarity()) 104 | print scoring_utils.score(gold, pred, "spearman") 105 | -------------------------------------------------------------------------------- /src/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/composes-toolkit/dissect/671b1778f0f5ba962f6fe5c5c384e7647f1e7d60/src/pipelines/__init__.py -------------------------------------------------------------------------------- /src/pipelines/compute_neighbours.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | ''' 7 | Created on Oct 17, 2012 8 | 9 | @author: Georgiana Dinu, Pham The Nghia 10 | ''' 11 | 12 | ''' 13 | Created on Jun 12, 2012 14 | 15 | @author: thenghia.pham 16 | ''' 17 | 18 | 19 | import sys 20 | import getopt 21 | from ConfigParser import ConfigParser 22 | from composes.semantic_space.space import Space 23 | from composes.similarity.cos import CosSimilarity 24 | from composes.similarity.lin import LinSimilarity 25 | from composes.similarity.dot_prod import DotProdSimilarity 26 | from composes.similarity.euclidean import EuclideanSimilarity 27 | from composes.utils import io_utils 28 | from composes.utils import log_utils 29 | import pipeline_utils as utils 30 | import logging 31 | logger = logging.getLogger("test vector space construction pipeline") 32 | 33 | 34 | 35 | def usage(errno=0): 36 | print >>sys.stderr,\ 37 | """Usage: 38 | python compute_similarities.py [options] [config_file] 39 | 40 | Options: 41 | -i --input : input file. 42 | -o --output : output directory. 43 | -s --space : file of semantic space. The second 44 | word of a word pair is interpreted in the second space argument, 45 | if provided. 
46 | -m --sim_measure : similarity measure 47 | -n --no_neighbours : number of neighbours to be returned 48 | -l --log : log file. Optional. 49 | -h --help : help 50 | 51 | Arguments: 52 | config_file: , used as default values for configuration options above. 53 | If you don't specify these options in [options] the value from the 54 | config_file will be used. 55 | 56 | Example: 57 | """ 58 | sys.exit(errno) 59 | 60 | 61 | def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files): 62 | sim_dict = {"cos": CosSimilarity(), 63 | "lin": LinSimilarity(), 64 | "dot_prod": DotProdSimilarity(), 65 | "euclidean": EuclideanSimilarity()} 66 | 67 | if not sim_measure in sim_dict: 68 | raise ValueError("Similarity measure:%s not defined" % sim_measure) 69 | 70 | space = io_utils.load(space_files[0], Space) 71 | space2 = None 72 | space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1]) 73 | if len(space_files) == 2: 74 | space2 = io_utils.load(space_files[1], Space) 75 | space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1]) 76 | 77 | sim = sim_dict[sim_measure] 78 | 79 | descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr]) 80 | out_file = '%s/%s.%s' % (out_dir, descr, sim_measure) 81 | io_utils.create_parent_directories(out_file) 82 | 83 | data = io_utils.read_list(in_file) 84 | 85 | print "Computing neighbours: %s" % sim_measure 86 | with open(out_file,"w") as out_stream: 87 | for word in data: 88 | out_stream.write("%s\n" % word) 89 | result = space.get_neighbours(word, no_neighbours, sim, space2) 90 | for neighbour, neighbour_sim in result: 91 | out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim)) 92 | 93 | def main(sys_argv): 94 | try: 95 | opts, argv = getopt.getopt(sys_argv[1:], "hi:o:s:m:n:l:", 96 | ["help", "input=", "output=", "sim_measures=", 97 | "space=", "log=", "no_neighbours="]) 98 | except getopt.GetoptError, err: 99 | print str(err) 100 | usage() 101 | sys.exit(1) 102 
| 103 | section = "compute_neighbours" 104 | 105 | out_dir = None 106 | in_file = None 107 | sim_measure = None 108 | spaces = None 109 | log_file = None 110 | no_neighbours = "20" 111 | 112 | 113 | if (len(argv) == 1): 114 | config_file = argv[0] 115 | with open(config_file) as f: 116 | pass 117 | config = ConfigParser() 118 | config.read(config_file) 119 | out_dir = utils.config_get(section, config, "output", None) 120 | in_file = utils.config_get(section, config, "input", None) 121 | sim_measure = utils.config_get(section, config, "sim_measure", None) 122 | spaces = utils.config_get(section, config, "space", None) 123 | if not spaces is None: 124 | spaces = spaces.split(",") 125 | no_neighbours = utils.config_get(section, config, "no_neighbours", no_neighbours) 126 | log_file = utils.config_get(section, config, "log", None) 127 | 128 | for opt, val in opts: 129 | if opt in ("-i", "--input"): 130 | in_file = val 131 | elif opt in ("-o", "--output"): 132 | out_dir = val 133 | elif opt in ("-m", "--sim_measure"): 134 | sim_measure = val 135 | elif opt in ("-s", "--space"): 136 | spaces = val.split(",") 137 | elif opt in ("-n", "--no_neighbours"): 138 | no_neighbours = val 139 | elif opt in ("-l", "--log"): 140 | log_file = val 141 | elif opt in ("-h", "--help"): 142 | usage() 143 | sys.exit(0) 144 | else: 145 | usage(1) 146 | 147 | log_utils.config_logging(log_file) 148 | 149 | no_neighbours = int(no_neighbours) 150 | 151 | utils.assert_option_not_none(in_file, "Input file required", usage) 152 | utils.assert_option_not_none(out_dir, "Output directory required", usage) 153 | utils.assert_option_not_none(sim_measure, "Similarity measure required", usage) 154 | utils.assert_option_not_none(spaces, "Semantic space file required", usage) 155 | 156 | compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, spaces) 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | main(sys.argv) 
-------------------------------------------------------------------------------- /src/pipelines/evaluate_similarities.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 17, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | ''' 7 | Created on Oct 17, 2012 8 | 9 | @author: Georgiana Dinu, Pham The Nghia 10 | ''' 11 | 12 | ''' 13 | Created on Jun 12, 2012 14 | 15 | @author: thenghia.pham 16 | ''' 17 | 18 | 19 | import sys 20 | import getopt 21 | import os 22 | from ConfigParser import ConfigParser 23 | from composes.utils import scoring_utils 24 | from composes.utils import log_utils 25 | import pipeline_utils as utils 26 | 27 | import logging 28 | logger = logging.getLogger("test vector space construction pipeline") 29 | 30 | 31 | 32 | def usage(errno=0): 33 | print >>sys.stderr,\ 34 | """Usage: 35 | python compute_similarities.py [options] [config_file] 36 | 37 | Options: 38 | -i --input : input file. 39 | --in_dir: : input directory, all files that pass the --filter are tested. 40 | -i value is ignored. Optional. 41 | --filter: : when --in_dir, it acts as a filter on the files to be tested: 42 | only files containing this substring are tested. Optional, 43 | default all files in in_dir are tested. 44 | -m --correlation_measure : comma-separated correlation measures 45 | -c --columns <(int,int)>: pair of columns, indicating which columns contain 46 | the words to be compared 47 | -l --log : log file. Optional, default ./build_core_space.log 48 | -h --help : help 49 | 50 | Arguments: 51 | config_file: , used as default values for configuration options above. 52 | If you don't specify these options in [options] the value from the 53 | config_file will be used. 
54 | 55 | Example: 56 | """ 57 | sys.exit(errno) 58 | 59 | def evaluate_sim(in_file, columns, corr_measures): 60 | 61 | if not len(columns) == 2: 62 | raise ValueError("Column description unrecognized!") 63 | col0 = int(columns[0]) - 1 64 | col1 = int(columns[1]) - 1 65 | 66 | gold = [] 67 | prediction = [] 68 | with open(in_file) as in_stream: 69 | for line in in_stream: 70 | if not line.strip() == "": 71 | elems = line.strip().split() 72 | gold.append(float(elems[col0])) 73 | prediction.append(float(elems[col1])) 74 | 75 | for corr_measure in corr_measures: 76 | print "CORRELATION:%s" % corr_measure 77 | corr = scoring_utils.score(gold, prediction, corr_measure) 78 | print "\t%f" % corr 79 | 80 | 81 | def evaluate_sim_batch(in_dir, columns, corr_measures, filter_=""): 82 | 83 | if not os.path.exists(in_dir): 84 | raise ValueError("Input directory not found: %s" % in_dir) 85 | 86 | if not in_dir.endswith("/"): 87 | in_dir = in_dir + "/" 88 | 89 | for file_ in os.listdir(in_dir): 90 | if file_.find(filter_) != -1: 91 | print file_ 92 | evaluate_sim(in_dir + file_, columns, corr_measures) 93 | 94 | 95 | def main(sys_argv): 96 | try: 97 | opts, argv = getopt.getopt(sys_argv[1:], "hi:m:c:l:", 98 | ["help", "input=", "correlation_measure=", 99 | "columns=", "log=", "in_dir=", "filter="]) 100 | 101 | except getopt.GetoptError, err: 102 | print str(err) 103 | usage() 104 | sys.exit(1) 105 | 106 | in_file = None 107 | in_dir = None 108 | filter_ = "" 109 | corr_measures = None 110 | columns = None 111 | log_file = None 112 | 113 | section = "evaluate_similarities" 114 | 115 | if (len(argv) == 1): 116 | config_file = argv[0] 117 | config = ConfigParser() 118 | config.read(config_file) 119 | in_file = utils.config_get(section, config, "input", None) 120 | in_dir = utils.config_get(section, config, "in_dir", None) 121 | filter_ = utils.config_get(section, config, "filter", filter_) 122 | corr_measures = utils.config_get(section, config, "correlation_measure", None) 123 | if 
not corr_measures is None: 124 | corr_measures = corr_measures.split(",") 125 | columns = utils.config_get(section, config, "columns", None) 126 | if not columns is None: 127 | columns = columns.split(",") 128 | log_file = utils.config_get(section, config, "log", None) 129 | 130 | for opt, val in opts: 131 | if opt in ("-i", "--input"): 132 | in_file = val 133 | elif opt in ("-m", "--correlation_measure"): 134 | corr_measures = val.split(",") 135 | elif opt in ("-c", "--columns"): 136 | columns = val.split(",") 137 | elif opt == "--in_dir": 138 | in_dir = val 139 | elif opt == "--filter": 140 | filter_ = val 141 | elif opt in ("-l", "--log"): 142 | log_file = val 143 | elif opt in ("-h", "--help"): 144 | usage() 145 | sys.exit(0) 146 | else: 147 | usage(1) 148 | 149 | log_utils.config_logging(log_file) 150 | 151 | utils.assert_option_not_none(corr_measures, "Correlation measures required", usage) 152 | utils.assert_option_not_none(columns, "Columns to be read from input file required", usage) 153 | 154 | if len(columns) != 2: 155 | raise ValueError("Columns (-c) field should contain two comma-separated integers (e.g. 
-c 3,4)") 156 | 157 | if not in_dir is None: 158 | evaluate_sim_batch(in_dir, columns, corr_measures, filter_) 159 | else: 160 | utils.assert_option_not_none(in_file, "Input file required", usage) 161 | evaluate_sim(in_file, columns, corr_measures) 162 | 163 | if __name__ == '__main__': 164 | main(sys.argv) 165 | -------------------------------------------------------------------------------- /src/pipelines/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | 7 | def assert_bool(option, message, usage): 8 | if option not in (True, False): 9 | print message 10 | usage(1) 11 | 12 | def assert_option_not_none(option, message, usage): 13 | if option is None: 14 | print message 15 | usage(1) 16 | 17 | def assert_xor_options(option1, option2, message, usage): 18 | if not ((option1 is None) ^ (option2 is None)): 19 | print message 20 | usage(1) 21 | 22 | def config_get(section, config, option, default): 23 | return config.get(section, option) if config.has_option(section, option) else default 24 | -------------------------------------------------------------------------------- /src/unitest/__init__.py: -------------------------------------------------------------------------------- 1 | current_file = __file__ 2 | toolkit_dir = "/".join(current_file.split("/")[0:-3]) 3 | data_dir = toolkit_dir + "/resource/unittest/" -------------------------------------------------------------------------------- /src/unitest/bps_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | 8 | import numpy as np 9 | 10 | from pipelines import build_peripheral_space as bps 11 | from pipelines import build_core_space as bcs 12 | from composes.semantic_space.space import Space 13 | 14 | from unitest 
import data_dir 15 | import pytest 16 | 17 | 18 | class Test(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.dir_ = data_dir + "pipelines_test_resources/" 22 | 23 | def _test_equal_spaces_structs(self, sp, new_sp): 24 | self.assertListEqual(sp.id2row, new_sp.id2row) 25 | self.assertListEqual(sp.id2column, new_sp.id2column) 26 | self.assertDictEqual(sp.row2id, new_sp.row2id) 27 | self.assertDictEqual(sp.column2id, new_sp.column2id) 28 | 29 | def _test_equal_spaces_dense(self, sp, new_sp): 30 | 31 | self._test_equal_spaces_structs(sp, new_sp) 32 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat, new_sp.cooccurrence_matrix.mat, 6) 33 | 34 | def _test_equal_spaces_sparse(self, sp, new_sp): 35 | 36 | self._test_equal_spaces_structs(sp, new_sp) 37 | np.testing.assert_array_almost_equal(sp.cooccurrence_matrix.mat.todense(), new_sp.cooccurrence_matrix.mat.todense(), 6) 38 | 39 | def test_raises(self): 40 | with pytest.raises(SystemExit): 41 | bps.main(["build_peripheral_space.py", "-h"]) 42 | 43 | with pytest.raises(SystemExit): 44 | bps.main([ 45 | "build_peripheral_space.py", 46 | "-l", '/tmp/test_build_peripheral_space.log', 47 | "-h", 48 | ]) 49 | 50 | def tttest_simple_sparse_batch(self): 51 | 52 | bps.main(["build_peripheral_space.py", 53 | "-l", self.dir_ + "log1.txt", 54 | "-i", self.dir_ + "mat1", 55 | "-o", self.dir_, 56 | "--core_in_dir", self.dir_, 57 | "--core_filter", "CORE_SS.mat1.pkl", 58 | "--input_format", "sm", 59 | "--output_format", "sm" 60 | ]) 61 | 62 | s1 = Space.build(data=self.dir_ + "mat1.sm", 63 | cols=self.dir_ + "mat1.cols", 64 | format="sm") 65 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm", 66 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols", 67 | format="sm") 68 | s3 = Space.build(data=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.sm", 69 | cols=self.dir_ + "PER_SS.mat1.PER_SS.mat1.CORE_SS.mat1.cols", 70 | format="sm") 71 | 72 | self._test_equal_spaces_sparse(s1, s2) 73 | 
self._test_equal_spaces_sparse(s1, s3) 74 | 75 | def test_simple_sparse(self): 76 | 77 | bps.main(["build_peripheral_space.py", 78 | "-l", self.dir_ + "log1.txt", 79 | "-i", self.dir_ + "mat1", 80 | "-o", self.dir_, 81 | "-c", self.dir_ + "CORE_SS.mat1.pkl", 82 | "--input_format", "sm", 83 | "--output_format", "sm" 84 | ]) 85 | 86 | s1 = Space.build(data=self.dir_ + "mat1.sm", 87 | cols=self.dir_ + "mat1.cols", 88 | format="sm") 89 | s2 = Space.build(data=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.sm", 90 | cols=self.dir_ + "PER_SS.mat1.CORE_SS.mat1.cols", 91 | format="sm") 92 | 93 | self._test_equal_spaces_sparse(s1, s2) 94 | 95 | def test_simple_dense(self): 96 | bps.main(["build_peripheral_space.py", 97 | "-l", self.dir_ + "log1.txt", 98 | "-i", self.dir_ + "mat2", 99 | "-o", self.dir_, 100 | "-c", self.dir_ + "CORE_SS.mat2.pkl", 101 | "--input_format", "dm", 102 | "--output_format", "dm" 103 | ]) 104 | s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm") 105 | s2 = Space.build(data=self.dir_ + "PER_SS.mat2.CORE_SS.mat2.dm", format="dm") 106 | 107 | self._test_equal_spaces_dense(s1, s2) 108 | 109 | def test_simple_ops(self): 110 | 111 | bcs.main(["build_core_space.py", 112 | "-l", self.dir_ + "log1.txt", 113 | "-i", self.dir_ + "mat3", 114 | "-w", "raw", 115 | "-s", "top_sum_3,top_length_3,top_sum_4", 116 | "-r", "svd_2,svd_1", 117 | "-o", self.dir_, 118 | "--input_format", "dm", 119 | "--output_format", "dm" 120 | ]) 121 | 122 | core_mats = ["CORE_SS.mat3.raw.top_sum_3.svd_2", 123 | "CORE_SS.mat3.raw.top_sum_3.svd_1", 124 | "CORE_SS.mat3.raw.top_length_3.svd_2", 125 | "CORE_SS.mat3.raw.top_length_3.svd_1", 126 | "CORE_SS.mat3.raw.top_sum_4.svd_2", 127 | "CORE_SS.mat3.raw.top_sum_4.svd_1" 128 | ] 129 | 130 | core_spaces = [Space.build(data=self.dir_ + suffix + ".dm", format="dm") for suffix in core_mats] 131 | 132 | for i, core_mat in enumerate(core_mats): 133 | bps.main(["build_peripheral_space.py", 134 | "-l", self.dir_ + "log1.txt", 135 | "-i", self.dir_ + 
import py
import pytest


@pytest.fixture
def toolkit_dir():
    # Repository root: two levels up from src/unitest/conftest.py.
    return py.path.local(__file__).dirpath().join('..', '..')


@pytest.fixture
def data_dir(toolkit_dir):
    # Read-only unit-test resources shipped with the repository.
    return toolkit_dir.join('resource', 'unittest')


@pytest.fixture
def config_dir(tmpdir):
    # Fresh per-test directory for generated configuration files.
    return tmpdir.mkdir('config')


@pytest.fixture
def pipelines_test_resources(data_dir):
    # Input fixtures used by the pipeline tests.
    return data_dir.join('pipelines_test_resources')


@pytest.fixture
def sim_input(pipelines_test_resources):
    # Path (as str) to the similarity-pipeline input file.
    return str(pipelines_test_resources.join('sim_input.txt'))
class Test(unittest.TestCase):
    """Tests for composes.utils.crossvalidation_utils.get_split_indices."""

    # FIX: method was misspelled "test_get_split_indicec"; renamed so the
    # intent is clear (still discovered by unittest via the test_ prefix).
    def test_get_split_indices(self):
        """Fold count and fold sizes respect the requested split."""
        # (range_, fold, max_len): max_len is the largest allowed fold size.
        test_cases = [(10, 3, 4), (9, 10, 1), (10, 10, 1), (109, 10, 11), (1, 1, 1)]

        for range_, fold, max_len in test_cases:

            indices = get_split_indices(range_, fold)
            # Never more folds than requested.
            self.assertGreaterEqual(fold, len(indices))

            for chunk in indices:
                # Each fold holds at least range_ // fold items, unless
                # there are fewer items than requested folds.
                self.assertTrue(len(chunk) >= range_ // fold or fold >= range_)
                self.assertGreaterEqual(max_len, len(chunk))

        # Remainder items go to the earliest folds: 10 over 3 folds -> 4,3,3.
        indices = get_split_indices(10, 3)
        self.assertEqual(len(indices[0]), 4)
        self.assertEqual(len(indices[1]), 3)
        self.assertEqual(len(indices[2]), 3)

if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.test_get_split_indices']
    unittest.main()
39 | csr_matrix(nparr), 40 | csc_matrix(nparr), 41 | SparseMatrix(nparr)] 42 | 43 | for inmat in test_cases: 44 | outmat = DenseMatrix(inmat) 45 | self.assertIsInstance(outmat.mat, np.matrix) 46 | numpy.testing.assert_array_equal(nparr, np.array(outmat.mat)) 47 | 48 | 49 | def test_add(self): 50 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 51 | (self.matrix_a, self.matrix_b, self.matrix_a.mat) 52 | ] 53 | 54 | 55 | for (term1, term2, expected) in test_cases: 56 | sum_ = term1 + term2 57 | numpy.testing.assert_array_equal(sum_.mat, expected) 58 | self.assertIsInstance(sum_, type(term1)) 59 | 60 | def test_add_raises(self): 61 | test_cases = [(self.matrix_a, self.a), 62 | (self.matrix_a, SparseMatrix(self.a))] 63 | 64 | for (term1, term2) in test_cases: 65 | self.assertRaises(TypeError, term1.__add__, term2) 66 | 67 | def test_div(self): 68 | test_cases = [(self.matrix_a, 2, np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])), 69 | (self.matrix_c, 2, np.mat(self.c)) 70 | ] 71 | 72 | for (term1, term2, expected) in test_cases: 73 | sum_ = term1 / term2 74 | numpy.testing.assert_array_equal(sum_.mat, expected) 75 | self.assertIsInstance(sum_, DenseMatrix) 76 | 77 | def test_div_raises(self): 78 | test_cases = [(self.matrix_a, self.a, TypeError), 79 | (self.matrix_a, SparseMatrix(self.a), TypeError), 80 | (self.matrix_a, "3", TypeError), 81 | (self.matrix_a, 0, ZeroDivisionError) 82 | ] 83 | 84 | for (term1, term2, error_type) in test_cases: 85 | self.assertRaises(error_type, term1.__div__, term2) 86 | 87 | 88 | def test_mul(self): 89 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])), 90 | (self.matrix_d, self.matrix_a, self.matrix_a.mat), 91 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])), 92 | (2, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 93 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]])), 94 | (np.int64(2), self.matrix_a, np.mat([[2,4,6],[8,0,10]])) 95 | ] 96 | 97 | for (term1, term2, expected) in test_cases: 98 | 
sum_ = term1 * term2 99 | numpy.testing.assert_array_equal(sum_.mat, expected) 100 | self.assertIsInstance(sum_, DenseMatrix) 101 | 102 | def test_mul_raises(self): 103 | test_cases = [(self.matrix_a, self.a), 104 | (self.matrix_a, SparseMatrix(self.a)), 105 | (self.matrix_a, "3"), 106 | ("3", self.matrix_a)] 107 | 108 | for (term1, term2) in test_cases: 109 | self.assertRaises(TypeError, term1.__mul__, term2) 110 | 111 | def test_multiply(self): 112 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[1,4,9],[16,0,25]])), 113 | (self.matrix_a, self.matrix_b, np.mat(self.b)) 114 | ] 115 | 116 | for (term1, term2, expected) in test_cases: 117 | mult1 = term1.multiply(term2) 118 | mult2 = term2.multiply(term1) 119 | 120 | numpy.testing.assert_array_equal(mult1.mat, expected) 121 | numpy.testing.assert_array_equal(mult2.mat, expected) 122 | 123 | self.assertIsInstance(mult1, DenseMatrix) 124 | self.assertIsInstance(mult2, DenseMatrix) 125 | 126 | def test_multiply_raises(self): 127 | 128 | test_cases = [(self.matrix_a, self.matrix_d, ValueError), 129 | (self.matrix_a, self.a, TypeError), 130 | (self.matrix_a, SparseMatrix(self.a), TypeError), 131 | ] 132 | 133 | for (term1, term2, error_type) in test_cases: 134 | self.assertRaises(error_type, term1.multiply, term2) 135 | 136 | def test_scale_rows(self): 137 | outcome = np.mat([[1,2,3],[40,0,50]]) 138 | test_cases = [(self.matrix_a, self.e, outcome), 139 | (self.matrix_a, np.mat(self.e).T, outcome), 140 | ] 141 | 142 | for (term1, term2, expected) in test_cases: 143 | term1 = term1.scale_rows(term2) 144 | numpy.testing.assert_array_equal(term1.mat, expected) 145 | 146 | def test_scale_columns(self): 147 | test_cases = [(self.matrix_a, self.f, np.mat([[1,20,300],[4,0,500]]))] 148 | 149 | for (term1, term2, expected) in test_cases: 150 | term1 = term1.scale_columns(term2) 151 | numpy.testing.assert_array_equal(term1.mat, expected) 152 | 153 | 154 | def test_scale_raises(self): 155 | test_cases = [(self.matrix_a, 
self.f, ValueError, self.matrix_a.scale_rows), 156 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns), 157 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows), 158 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns), 159 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows), 160 | ] 161 | for (term1, term2, error_type, function) in test_cases: 162 | self.assertRaises(error_type, function, term2) 163 | 164 | 165 | def test_plog(self): 166 | m = DenseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])) 167 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]]) 168 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]]) 169 | test_cases = [(self.matrix_a.copy(), a_expected), 170 | (m, m_expected) 171 | ] 172 | 173 | for (term, expected) in test_cases: 174 | term.plog() 175 | numpy.testing.assert_array_almost_equal(term.mat, expected, 3) 176 | 177 | if __name__ == "__main__": 178 | #import sys;sys.argv = ['', 'Test.testName'] 179 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/dilation_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 15, 2012 3 | 4 | @author: nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.matrix.dense_matrix import DenseMatrix 9 | #from composes.composition.dilation_1 import DilationModel 10 | from composes.composition.dilation import Dilation 11 | 12 | class Test(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.m11 = DenseMatrix(np.mat([[4],[2]])) 16 | self.m21 = DenseMatrix(np.mat([[3],[6]])) 17 | #self.ph1 = DenseMatrix(np.mat([[5],[10]])) 18 | self.ph1 = DenseMatrix(np.mat([[80],[40]])) 19 | 20 | self.m12 = DenseMatrix(np.mat([[2,0],[3,0]])) 21 | self.m22 = DenseMatrix(np.mat([[3,3],[6,4]])) 22 | #self.ph2 = DenseMatrix(np.mat([[5,2],[10,5]])) 23 | self.ph2 = DenseMatrix(np.mat([[20,8],[90,45]])) 24 | 25 | self.m13 = 
DenseMatrix(np.mat([[4,3],[3,4]])) 26 | self.m23 = DenseMatrix(np.mat([[0,5],[0,5]])) 27 | #self.ph3 = DenseMatrix(np.mat([[12,14],[12,21]])) 28 | self.ph3 = DenseMatrix(np.mat([[300,350],[300,525]])) 29 | 30 | self.m14 = DenseMatrix(np.mat([[4,3],[3,4],[0,0]])) 31 | self.m24 = DenseMatrix(np.mat([[0,5],[0,5],[0,0]])) 32 | #self.ph4 = DenseMatrix(np.mat([[12,14],[12,21],[0,0]])) 33 | self.ph4 = DenseMatrix(np.mat([[300,350],[300,525],[0,0]])) 34 | 35 | self.m15 = DenseMatrix(np.mat([[2,0],[0,0],[3,0]])) 36 | self.m25 = DenseMatrix(np.mat([[3,3],[0,0],[6,4]])) 37 | #self.ph5 = DenseMatrix(np.mat([[5,2],[0,0],[10,5]])) 38 | self.ph5 = DenseMatrix(np.mat([[20,8],[0,0],[90,45]])) 39 | 40 | self.m16 = DenseMatrix(np.mat([[0,0],[0,0]])) 41 | self.m26 = DenseMatrix(np.mat([[0,0],[0,0]])) 42 | self.ph6 = DenseMatrix(np.mat([[0,0],[0,0]])) 43 | 44 | self.m17 = DenseMatrix(np.mat([[2,0],[3,0]])) 45 | self.m27 = DenseMatrix(np.mat([[0,1],[0,2]])) 46 | #self.ph7 = DenseMatrix(np.mat([[4,5],[5,4]])) 47 | self.ph7 = DenseMatrix(np.mat([[16,20],[45,36]])) 48 | 49 | def test_train_exact(self): 50 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)), 51 | (self.m12, self.m22, self.ph2, 5 / (3.0)), 52 | (self.m13, self.m23, self.ph3, 6), 53 | (self.m14, self.m24, self.ph4, 6), 54 | (self.m15, self.m25, self.ph5, 5 / (3.0)), 55 | (self.m16, self.m26, self.ph6, 2), 56 | (self.m17, self.m27, self.ph7, 2) 57 | ] 58 | 59 | for arg1, arg2, phrase, lambda_ in test_cases: 60 | m = Dilation() 61 | m._solve(arg1, arg2, phrase) 62 | self.assertAlmostEqual(m._lambda, lambda_) 63 | # 64 | def test_compose_exact(self): 65 | 66 | test_cases = [(self.m11, self.m21, self.ph1, 5 / (3.0)), 67 | (self.m13, self.m23, self.ph3, 6), 68 | (self.m14, self.m24, self.ph4, 6) 69 | ] 70 | for arg1, arg2, phrase, lambda_ in test_cases: 71 | 72 | m = Dilation() 73 | m._solve(arg1, arg2, phrase) 74 | res = m._compose(arg1, arg2) 75 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2) 76 | 77 | m = 
Dilation(lambda_) 78 | res = m._compose(arg1, arg2) 79 | np.testing.assert_array_almost_equal(res.mat, phrase.mat, 2) 80 | 81 | 82 | def test_train_random(self): 83 | test_cases = [1.0,2.0,3.0] 84 | rows = 4 85 | cols = 3 86 | m1 = np.random.rand(rows,cols) 87 | m2 = np.random.rand(rows,cols) 88 | 89 | 90 | for lambda_ in test_cases: 91 | m = Dilation(lambda_) 92 | result_p = m._compose(DenseMatrix(m1), DenseMatrix(m2)) 93 | 94 | m = Dilation() 95 | m._solve(DenseMatrix(m1),DenseMatrix(m2),result_p) 96 | self.assertAlmostEqual(lambda_, m._lambda) 97 | 98 | 99 | if __name__ == "__main__": 100 | #import sys;sys.argv = ['', 'Test.testName'] 101 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/dimensionality_reduction_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 28, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.transformation.dim_reduction.svd import Svd 9 | from composes.transformation.dim_reduction.nmf import Nmf 10 | from composes.matrix.linalg import Linalg 11 | from composes.matrix.dense_matrix import DenseMatrix 12 | from composes.matrix.sparse_matrix import SparseMatrix 13 | 14 | class DimReductionTest(unittest.TestCase): 15 | 16 | 17 | def setUp(self): 18 | pass 19 | 20 | 21 | def tearDown(self): 22 | pass 23 | 24 | def test_nmf(self): 25 | test_cases = [np.mat([[1,2,3],[2,4,6],[4,17,13]], dtype = np.double), 26 | np.mat([[1,0,0]], dtype = np.double)] 27 | 28 | for in_mat in test_cases: 29 | red = Nmf(2) 30 | d_mat = DenseMatrix(in_mat) 31 | #wd_init, hd_init = red.random_init(d_mat) 32 | wd_init, hd_init = red.v_col_init(d_mat) 33 | 34 | s_mat = SparseMatrix(in_mat) 35 | ws_init = SparseMatrix(wd_init) 36 | hs_init = SparseMatrix(hd_init) 37 | 38 | wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init) 39 | ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, 
hs_init) 40 | 41 | #TESTED IT AGAINST MATLAB IMPLEMENTATION - ALL GOOD 42 | #print wd_mat.mat 43 | #print hd_mat.mat 44 | #print ws_mat.mat.todense() 45 | #print hs_mat.mat.todense() 46 | print "V:", in_mat 47 | print "WH:", (ws_mat*hs_mat).mat.todense() 48 | 49 | np.testing.assert_array_almost_equal(wd_mat.mat, 50 | ws_mat.mat.todense(), 2) 51 | np.testing.assert_array_almost_equal(hd_mat.mat, 52 | hs_mat.mat.todense(), 2) 53 | 54 | def test_svd(self): 55 | test_cases = [(DenseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])), 56 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0], 57 | [ 4.38544220e+00, 6.06349536e+00, 0], 58 | [ 6.76369708e+02, -4.91431927e-02, 0]]), 59 | np.mat([[0.0059,0.9979,0.0636], 60 | [0.3255,-0.0621,0.9434], 61 | [0.945,0.015,-0.325]]).transpose())] 62 | 63 | 64 | 65 | for x, us_expected, v_expected in test_cases: 66 | 67 | svd_red = Svd(2) 68 | us, transmat = svd_red.apply(x) 69 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 70 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 71 | 72 | svd_red = Svd(3) 73 | us, transmat = svd_red.apply(x) 74 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 75 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 76 | 77 | svd_red = Svd(6) 78 | us, transmat = svd_red.apply(x) 79 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:2], 2) 80 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:2], 2) 81 | 82 | svd_red = Svd(1) 83 | us, transmat = svd_red.apply(x) 84 | np.testing.assert_array_almost_equal(us.mat, us_expected[:,0:1], 2) 85 | np.testing.assert_array_almost_equal(transmat.mat, v_expected[:,0:1], 2) 86 | 87 | 88 | test_cases = [(SparseMatrix(np.mat([[1,2,3],[2,4,6],[4,675,43]])), 89 | np.mat([[ 2.19272110e+00, 3.03174768e+00, 0], 90 | [ 4.38544220e+00, 6.06349536e+00, 0], 91 | [ 6.76369708e+02, -4.91431927e-02, 0]]), 92 | np.mat([[0.0059,0.9979,0.0636], 93 | [0.3255,-0.0621,0.9434], 94 | 
class Test(unittest.TestCase):
    """Smoke tests for the evaluate_similarities pipeline entry point."""

    def setUp(self):
        # All input fixtures live in the shared pipeline test-resource folder.
        self.dir_ = data_dir + "pipelines_test_resources/"


    def tearDown(self):
        pass


    def test_simple(self):

        # Single-file mode: correlate columns 1 (gold) and 3 (prediction)
        # of pred1.txt with both Pearson and Spearman.
        es.main(["evaluate_similarities.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "pred1.txt",
                 "-m", "pearson,spearman",
                 "-c", "1,3",
                 ])

        # Batch mode: --in_dir/--filter evaluate every file in the directory
        # whose name matches "pred" (the -i argument is ignored in this mode).
        es.main(["evaluate_similarities.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "pred1.txt",
                 "--in_dir", self.dir_,
                 "--filter", "pred",
                 "-m", "pearson,spearman",
                 "-c", "1,3",
                 ])

if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
class Test(unittest.TestCase):
    """Behaviour of composes.utils.matrix_utils.resolve_type_conflict."""

    def test_resolve_type_conflict(self):
        # One 1x2 matrix in every representation the helper must accept.
        raw = np.mat([1, 2])
        dense = DenseMatrix(raw)
        sparse = SparseMatrix(raw)

        # Mixed wrapper types, raw numpy/scipy inputs, and the empty list.
        two = resolve_type_conflict([dense, sparse], DenseMatrix)
        three = resolve_type_conflict([sparse, dense, dense], DenseMatrix)
        nothing = resolve_type_conflict([], DenseMatrix)
        raw_inputs = resolve_type_conflict([raw, csr_matrix(raw)], DenseMatrix)

        # Every element comes back converted to the requested matrix type.
        for converted in two + three + raw_inputs:
            self.assertIsInstance(converted, DenseMatrix)

        # An empty input list maps to an empty output list.
        self.assertListEqual([], nothing)


if __name__ == "__main__":
    unittest.main()
-------------------------------------------------------------------------------- /src/unitest/model_export_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 16, 2012 3 | 4 | @author: nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from unitest import data_dir 9 | from composes.matrix.dense_matrix import DenseMatrix 10 | from composes.semantic_space.space import Space 11 | 12 | from composes.composition.weighted_additive import WeightedAdditive 13 | from composes.composition.full_additive import FullAdditive 14 | from composes.composition.dilation import Dilation 15 | from composes.composition.lexical_function import LexicalFunction 16 | from composes.exception.illegal_state_error import IllegalStateError 17 | 18 | class ModelExportingTest(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.prefix = data_dir + "output/model" 22 | def test_weighted_additive(self): 23 | 24 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 25 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 26 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 27 | self.row = ["a", "b"] 28 | self.ft = ["f1","f2"] 29 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 30 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 31 | m = WeightedAdditive() 32 | m.export(self.prefix + ".add1") 33 | m.train([("a","a","a_a")], self.space1, self.space2) 34 | m.export(self.prefix + ".add2") 35 | 36 | def test_full_additive(self): 37 | 38 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 39 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 40 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 41 | self.row = ["a", "b"] 42 | self.ft = ["f1","f2"] 43 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 44 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 45 | m = FullAdditive() 46 | self.assertRaises(IllegalStateError, m.export,self.prefix + ".full1") 47 | 
m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2) 48 | 49 | m.export(self.prefix + ".full2") 50 | 51 | def test_dilation(self): 52 | 53 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 54 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 55 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 56 | self.row = ["a", "b"] 57 | self.ft = ["f1","f2"] 58 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 59 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 60 | m = Dilation() 61 | m.export(self.prefix + ".dil1") 62 | m.train([("a","b","a_b")], self.space1, self.space2) 63 | m.export(self.prefix + ".dil2") 64 | 65 | def test_lexical_function(self): 66 | 67 | self.m12 = DenseMatrix(np.mat([[3,1],[9,2]])) 68 | self.m22 = DenseMatrix(np.mat([[4,3],[2,1]])) 69 | self.ph2 = DenseMatrix(np.mat([[18,11],[24,7]])) 70 | self.row = ["a", "b"] 71 | self.ft = ["f1","f2"] 72 | self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft) 73 | self.space2 = Space(DenseMatrix(self.ph2), ["a_a","a_b"], self.ft) 74 | m = LexicalFunction() 75 | m._MIN_SAMPLES = 1 76 | self.assertRaises(IllegalStateError, m.export, self.prefix + ".lf1") 77 | m.train([("a","b","a_b"),("a","a","a_a")], self.space1, self.space2) 78 | m.export(self.prefix + ".lf2") 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | #import sys;sys.argv = ['', 'Test.test_weighted_additive'] 84 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/neighbour_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 19, 2012 3 | 4 | @author: thenghia.pham 5 | ''' 6 | import unittest 7 | from unitest import data_dir 8 | from unitest import toolkit_dir 9 | import pipelines.compute_neighbours as find_neighbours 10 | from pipelines import build_core_space as bcs 11 | 12 | import pytest 13 | 14 | 15 | def read_neighbours_list(file_name): 16 | result = [] 17 | word = 
def read_neighbours_list(file_name):
    """Parse a neighbours file into [(word, [(neighbour, score), ...]), ...].

    The file is a sequence of blocks, optionally separated by blank lines:
    a line with a single token introduces a target word, and every
    following multi-token line contributes one (neighbour, score) pair
    (both kept as strings) for that word.
    """
    result = []
    word = None
    neighbours = []
    with open(file_name) as f:
        for line in f:
            elements = line.strip().split()
            if not elements:
                # Blank separator lines carry no information.
                continue
            if len(elements) == 1:
                # New target word: flush the block collected so far.
                if word is not None:
                    result.append((word, neighbours))
                    neighbours = []
                # BUG FIX: the original only assigned `word` in an `else`
                # branch, i.e. for the very first block; every later block
                # was appended under the *first* word and the later words
                # were lost. The word must be updated for every new block.
                word = elements[0]
            else:
                neighbours.append((elements[0], elements[1]))
    # Flush the final block (no trailing single-token line follows it).
    if word is not None:
        result.append((word, neighbours))
    return result
find_neighbours.main(["compute_neighbours.py", 88 | "-m", "euclidean", 89 | "-n", "2", 90 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_), 91 | "%sconfig/neighbours_config.cfg" %self.dir_ 92 | ]) 93 | find_neighbours.main(["compute_neighbours.py", 94 | "-m", "euclidean", 95 | "-i", self.dir_ + "pipelines_test_resources/neighbours_input.txt", 96 | "-n", "2", 97 | "--space", "%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl,%spipelines_test_resources/CORE_SS.mat3.raw.top_sum_3.svd_2.pkl" %(self.dir_,self.dir_), 98 | "-o" "/home/georgianadinu/work/FAKE_PATH" 99 | ]) 100 | #neighbours_list = read_neighbours_list(self.dir_ + "NEIGHBOURS.neighbours_input.txt.euclidean") 101 | #print len(neighbours_list) 102 | 103 | 104 | def tearDown(self): 105 | pass 106 | 107 | 108 | def testName(self): 109 | pass 110 | 111 | 112 | if __name__ == "__main__": 113 | #import sys;sys.argv = ['', 'Test.testName'] 114 | unittest.main() 115 | -------------------------------------------------------------------------------- /src/unitest/operation_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 9 | from composes.transformation.dim_reduction.svd import Svd 10 | from composes.transformation.dim_reduction.nmf import Nmf 11 | from composes.semantic_space.operation import ScalingOperation 12 | from composes.semantic_space.operation import DimensionalityReductionOperation 13 | from composes.matrix.dense_matrix import DenseMatrix 14 | from composes.matrix.sparse_matrix import SparseMatrix 15 | from composes.exception.illegal_state_error import IllegalStateError 16 | 17 | 18 | class Test(unittest.TestCase): 19 | 20 | 21 | 
def setUp(self): 22 | self.m1 = np.array([[1,2,3]]) 23 | self.m2 = np.array([[3]]) 24 | self.m3 = np.array([[4,2,6]]) 25 | self.m4 = np.array([[2]]) 26 | 27 | self.x = np.mat([[1,2,3],[2,4,6],[4,675,43]]) 28 | self.us = np.mat([[ 2.19272110e+00, 3.03174768e+00], 29 | [ 4.38544220e+00, 6.06349536e+00], 30 | [ 6.76369708e+02, -4.91431927e-02]]) 31 | 32 | self.xnmf = np.mat([[1,2,3],[2,4,6],[4,17,13]]) 33 | 34 | def tearDown(self): 35 | pass 36 | 37 | 38 | def test_apply_dimensionality_reduction(self): 39 | 40 | test_cases =[(self.x, self.us)] 41 | red = Svd(2) 42 | 43 | for in_mat, expected_us_mat in test_cases: 44 | op = DimensionalityReductionOperation(red) 45 | tmp_mat = in_mat.copy() 46 | 47 | out_us_mat = op.apply(DenseMatrix(in_mat)).mat 48 | np.testing.assert_array_almost_equal(expected_us_mat, out_us_mat, 2) 49 | 50 | np.testing.assert_array_equal(in_mat, tmp_mat) 51 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat)) 52 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat)) 53 | 54 | 55 | def test_project_dimensionality_reduction(self): 56 | 57 | test_cases =[(self.x, self.us)] 58 | red = Svd(2) 59 | 60 | for in_mat, expected_us_mat in test_cases: 61 | op = DimensionalityReductionOperation(red) 62 | tmp_mat = in_mat.copy() 63 | 64 | self.assertRaises(IllegalStateError, op.project, DenseMatrix(in_mat)) 65 | 66 | op.apply(DenseMatrix(in_mat)).mat 67 | out_proj_mat = op.project(DenseMatrix(in_mat)).mat 68 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat, 2) 69 | 70 | np.testing.assert_array_equal(in_mat, tmp_mat) 71 | 72 | self.assertRaises(IllegalStateError, op.apply, SparseMatrix(in_mat)) 73 | 74 | out_proj_mat2 = op.project(DenseMatrix(in_mat)).mat 75 | np.testing.assert_array_almost_equal(expected_us_mat, out_proj_mat2, 2) 76 | 77 | def test_project_dimensionality_reduction_nmf(self): 78 | 79 | test_cases = [self.xnmf] 80 | red = Nmf(2) 81 | 82 | for in_mat in test_cases: 83 | d_in_mat = 
DenseMatrix(in_mat) 84 | op = DimensionalityReductionOperation(red) 85 | tmp_mat = in_mat.copy() 86 | 87 | self.assertRaises(IllegalStateError, op.project, d_in_mat) 88 | 89 | out_core_mat = op.apply(d_in_mat).mat 90 | out_proj_mat = op.project(d_in_mat).mat 91 | np.testing.assert_array_almost_equal(out_proj_mat, out_core_mat, 5) 92 | 93 | np.testing.assert_array_equal(in_mat, tmp_mat) 94 | 95 | self.assertRaises(IllegalStateError, op.apply, d_in_mat) 96 | 97 | out_proj_mat2 = op.project(d_in_mat).mat 98 | np.testing.assert_array_almost_equal(out_proj_mat2, out_core_mat, 5) 99 | 100 | 101 | def test_apply_weighting_operation(self): 102 | test_cases = [(self.m1, np.array([[0,0,0]])), 103 | (self.m2, np.array([[0]]))] 104 | w = PpmiWeighting() 105 | for in_mat, expected_mat in test_cases: 106 | op = ScalingOperation(w) 107 | tmp_mat = in_mat.copy() 108 | out_mat = op.apply(DenseMatrix(in_mat)).mat 109 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 110 | np.testing.assert_array_equal(in_mat, tmp_mat) 111 | self.assertRaises(IllegalStateError, op.apply, DenseMatrix(in_mat)) 112 | 113 | def test_project_weighting_operation(self): 114 | test_cases = [(self.m1, self.m3, 115 | np.array([[0.69314718,0,0]])), 116 | (self.m2, self.m4, np.array([[0]]))] 117 | w = PpmiWeighting() 118 | for (core_mat, per_mat, expected_mat) in test_cases: 119 | op = ScalingOperation(w) 120 | tmp_mat = per_mat.copy() 121 | 122 | self.assertRaises(IllegalStateError, op.project, 123 | DenseMatrix(per_mat)) 124 | 125 | op.apply(DenseMatrix(core_mat)) 126 | out_mat = op.project(DenseMatrix(per_mat)).mat 127 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 128 | np.testing.assert_array_equal(per_mat, tmp_mat) 129 | 130 | out_mat = op.project(DenseMatrix(per_mat)).mat 131 | np.testing.assert_array_almost_equal(expected_mat, out_mat, 7) 132 | 133 | if __name__ == "__main__": 134 | #import sys;sys.argv = ['', 'Test.testName'] 135 | unittest.main() 
-------------------------------------------------------------------------------- /src/unitest/regression_learner_utils_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 9, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from composes.utils.regression_learner import RidgeRegressionLearner 9 | from composes.utils.regression_learner import LstsqRegressionLearner 10 | from composes.matrix.dense_matrix import DenseMatrix 11 | from composes.utils.matrix_utils import padd_matrix 12 | 13 | class Test(unittest.TestCase): 14 | 15 | 16 | def test_trivial_crossvalidation(self): 17 | 18 | for i in range(1, 10): 19 | m_a = DenseMatrix(np.mat(np.random.random((i + 1,4)))) 20 | m_b = DenseMatrix(np.mat(np.random.random((i + 1,4)))) 21 | tmp_a = m_a.mat.copy() 22 | tmp_b = m_b.mat.copy() 23 | 24 | learner = RidgeRegressionLearner(param_range=[3], intercept=False) 25 | solution = learner.train(m_a, m_b) 26 | 27 | learner2 = RidgeRegressionLearner(param = 3, intercept=False) 28 | solution2 = learner2.train(m_a, m_b) 29 | 30 | np.testing.assert_array_equal(tmp_a, m_a.mat) 31 | np.testing.assert_array_equal(tmp_b, m_b.mat) 32 | np.testing.assert_array_equal(solution.mat, solution2.mat) 33 | 34 | learner = RidgeRegressionLearner(param_range=[3], intercept=False) 35 | solution = learner.train(m_a, m_b) 36 | 37 | np.testing.assert_array_equal(tmp_a, m_a.mat) 38 | np.testing.assert_array_equal(tmp_b, m_b.mat) 39 | np.testing.assert_array_equal(solution.mat, solution2.mat) 40 | 41 | learner = RidgeRegressionLearner(param_range=[0], intercept=False) 42 | solution = learner.train(m_a, m_b) 43 | 44 | learner2 = LstsqRegressionLearner(intercept=False) 45 | solution2 = learner2.train(m_a, m_b) 46 | 47 | np.testing.assert_array_almost_equal(solution.mat, solution2.mat, 3) 48 | 49 | 50 | def test_crossvalidation(self): 51 | 52 | a = DenseMatrix(np.matrix([[1, 1],[2, 3],[4, 
6]])) 53 | b = DenseMatrix(np.matrix([[12, 15, 18],[21, 27, 33],[35, 46, 57]])) 54 | res = DenseMatrix(np.matrix([[1, 2, 3],[4, 5, 6],[7, 8, 9]])) 55 | 56 | learner = RidgeRegressionLearner(intercept=True, param_range=[0]) 57 | learner2 = LstsqRegressionLearner(intercept=False) 58 | 59 | res1 = learner2.train(a, b) 60 | res2 = learner.train(a, b) 61 | 62 | np.testing.assert_array_almost_equal(res2.mat[:-1,:], res[0:2,:].mat, 6) 63 | np.testing.assert_array_almost_equal(res2.mat[-1,:], res[2:3,:].mat, 6) 64 | 65 | new_a = padd_matrix(a, 1) 66 | self.assertGreater(((a * res1) - b).norm(), ((new_a * res2) - b).norm()) 67 | 68 | 69 | if __name__ == "__main__": 70 | #import sys;sys.argv = ['', 'Test.test_trivial_cases'] 71 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/sparse_matrix_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 18, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | import numpy.testing 9 | from scipy.sparse import csr_matrix 10 | from scipy.sparse import csc_matrix 11 | from scipy.sparse.sputils import isintlike 12 | from composes.matrix.sparse_matrix import SparseMatrix 13 | from composes.matrix.dense_matrix import DenseMatrix 14 | 15 | 16 | 17 | class TestSparseMatrix(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.a = np.array([[1,2,3],[4,0,5]]) 21 | self.b = np.array([[0,0,0],[0,0,0]]) 22 | 23 | self.c = np.array([[0,0],[0,0],[0,0]]) 24 | self.d = np.array([[1,0],[0,1]]) 25 | self.e = np.array([1,10]) 26 | self.f = np.array([1,10,100]) 27 | 28 | self.matrix_a = SparseMatrix(self.a) 29 | self.matrix_b = SparseMatrix(self.b) 30 | 31 | self.matrix_c = SparseMatrix(self.c) 32 | self.matrix_d = SparseMatrix(self.d) 33 | 34 | 35 | def tearDown(self): 36 | pass 37 | 38 | def test_reshape(self): 39 | 40 | test_cases = [(self.matrix_a, (1,6), 
self.a.reshape((1,6))), 41 | (self.matrix_a, (3,2), self.a.reshape((3,2))), 42 | (self.matrix_b, (1,6), self.b.reshape((1,6))), 43 | (self.matrix_b, (6,1), self.b.reshape((6,1))), 44 | (self.matrix_b, (2,3), self.b.reshape((2,3))), 45 | ] 46 | 47 | for mat, shape, expected in test_cases: 48 | mat.reshape(shape) 49 | np.testing.assert_array_equal(mat.mat.todense(), expected) 50 | self.assertTupleEqual(shape, mat.shape) 51 | 52 | 53 | def test_reshape_raises(self): 54 | 55 | test_cases = [(3,0), (3,3), 3, (3,3,3), ("3","5"), (2,"4")] 56 | 57 | for shape in test_cases: 58 | self.assertRaises(ValueError, self.matrix_a.reshape, shape) 59 | 60 | 61 | def test_init(self): 62 | nparr = self.a 63 | test_cases = [nparr, 64 | np.mat(nparr), 65 | csr_matrix(nparr), 66 | csc_matrix(nparr), 67 | DenseMatrix(nparr)] 68 | 69 | for inmat in test_cases: 70 | outmat = SparseMatrix(inmat) 71 | self.assertIsInstance(outmat.mat, csr_matrix) 72 | numpy.testing.assert_array_equal(nparr, 73 | np.array(outmat.mat.todense())) 74 | 75 | def test_add(self): 76 | test_cases = [(self.matrix_a, self.matrix_a, np.mat([[2,4,6],[8,0,10]])), 77 | (self.matrix_a, self.matrix_b, np.mat(self.a)) 78 | ] 79 | 80 | for (term1, term2, expected) in test_cases: 81 | sum_ = term1 + term2 82 | numpy.testing.assert_array_equal(sum_.mat.todense(), expected) 83 | self.assertIsInstance(sum_, type(term1)) 84 | 85 | def test_add_raises(self): 86 | test_cases = [(self.matrix_a, self.a), 87 | (self.matrix_a, DenseMatrix(self.a))] 88 | 89 | for (term1, term2) in test_cases: 90 | self.assertRaises(TypeError, term1.__add__, term2) 91 | 92 | def test_mul(self): 93 | test_cases = [(self.matrix_a, self.matrix_c, np.mat([[0,0],[0,0]])), 94 | (self.matrix_d, self.matrix_a, self.matrix_a.mat.todense()), 95 | (self.matrix_a, 2, np.mat([[2,4,6],[8,0,10]])), 96 | (self.matrix_a, np.int64(2), np.mat([[2,4,6],[8,0,10]])) 97 | ] 98 | 99 | for (term1, term2, expected) in test_cases: 100 | sum_ = term1 * term2 101 | 
numpy.testing.assert_array_equal(sum_.mat.todense(), expected) 102 | self.assertIsInstance(sum_, type(term1)) 103 | 104 | def test_mul_raises(self): 105 | test_cases = [(self.matrix_a, self.a), 106 | (self.matrix_a, DenseMatrix(self.a)), 107 | (self.matrix_a, "3")] 108 | 109 | for (term1, term2) in test_cases: 110 | self.assertRaises(TypeError, term1.__mul__, term2) 111 | 112 | def test_get_item(self): 113 | 114 | out_mat = SparseMatrix(self.a)[0,:] 115 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:])) 116 | 117 | out_int = SparseMatrix(self.a)[0,1] 118 | self.assertEqual(out_int, 2) 119 | 120 | out_mat = SparseMatrix(self.a)[0,1:2] 121 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,1:2])) 122 | 123 | out_mat = SparseMatrix(self.a)[0] 124 | np.testing.assert_array_equal(out_mat.mat.todense(),np.mat(self.a[0,:])) 125 | 126 | 127 | def test_scale_rows(self): 128 | outcome = np.mat([[1,2,3],[40,0,50]]) 129 | test_cases = [(self.matrix_a.copy(), self.e, outcome), 130 | (self.matrix_a.copy(), np.mat(self.e).T, outcome), 131 | ] 132 | 133 | for (term1, term2, expected) in test_cases: 134 | term1 = term1.scale_rows(term2) 135 | numpy.testing.assert_array_equal(term1.mat.todense(), expected) 136 | 137 | def test_scale_columns(self): 138 | test_cases = [(self.matrix_a.copy(), self.f, np.mat([[1,20,300],[4,0,500]]))] 139 | 140 | for (term1, term2, expected) in test_cases: 141 | term1 = term1.scale_columns(term2) 142 | numpy.testing.assert_array_equal(term1.mat.todense(), expected) 143 | self.assertIsInstance(term1.mat, csr_matrix) 144 | 145 | def test_scale_raises(self): 146 | test_cases = [(self.matrix_a, self.f, ValueError, self.matrix_a.scale_rows), 147 | (self.matrix_a, self.e, ValueError, self.matrix_a.scale_columns), 148 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_rows), 149 | (self.matrix_a, self.b, ValueError, self.matrix_a.scale_columns), 150 | (self.matrix_a, "3", TypeError, self.matrix_a.scale_rows), 
151 | ] 152 | for (term1, term2, error_type, function) in test_cases: 153 | self.assertRaises(error_type, function, term2) 154 | 155 | def test_plog(self): 156 | m = SparseMatrix(np.mat([[0.5,1.0,1.5],[2.0,0.0,2.5]])) 157 | m_expected = np.mat([[0.,0.,0.4054],[ 0.6931,0.,0.9162]]) 158 | a_expected = np.mat([[0.,0.6931,1.0986],[1.3862,0.,1.6094]]) 159 | test_cases = [(self.matrix_a.copy(), a_expected), 160 | (m, m_expected) 161 | ] 162 | 163 | for (term, expected) in test_cases: 164 | term.plog() 165 | numpy.testing.assert_array_almost_equal(term.mat.todense(), expected, 3) 166 | 167 | if __name__ == "__main__": 168 | #import sys;sys.argv = ['', 'Test.testName'] 169 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/tc_pipeline_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 19, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | from unitest import data_dir 9 | import pipelines.train_composition as tc 10 | import pipelines.build_core_space as bcs 11 | from composes.utils import io_utils 12 | from composes.semantic_space.space import Space 13 | 14 | class Test(unittest.TestCase): 15 | 16 | 17 | def setUp(self): 18 | self.dir_ = data_dir + "pipelines_test_resources/" 19 | 20 | #use as a conversion tool, creates the files we want 21 | bcs.main(["build_core_space.py", 22 | "-l", self.dir_ + "log1.txt", 23 | "-i", self.dir_ + "N_mat", 24 | "-o", self.dir_, 25 | "--input_format", "dm", 26 | ]) 27 | 28 | bcs.main(["build_core_space.py", 29 | "-l", self.dir_ + "log1.txt", 30 | "-i", self.dir_ + "AN_mat", 31 | "-o", self.dir_, 32 | "--input_format", "dm", 33 | ]) 34 | 35 | def tearDown(self): 36 | pass 37 | 38 | def _test_equal_spaces_structs(self, sp, new_sp): 39 | self.assertListEqual(sp.id2row, new_sp.id2row) 40 | self.assertListEqual(sp.id2column, new_sp.id2column) 41 | 
    def test_simple_lstsq_no_inter(self):
        """Train a lexical_func model without an intercept and check that
        plain lstsq and ridge at lambda=0 (crossvalidation off) both learn
        the flattened 2x2 identity as the function matrix for "big"."""
        tc.main(["train_composition.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "an_train_data.txt",
                 "-o", self.dir_,
                 "-m", "lexical_func",
                 "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
                 "-a", self.dir_ + "CORE_SS.N_mat.pkl",
                 "-r", "lstsq",
                 "--intercept", "False",
                 "--export_params", "True"
                 ])

        # The trained model is pickled under a name derived from the input file.
        trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space = trained.function_space
        # Without an intercept the learned function is the flattened 2x2 identity.
        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([1,0,0,1]), 10)
        self.assertTupleEqual(new_space.element_shape, (2,2))
        self.assertListEqual(new_space.id2row, ["big"])
        self.assertListEqual(new_space.id2column, [])

        # --export_params also dumps the parameters in dense (dm) format;
        # reloading them must reproduce the function space exactly.
        a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                              format="dm")

        self._test_equal_spaces_dense(a_space, new_space)

        # Same training run, but with ridge regression at lambda=0 and
        # crossvalidation disabled: must coincide with the lstsq solution.
        tc.main(["train_composition.py",
                 "-l", self.dir_ + "log1.txt",
                 "-i", self.dir_ + "an_train_data.txt",
                 "-o", self.dir_,
                 "-m", "lexical_func",
                 "-p", self.dir_ + "CORE_SS.AN_mat.pkl",
                 "-a", self.dir_ + "CORE_SS.N_mat.pkl",
                 "-r", "ridge",
                 "--lambda", "0",
                 "--crossvalidation", "False",
                 "--intercept", "False",
                 "--export_params", "True"
                 ])

        trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        new_space2 = trained.function_space
        np.testing.assert_array_almost_equal(new_space2.cooccurrence_matrix.mat,
                                             np.mat([1,0,0,1]), 10)
        self.assertTupleEqual(new_space2.element_shape, (2,2))
        self.assertListEqual(new_space2.id2row, ["big"])
        self.assertListEqual(new_space2.id2column, [])

        a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                              format="dm")

        self._test_equal_spaces_dense(a_space, new_space2)
-------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 26, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | from composes.utils.space_utils import list2dict 8 | #from composes.utils.py_matrix_utils import coo 9 | 10 | class UtilsTest(unittest.TestCase): 11 | 12 | 13 | def test_list2dict(self): 14 | test_cases = [(["a","v","d"], {"a":0, "v":1, "d":2}), ([], {})] 15 | 16 | for list_, expected in test_cases: 17 | outcome = list2dict(list_) 18 | self.assertDictEqual(outcome, expected) 19 | 20 | self.assertRaises(ValueError, list2dict, ["a","v","a"]) 21 | 22 | #def test_coo(self): 23 | # coo() 24 | 25 | if __name__ == "__main__": 26 | #import sys;sys.argv = ['', 'Test.testName'] 27 | unittest.main() -------------------------------------------------------------------------------- /src/unitest/weighting_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 20, 2012 3 | 4 | @author: Georgiana Dinu, Pham The Nghia 5 | ''' 6 | import unittest 7 | import numpy as np 8 | import numpy.testing 9 | from composes.matrix.dense_matrix import DenseMatrix 10 | from composes.matrix.sparse_matrix import SparseMatrix 11 | from composes.transformation.scaling.epmi_weighting import EpmiWeighting 12 | from composes.transformation.scaling.plog_weighting import PlogWeighting 13 | from composes.transformation.scaling.ppmi_weighting import PpmiWeighting 14 | from composes.transformation.scaling.plmi_weighting import PlmiWeighting 15 | from composes.transformation.scaling.row_normalization import RowNormalization 16 | from composes.transformation.scaling.normalization import Normalization 17 | 18 | class Test(unittest.TestCase): 19 | 20 | 21 | def setUp(self): 22 | self.a = np.array([[1,2,3],[4,0,5]]) 23 | self.b = np.array([[1,2,3]]) 24 | 25 | self.c = np.array([[0,0],[0,0],[0,0]]) 26 | self.d = np.array([[1,-1],[0,1]]) 27 | 28 | self.e = 
np.array([[1,2,3],[1,0,0]]) 29 | self.f = np.array([1,10,100]) 30 | 31 | 32 | 33 | def tearDown(self): 34 | pass 35 | 36 | 37 | def single_case_test(self, matrix_, expected, w): 38 | 39 | matrix_copy = matrix_.copy() 40 | dm = DenseMatrix(matrix_) 41 | sm = SparseMatrix(matrix_) 42 | 43 | out1 = w.apply(dm) 44 | out2 = w.apply(sm) 45 | 46 | numpy.testing.assert_array_almost_equal(out1.mat, expected, 7) 47 | numpy.testing.assert_array_almost_equal(out2.mat.todense(), expected, 7) 48 | 49 | numpy.testing.assert_array_equal(dm.mat, matrix_copy) 50 | numpy.testing.assert_array_equal(matrix_, matrix_copy) 51 | numpy.testing.assert_array_equal(sm.mat.todense(), matrix_copy) 52 | 53 | def single_case_raises_test(self, matrix_, error_type, w): 54 | dm = DenseMatrix(matrix_) 55 | sm = SparseMatrix(matrix_) 56 | 57 | self.assertRaises(error_type, w.apply, dm) 58 | self.assertRaises(error_type, w.apply, sm) 59 | 60 | 61 | def test_epmi(self): 62 | w = EpmiWeighting() 63 | test_cases = [(self.b, np.mat([[1,1,1]])), 64 | (self.c, self.c) 65 | ] 66 | for matrix_, expected in test_cases: 67 | self.single_case_test(matrix_, expected, w) 68 | 69 | def test_plog(self): 70 | w = PlogWeighting() 71 | test_cases = [(np.mat([[1,1,1]]), np.mat([[0,0,0]])), 72 | (self.c, self.c) 73 | ] 74 | for matrix_, expected in test_cases: 75 | self.single_case_test(matrix_, expected, w) 76 | 77 | def test_ppmi(self): 78 | w = PpmiWeighting() 79 | test_cases = [(self.b, np.mat([[0,0,0]])), 80 | (self.c, self.c) 81 | ] 82 | 83 | for matrix_, expected in test_cases: 84 | self.single_case_test(matrix_, expected, w) 85 | 86 | 87 | def test_plmi(self): 88 | w = PlmiWeighting() 89 | test_cases = [(self.b, np.mat([[0,0,0]])), 90 | (self.c, self.c), 91 | (self.e, np.mat([[0.,0.30830136,0.46245204], 92 | [1.25276297,0.,0.]])) 93 | ] 94 | 95 | for matrix_, expected in test_cases: 96 | self.single_case_test(matrix_, expected, w) 97 | 98 | def test_row_norm(self): 99 | w = RowNormalization() 100 | test_cases = 
    def test_norm(self):
        """Normalization rescales the whole matrix by one global factor;
        the default, "length" and "sum" criteria are all exercised."""
        # Default criterion: the expectations divide by the global sum
        # (b sums to 6, e to 7) -- identical to the "sum" block below.
        w = Normalization()
        test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
                      (self.c, self.c),
                      (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
                                       [1./7.0,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)

        # "length" criterion: divide by the global Frobenius norm
        # (sqrt(14) for b, sqrt(15) for e).
        w = Normalization(criterion = "length")
        test_cases = [(self.b, np.mat([[0.26726124,0.53452248,0.80178373]])),
                      (self.c, self.c),
                      (self.e, np.mat([[0.25819889,0.51639778,0.77459667],
                                       [0.25819889,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)


        # Explicit "sum" criterion: same expectations as the default above.
        w = Normalization(criterion = "sum")
        test_cases = [(self.b, np.mat([[1/6.0,2/6.0,3/6.0]])),
                      (self.c, self.c),
                      (self.e, np.mat([[1/7.0,2/7.0,3/7.0],
                                       [1./7.0,0.,0.]]))
                      ]

        for matrix_, expected in test_cases:
            self.single_case_test(matrix_, expected, w)