├── .dvc
│   ├── .gitignore
│   └── config
├── .dvcignore
├── .gitattributes
├── .gitignore
├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── pySenti4SD.iml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── Sample.csv
├── Senti4SD.model
├── Senti4SD_info
├── classification.sh
├── java
│   ├── .gitignore
│   ├── NgramsExtraction.jar.dvc
│   ├── Senti4SD-fast.jar.dvc
│   ├── Senti4SD.jar.dvc
│   └── dsm.bin.dvc
├── liblinear_solvers
├── python
│   ├── .gitignore
│   ├── classification_task.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── classification.py
│   │   ├── liblinear_multicore
│   │   │   ├── COPYRIGHT
│   │   │   ├── __init__.py
│   │   │   ├── commonutil.py
│   │   │   ├── liblinear.py
│   │   │   ├── so
│   │   │   │   └── liblinear.so.3
│   │   │   └── windows
│   │   │       └── liblinear.dll
│   │   ├── liblinearutil.py
│   │   ├── train_model.py
│   │   ├── tuning_parameter.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── core_utils.py
│   │       ├── csv_formatter.py
│   │       ├── csv_utils.py
│   │       └── report.py
│   ├── csv_processing.py
│   └── train.py
├── requirements.txt
├── test_stackoverflow.csv
├── train.sh
└── train_stackoverflow.csv
/.dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /config.local
2 | /tmp
3 | /cache
4 |
--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
1 | ['remote "origin"']
2 | url = https://dagshub.com/collab-uniba/pySenti4SD.dvc
3 |
--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
1 | # Add patterns of files dvc should ignore, which could improve
2 | # the performance. Learn more at
3 | # https://dvc.org/doc/user-guide/dvcignore
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.gitattributes
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/misc.xml
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/modules.xml
--------------------------------------------------------------------------------
/.idea/pySenti4SD.iml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/pySenti4SD.iml
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/vcs.xml
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/workspace.xml
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Collaborative Development Group
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pySenti4SD
2 | Python implementation of Senti4SD. Senti4SD is an emotion polarity classifier specifically trained to support sentiment analysis in developers' communication channels.
3 | Senti4SD is trained and evaluated on a gold standard of over 4K posts extracted from Stack Overflow. It is part of the Collab Emotion Mining Toolkit, ([EMTk](https://github.com/collab-uniba/EMTk)).
4 |
5 | ## Fair Use Policy
6 | Please, cite the following paper if you intend to use our tool for your own research:
7 | > Calefato, F., Lanubile, F., Maiorano, F., Novielli N. (2018) "Sentiment Polarity Detection for Software Development," _Empirical Software Engineering_, 23(3), pp:1352-1382, doi: https://doi.org/10.1007/s10664-017-9546-9. [(BibTeX)](https://scholar.googleusercontent.com/scholar.bib?q=info:2Vtb0Wmx7hEJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAW9gCvJzwrHV1MKhoxzqLaJZA8lPDFxgx&scisf=4&ct=citation&cd=-1&hl=en)
8 |
9 | ## How do I get set up? ##
10 |
11 | ### Installation ###
12 |
13 | **NOTE**: You will need to install [dvc](https://dvc.org) to check out this project. Once installed and initialized, simply run the following:
14 |
15 | ```bash
16 | git clone https://github.com/collab-uniba/pySenti4SD.git
17 | cd pySenti4SD
18 | dvc pull -r origin
19 | ```
20 |
21 | ### Requirements ###
22 |
23 | * dvc
24 | * java 8+
25 | * python 3.7+
26 | * Libraries
27 | * ```numpy, pandas, scipy, scikit-learn, joblib```
28 | * Installation:
29 | ```pip install -r requirements.txt```
30 |
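For reference, the library list above corresponds to a requirements file along these lines (illustrative and unpinned; the ```requirements.txt``` shipped with the repo may pin specific versions):
```text
numpy
pandas
scipy
scikit-learn
joblib
```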
31 |
32 | ## Usage ##
33 | In the following, we first show how to train a new model for polarity classification and then how to test the model on unseen data.
34 | For testing purposes, you can use the Sample.csv input file available in the root of the repo.
35 | ### Train a new classification model ###
36 | ```bash
37 | sh train.sh -i train.csv [-d csv_delimiter] [-g] [-c chunk-size] [-j jobs-number] [-o model-name]
38 | ```
39 | or you can run the script with two separate datasets, one for training and the other for testing:
40 | ```bash
41 | sh train.sh -i train.csv -i test.csv [-d csv_delimiter] [-g] [-c chunk-size] [-j jobs-number] [-o model-name]
42 | ```
43 |
44 | where
45 | * ```-i dataset.csv```: a file containing the data used to train the classification model.
46 | The dataset must contain at least the following two columns, in any order:
47 | ```text
48 | Text;Polarity
49 | …
50 | """@DrabJay: excellent suggestion! Code changed. :-)""";positive
51 | """@IgnacioOcampo, I gave up after a while I am afraid :(""";negative
52 | …
53 | ```
54 | The same format applies when a separate test set is provided.
55 | * ```-d csv-delimiter```: the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]
56 | * ```-F features```: the features to consider, where A stands for all, L for lexicon features, S for semantic features, and K for keyword features. [Default value: A]
57 | * ```-g```: enables the extraction of n-grams (i.e., unigrams and bigrams). [optional]
58 | * ```-c chunk-size```: the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]
59 | * ```-j jobs-number```: the number of cores to use during the csv reading phase. Pass -1 to use all available cores;
60 | any value higher than the number of available cores is capped to that number. [Default value: 1]
61 | * ```-o model-name```: the name of the trained model. [Default value: "Senti4SD"]
62 |
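For example, a full training run on the Stack Overflow datasets shipped with the repo might look like this (the flag values below are purely illustrative):
```bash
sh train.sh -i train_stackoverflow.csv -i test_stackoverflow.csv -g -j -1 -o MySenti4SD
```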
63 | As a result, the script will generate the following output files:
64 | * ```liblinear_perfomance/```: a subfolder containing the performance of all liblinear solvers on the given test set
65 | * ```UnigramsList``` and ```BigramsList``` files: generated when n-gram extraction is enabled
66 | * ```Model-name.model```: the trained classification model
67 | * ```Model-name_info```: a file with information about the trained classification model
68 |
69 | ### Classification task ###
70 | ```bash
71 | sh classification.sh -i dataset.csv [-d csv_delimiter] [-F features] [-g] [-t] [-m model-name] [-c chunk-size] [-j jobs-number] [-o predictions.csv]
72 | ```
73 |
74 | where
75 | * ```-i dataset.csv```: a file containing the documents to classify.
76 | The dataset must contain at least the following column:
77 | ```text
78 | Text
79 | …
80 | """@DrabJay: excellent suggestion! Code changed. :-)"""
81 | """@IgnacioOcampo, I gave up after a while I am afraid :("""
82 | …
83 | ```
84 | If the dataset contains a column named ID, its values will also be saved in the predictions.csv file.
85 | * ```-d csv-delimiter```: the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]
86 | * ```-F features```: the features to consider, where A stands for all, L for lexicon features, S for semantic features, and K for keyword features. [Default value: A]
87 | * ```-g```: enables the use of custom UnigramsList and BigramsList files. [optional]
88 | * ```-t```: saves the input documents along with the predicted labels inside the "predictions.csv" file. [optional]
89 | * ```-m model-name```: the name of the classification model used to classify the documents. [Default value: "Senti4SD"]
90 | * ```-c chunk-size```: the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]
91 | * ```-j jobs-number```: the number of cores to use during the csv reading phase. Pass -1 to use all available cores;
92 | any value higher than the number of available cores is capped to that number. [Default value: 1]
93 | * ```-o prediction-file-name```: the name of the csv file where the model predictions are saved. [Default value: "predictions.csv"]
94 |
95 | As a result, the script will create a ```prediction-file-name.csv``` file inside the ```predictions``` folder, containing:
96 | ```text
97 | Polarity
98 | …
99 | positive
100 | negative
101 | …
102 | ```
103 | If, for example, the input dataset contains a column named "ID" and the ```-t``` parameter is used, the ```prediction-file-name.csv``` will look like this:
104 | ```text
105 | ID,Text,Polarity
106 | …
107 | 21,"""@DrabJay: excellent suggestion! Code changed. :-)""",positive
108 | 22,"""@IgnacioOcampo, I gave up after a while I am afraid :(""",negative
109 | …
110 | ```
111 | For example, if you wanted to detect the polarity of the documents in the input file Sample.csv, you would have to run:
112 |
113 | ```bash
114 | sh classification.sh -i Sample.csv -d sc
115 | ```
116 |
--------------------------------------------------------------------------------
/Sample.csv:
--------------------------------------------------------------------------------
1 | ID;Text
2 | 1;I swear - I don't put pseudo code I get told off for having bad variable names and things that don't match... I put pseudocode and I still get grief!
3 | 2;Reinnstalled Xcode4 - same thing. Awful!
4 | 3;Yeah, it's definitely annoying!
5 | 4;I really hate people who downvote for no reason. Just tell me what your problem is in a comment after you downvote. God!
6 | 5;That's depressing :/ Are you sure I can't get a collection of all controls on the page with a particular class and give them a single data source?
7 | 6;yeah it working fine :) Thanks !!
8 | 7;Excellent tutorial!
9 | 8;This is amazing. Thanks so much for explaining. Excellent explanation!
10 | 9;Love this solution!
11 | 10;Sweet :) Happy hacking!
12 | 11;I want them to resize based on the length of the data they're showing.
13 | 12;Do you have jQuery loaded correctly?
14 | 13;For Python 3 the following will work.
15 | 14;If you're really worried about this, Java is not the language for you
16 | 15;I would continue running in the background and set an (called by the os when you app is REALLY killed) and there use
17 |
--------------------------------------------------------------------------------
/Senti4SD_info:
--------------------------------------------------------------------------------
1 | Solver name: L1-regularized logistic regression
2 | Solver value: 6
3 | C value: 0.5
4 | Accuracy score: 0.8748114630467572
5 | Performance on test set:
6 | precision recall f1-score support
7 |
8 | negative 0.82 0.88 0.85 360
9 | neutral 0.87 0.82 0.85 508
10 | positive 0.92 0.93 0.93 458
11 |
12 | micro avg 0.87 0.87 0.87 1326
13 | macro avg 0.87 0.88 0.87 1326
14 | weighted avg 0.88 0.87 0.87 1326
15 |
--------------------------------------------------------------------------------
/classification.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname "$0")
4 |
5 | inputFile=""
6 | csvDelimiter='c'
7 | features='A'
8 | grams=false
9 | documents=false
10 | model="$SCRIPTDIR/Senti4SD.model"
11 | chunkSize=200
12 | jobsNumber=1
13 | outputFile="$SCRIPTDIR/predictions.csv"
14 |
15 | help(){
16 | echo "Usage: sh classification.sh -i input.csv [-d delimiter] [-F features] [-g] [-t] [-m model] [-c chunk_size] [-j jobs_number] [-o predictions.csv]"
17 | echo "-i input file to classify [required]"
18 | echo '-d delimiter used in csv file, "c" for comma or "sc" for semicolon'
19 | echo '-F -- all features to be considered. A stands for all, L stands for lexicon features, S stands for semantic features and K stands for keyword features. [Default value: A]'
20 | echo "-g -- enables use of custom UnigramsList and BigramsList [optional]"
21 | echo "-t -- enables documents saving along with the prediction labels inside 'predictions.csv' file. [optional]"
22 | echo "-m prediction model [default = Senti4SD]"
23 | echo "-c chunk size [default = 200]"
24 | echo "-j number of jobs for parallelism; pass '-1' to use all available cores [default = 1]"
25 | echo "-o output file with predicted label [default = predictions.csv]"
26 | exit 1
27 | }
28 |
29 | NUMARGS=$#
30 | if [ $NUMARGS -eq 0 ]; then
31 | help
32 | exit 1
33 | fi
34 |
35 | while getopts "hi:d:F:m:c:j:o:tg" OPTIONS; do
36 | case $OPTIONS in
37 | h)
38 | help
39 | ;;
40 | i)
41 | inputFile=$OPTARG
42 | ;;
43 | d)
44 | csvDelimiter="$OPTARG"
45 | ;;
46 | t)
47 | documents=true
48 | ;;
49 | g)
50 | grams=true
51 | ;;
52 | F)
53 | features=$OPTARG
54 | ;;
55 | m)
56 | model="$SCRIPTDIR/$OPTARG"
57 | ;;
58 | c)
59 | chunkSize=$OPTARG
60 | ;;
61 | j)
62 | jobsNumber=$OPTARG
63 | ;;
64 | o)
65 | outputFile="$SCRIPTDIR/$OPTARG"
66 | ;;
67 | \?)
68 | echo -e \\n"Option $OPTARG not allowed."
69 | help
70 | ;;
71 | esac
72 | done
73 |
74 | if [ -z $inputFile ]; then
75 | echo "input csv file is required!"
76 | exit 1
77 | fi
78 | if [ ! -f $inputFile ]; then
79 | echo "File $inputFile not found!"
80 | exit 1
81 | fi
82 |
83 | mkdir -p $SCRIPTDIR/temp_features;
84 |
85 | python $SCRIPTDIR/python/csv_processing.py -i $inputFile -d $csvDelimiter -c text
86 |
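# Derive the name of the intermediate csv consumed by the jar (e.g. input.csv -> input_jar.csv),
# which the csv_processing.py step above is expected to have produced.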
87 | IFS='.' read -ra FILENAMESPLIT <<< "$inputFile"
88 | jarInputFile="${FILENAMESPLIT[0]}_jar.csv"
89 |
90 | if [ "$grams" = true ] ; then
91 | unigramsFile="$SCRIPTDIR/UnigramsList"
92 | bigramsFile="$SCRIPTDIR/BigramsList"
93 | echo $unigramsFile
94 | echo $bigramsFile
95 | if [ ! -f $unigramsFile ]; then
96 | echo "File $unigramsFile not found!"
97 | exit 1
98 | fi
99 | if [ ! -f $bigramsFile ]; then
100 | echo "File $bigramsFile not found!"
101 | exit 1
102 | fi
103 |
104 | #-F A: all features to be considered
105 | #-i file_name: a file containing one document per line
106 | #-W cbow600.bin: DSM to be loaded
107 | #-oc file_name.csv: output dataset containing the extracted features
108 | #-vd numeric: vector size (for cbow600.bin the size is 600)
109 | #-L: if present, the corpus has a label column [optional]
110 | #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list is used [optional]
111 | #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list is used [optional]
112 |
113 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600 -ul $unigramsFile -bl $bigramsFile
114 |
115 | if [ "$documents" = true ] ; then
116 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -t -m $model -c $chunkSize -j $jobsNumber -o $outputFile
117 | else
118 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -m $model -c $chunkSize -j $jobsNumber -o $outputFile
119 | fi
120 |
121 | rm -rf $SCRIPTDIR/temp_features
122 | rm $jarInputFile
123 | else
124 | #-F A: all features to be considered
125 | #-i file_name: a file containing one document per line
126 | #-W cbow600.bin: DSM to be loaded
127 | #-oc file_name.csv: output dataset containing the extracted features
128 | #-vd numeric: vector size (for cbow600.bin the size is 600)
129 | #-L: if present, the corpus has a label column [optional]
130 | #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list is used [optional]
131 | #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list is used [optional]
132 |
133 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600
134 |
135 | if [ "$documents" = true ] ; then
136 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -t -m $model -c $chunkSize -j $jobsNumber -o $outputFile
137 | else
138 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -m $model -c $chunkSize -j $jobsNumber -o $outputFile
139 | fi
140 |
141 | rm -rf $SCRIPTDIR/temp_features
142 | rm $jarInputFile
143 | fi
144 |
--------------------------------------------------------------------------------
/java/.gitignore:
--------------------------------------------------------------------------------
1 | /NgramsExtraction.jar
2 | /Senti4SD-fast.jar
3 | /Senti4SD.jar
4 | /dsm.bin
5 |
--------------------------------------------------------------------------------
/java/NgramsExtraction.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: d66bbaaa07739cfbd4cb8b94565d43f8
3 | size: 13568432
4 | path: NgramsExtraction.jar
5 |
--------------------------------------------------------------------------------
/java/Senti4SD-fast.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 0ab05dae382556f265736d7a80d7b5e8
3 | size: 61243352
4 | path: Senti4SD-fast.jar
5 |
--------------------------------------------------------------------------------
/java/Senti4SD.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 37c5228913ac9d2a76e6313023cd8160
3 | size: 51300173
4 | path: Senti4SD.jar
5 |
--------------------------------------------------------------------------------
/java/dsm.bin.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 395cb470cacd6e584508d108945b67d3
3 | size: 835369209
4 | path: dsm.bin
5 |
--------------------------------------------------------------------------------
/liblinear_solvers:
--------------------------------------------------------------------------------
1 | L2-regularized logistic regression (primal)
2 | L2-regularized L2-loss support vector classification (dual)
3 | L2-regularized L2-loss support vector classification (primal)
4 | L2-regularized L1-loss support vector classification (dual)
5 | support vector classification by Crammer and Singer
6 | L1-regularized L2-loss support vector classification
7 | L1-regularized logistic regression
8 | L2-regularized logistic regression (dual)
9 |
--------------------------------------------------------------------------------
/python/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
--------------------------------------------------------------------------------
/python/classification_task.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core'))
4 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/liblinear_multicore'))
6 |
7 | import argparse
8 | import logging
9 | from pathlib import Path
10 |
11 | from core.classification import Classification
12 | from core.utils.csv_utils import CsvUtils
13 | from core.utils.core_utils import CoreUtils
14 |
15 |
16 | logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', level = logging.INFO)
17 |
18 | def main():
19 | parser = argparse.ArgumentParser(description = "Classification task")
20 | parser.add_argument('-i',
21 | '--input',
22 | help = "path to csv file.",
23 | type = str,
24 | action = 'append',
25 | required = True)
26 | parser.add_argument('-d',
27 | '--delimiter',
28 | help = 'csv delimiter, use c for comma and sc for semicolon',
29 | type = str,
30 | default = 'c')
31 | parser.add_argument('-t',
32 | '--text',
33 | help = 'enables documents saving along with the prediction labels inside "predictions.csv" file.',
34 | action = "store_true")
35 | parser.add_argument('-m',
36 | '--model',
37 | help = 'prediction model (default = Senti4SD.model)',
38 | type = str,
39 | default = "Senti4SD.model")
40 | parser.add_argument('-c',
41 | '--chunk-size',
42 | help = 'chunk size (default = 1000)',
43 | type = int,
44 | default = 1000)
45 | parser.add_argument('-j',
46 | '--jobs-number',
47 | help = 'number of jobs for parallelism (default = 1)',
48 | type = int,
49 | default = 1)
50 | parser.add_argument('-o',
51 | '--output',
52 | help = 'prediction file name',
53 | type = str,
54 | default = 'predictions.csv')
55 | args = parser.parse_args()
56 |
57 | #TODO Add again second input line
58 | if len(args.input) == 2:
59 | jar_csv = args.input[0]
60 | input_csv = args.input[1]
61 | jar_csv = Path(jar_csv).resolve()
62 | input_csv = Path(input_csv).resolve()
63 | elif len(args.input) > 2:
64 | logging.error("Too many input files. Expected exactly two: [jar generated csv] [input csv]")
65 | sys.exit(1)
66 | elif len(args.input) < 2:
67 | logging.error("Two input files are required. [jar generated csv] [input csv]")
68 | sys.exit(1)
69 |
70 | try:
71 | CsvUtils.check_csv(jar_csv)
72 | CsvUtils.check_csv(input_csv)
73 | except OSError as e:
74 | logging.error(e)
75 | sys.exit(1)
76 |
77 | if not Path(args.model).exists():
78 | print("Model doesn't exist. Provide a correct path to the model, or train a new one using the train script.")
79 | sys.exit(1)
80 |
81 | output_path = Path(f"{Path.cwd()}/predictions")
82 | output_path.mkdir(parents = True, exist_ok = True )
83 | output_path = f"{output_path.resolve()}/{args.output}"
84 | classification = Classification(args.model)
85 | logging.info("Starting classification task")
86 | classification.predict(jar_csv, args.chunk_size, CoreUtils.check_jobs_number(args.jobs_number), output_path)
87 | logging.info("Ending classification task")
88 | logging.info("Starting ordering prediction csv")
89 | CsvUtils.order_csv(output_path, 'ID')
90 | logging.info("Ending ordering prediction csv")
91 | logging.info("Starting rewriting prediction csv")
92 | if args.delimiter.lower() == 'c':
93 | classification.write_id_and_text(input_csv, ',', output_path, args.text)
94 | elif args.delimiter.lower() == 'sc':
95 | classification.write_id_and_text(input_csv, ';', output_path, args.text)
96 | logging.info("Ending rewriting prediction csv")
97 |
98 |
99 |
100 | if __name__ == '__main__':
101 | main()
102 |
--------------------------------------------------------------------------------
/python/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/__init__.py
--------------------------------------------------------------------------------
/python/core/classification.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import glob
4 | from multiprocessing import Pool
5 | from collections import OrderedDict
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from joblib import Parallel, delayed
10 | from sklearn.preprocessing import LabelEncoder
11 |
12 | from utils.csv_utils import CsvUtils
13 | from utils.csv_formatter import CsvFormatter
14 |
15 | from liblinearutil import *
16 |
17 | class Classification():
18 |
19 | def __init__(self, model):
20 | self.model = model
21 |
22 | def __create_classification_file(self, pred_csv):
23 | with open(pred_csv, 'w+') as prediction:
24 | prediction.write("ID,PREDICTED\n")
25 | prediction.close()
26 |
27 | def __clean_id(self, id):
28 | temp = id.split(',')[0]
29 | temp = temp.replace('t', "")
30 | return int(temp)
31 |
32 | def __convert_lines_and_predict(self, rows, label_encoder, pred_file):
33 | model = load_model(self.model)
34 | X = np.array([])
35 | splitted_rows_id = []
36 | first = True
37 | for i in range(0, len(rows)):
38 | values = rows[i].split(',')
39 | splitted_rows_id.append(values[0])
40 | splitted_row_features = [float(value) for value in values[1:]]
41 | if first:
42 | X = np.array(splitted_row_features)
43 | first = False
44 | else:
45 | X = np.append(X, np.array(splitted_row_features))
46 | X = X.reshape((i+1, len(splitted_row_features)))
47 | y_pred, y_acc, y_val = predict([], X, model, '-q')
48 | y_pred = [int(label) for label in y_pred]
49 | y_pred = label_encoder.inverse_transform(y_pred)
50 | y_pred = [pred.replace('\n', "") for pred in y_pred]
51 | dataframe = OrderedDict()
52 | dataframe.update({'id': [(self.__clean_id(row_id) + 1) for row_id in splitted_rows_id]})
53 | dataframe.update({'predicted' : y_pred})
54 | CsvUtils.write_to_csv(dataframe, pred_file, ',', False, 'a+')
55 |
56 | def predict(self, csv_file, chunk_size, jobs_number, pred_file):
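# Read the feature csv in chunks: the requested chunk size is split across the workers,
# each parallel job turns its rows into a feature matrix, runs liblinear prediction on it,
# and appends the resulting (id, predicted label) pairs to pred_file.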
57 | self.__create_classification_file(pred_file)
58 | chunk_size = int(chunk_size / jobs_number)
59 | stop = False
60 | label_encoder = LabelEncoder()
61 | label_encoder.fit(['positive', 'negative', 'neutral'])
62 | with open(csv_file, 'r+') as csv:
63 | next(csv)
64 | while not stop:
65 | read_rows = []
66 | try:
67 | for _ in range(jobs_number):
68 | temp_rows = []
69 | for _ in range (chunk_size):
70 | temp_rows.append(next(csv))
71 | read_rows.append(temp_rows)
72 | except StopIteration:
73 | stop = True
74 | read_rows.append(temp_rows)
75 | finally:
76 | Parallel(n_jobs = jobs_number)(delayed(self.__convert_lines_and_predict)(rows, label_encoder, pred_file) for rows in read_rows)
77 | csv.close()
78 |
79 | def write_id_and_text(self, input_csv, csv_delimiter, pred_csv, text = False):
80 | dataframe = OrderedDict()
81 | try:
82 | csv_fomatter = CsvFormatter(['ID'], csv_delimiter)
83 | dataframe.update(csv_fomatter.get_rows(input_csv))
84 | except IOError as e:
85 | print(e)
86 | if text:
87 | try:
88 | csv_fomatter = CsvFormatter(['TEXT'], csv_delimiter)
89 | dataframe.update(csv_fomatter.get_rows(input_csv))
90 | except IOError as e:
91 | print(e)
92 | if dataframe:
93 | temp = pd.read_csv(pred_csv, delimiter = ",")
94 | dataframe.update({'PREDICTED': temp.iloc[:, -1:].values.ravel()})
95 | CsvUtils.write_to_csv(dataframe, pred_csv, ',', True)
96 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/COPYRIGHT:
--------------------------------------------------------------------------------
1 |
2 | Copyright (c) 2007-2019 The LIBLINEAR Project.
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions
7 | are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright
10 | notice, this list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 |
16 | 3. Neither name of copyright holders nor the names of its contributors
17 | may be used to endorse or promote products derived from this software
18 | without specific prior written permission.
19 |
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/__init__.py
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/commonutil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import print_function
4 | import sys
5 |
6 | try:
7 | import scipy
8 | from scipy import sparse
9 | except:
10 | scipy = None
11 | sparse = None
12 |
13 |
14 | __all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
15 |
16 | def svm_read_problem(data_file_name, return_scipy=False):
17 | """
18 | svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
19 | svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
20 |
21 | Read LIBSVM-format data from data_file_name and return labels y
22 | and data instances x.
23 | """
24 | prob_y = []
25 | prob_x = []
26 | row_ptr = [0]
27 | col_idx = []
28 | for i, line in enumerate(open(data_file_name)):
29 | line = line.split(None, 1)
30 | # In case an instance with all zero features
31 | if len(line) == 1: line += ['']
32 | label, features = line
33 | prob_y += [float(label)]
34 | if scipy != None and return_scipy:
35 | nz = 0
36 | for e in features.split():
37 | ind, val = e.split(":")
38 | val = float(val)
39 | if val != 0:
40 | col_idx += [int(ind)-1]
41 | prob_x += [val]
42 | nz += 1
43 | row_ptr += [row_ptr[-1]+nz]
44 | else:
45 | xi = {}
46 | for e in features.split():
47 | ind, val = e.split(":")
48 | xi[int(ind)] = float(val)
49 | prob_x += [xi]
50 | if scipy != None and return_scipy:
51 | prob_y = scipy.array(prob_y)
52 | prob_x = scipy.array(prob_x)
53 | col_idx = scipy.array(col_idx)
54 | row_ptr = scipy.array(row_ptr)
55 | prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
56 | return (prob_y, prob_x)
57 |
58 | def evaluations_scipy(ty, pv):
59 | """
60 | evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
61 | ty, pv: ndarray
62 |
63 | Calculate accuracy, mean squared error and squared correlation coefficient
64 | using the true values (ty) and predicted values (pv).
65 | """
66 | if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
67 | raise TypeError("type of ty and pv must be ndarray")
68 | if len(ty) != len(pv):
69 | raise ValueError("len(ty) must be equal to len(pv)")
70 | ACC = 100.0*(ty == pv).mean()
71 | MSE = ((ty - pv)**2).mean()
72 | l = len(ty)
73 | sumv = pv.sum()
74 | sumy = ty.sum()
75 | sumvy = (pv*ty).sum()
76 | sumvv = (pv*pv).sum()
77 | sumyy = (ty*ty).sum()
78 | with scipy.errstate(all = 'raise'):
79 | try:
80 | SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
81 | except:
82 | SCC = float('nan')
83 | return (float(ACC), float(MSE), float(SCC))
84 |
85 | def evaluations(ty, pv, useScipy = True):
86 | """
87 | evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
88 | ty, pv: list, tuple or ndarray
89 | useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
90 |
91 | Calculate accuracy, mean squared error and squared correlation coefficient
92 | using the true values (ty) and predicted values (pv).
93 | """
94 | if scipy != None and useScipy:
95 | return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
96 | if len(ty) != len(pv):
97 | raise ValueError("len(ty) must be equal to len(pv)")
98 | total_correct = total_error = 0
99 | sumv = sumy = sumvv = sumyy = sumvy = 0
100 | for v, y in zip(pv, ty):
101 | if y == v:
102 | total_correct += 1
103 | total_error += (v-y)*(v-y)
104 | sumv += v
105 | sumy += y
106 | sumvv += v*v
107 | sumyy += y*y
108 | sumvy += v*y
109 | l = len(ty)
110 | ACC = 100.0*total_correct/l
111 | MSE = total_error/l
112 | try:
113 | SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
114 | except:
115 | SCC = float('nan')
116 | return (float(ACC), float(MSE), float(SCC))
117 |
118 | def csr_find_scale_param(x, lower=-1, upper=1):
119 | assert isinstance(x, sparse.csr_matrix)
120 | assert lower < upper
121 | l, n = x.shape
122 | feat_min = x.min(axis=0).toarray().flatten()
123 | feat_max = x.max(axis=0).toarray().flatten()
124 | coef = (feat_max - feat_min) / (upper - lower)
125 | coef[coef != 0] = 1.0 / coef[coef != 0]
126 |
127 | # (x - ones(l,1) * feat_min') * diag(coef) + lower
128 | # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
129 | # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
130 | # = x * diag(coef) + ones(l, 1) * offset'
131 | offset = -feat_min * coef + lower
132 | offset[coef == 0] = 0
133 |
134 | if sum(offset != 0) * l > 3 * x.getnnz():
135 | print(
136 | "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
137 | "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
138 | file=sys.stderr)
139 |
140 | return {'coef':coef, 'offset':offset}
141 |
142 | def csr_scale(x, scale_param):
143 | assert isinstance(x, sparse.csr_matrix)
144 |
145 | offset = scale_param['offset']
146 | coef = scale_param['coef']
147 | assert len(coef) == len(offset)
148 |
149 | l, n = x.shape
150 |
151 | if not n == len(coef):
152 | print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
153 | coef = resize(coef, n)
154 | offset = resize(offset, n)
155 |
156 | # scaled_x = x * diag(coef) + ones(l, 1) * offset'
157 | offset = sparse.csr_matrix(offset.reshape(1, n))
158 | offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
159 | scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
160 |
161 | if scaled_x.getnnz() > x.getnnz():
162 | print(
163 | "WARNING: original #nonzeros %d\n" % x.getnnz() +
164 | " > new #nonzeros %d\n" % scaled_x.getnnz() +
165 | "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
166 | file=sys.stderr)
167 |
168 | return scaled_x
169 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/liblinear.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from ctypes import *
4 | from ctypes.util import find_library
5 | from os import path
6 | import sys
7 |
8 | try:
9 | import scipy
10 | from scipy import sparse
11 | except:
12 | scipy = None
13 | sparse = None
14 |
15 | if sys.version_info[0] < 3:
16 | range = xrange
17 | from itertools import izip as zip
18 |
19 | __all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem',
20 | 'parameter', 'model', 'toPyModel', 'L2R_LR', 'L2R_L2LOSS_SVC_DUAL',
21 | 'L2R_L2LOSS_SVC', 'L2R_L1LOSS_SVC_DUAL', 'MCSVM_CS',
22 | 'L1R_L2LOSS_SVC', 'L1R_LR', 'L2R_LR_DUAL', 'L2R_L2LOSS_SVR',
23 | 'L2R_L2LOSS_SVR_DUAL', 'L2R_L1LOSS_SVR_DUAL', 'print_null']
24 |
25 | try:
26 | dirname = path.dirname(path.abspath(__file__))
27 | if sys.platform == 'win32':
28 | liblinear = CDLL(path.join(dirname, r'windows\liblinear.dll'))
29 | else:
30 | liblinear = CDLL(path.join(dirname, 'so/liblinear.so.3'))
31 | except:
32 | # For unix the prefix 'lib' is not considered.
33 | if find_library('linear'):
34 | liblinear = CDLL(find_library('linear'))
35 | elif find_library('liblinear'):
36 | liblinear = CDLL(find_library('liblinear'))
37 | else:
38 | raise Exception('LIBLINEAR library not found.')
39 |
40 | L2R_LR = 0
41 | L2R_L2LOSS_SVC_DUAL = 1
42 | L2R_L2LOSS_SVC = 2
43 | L2R_L1LOSS_SVC_DUAL = 3
44 | MCSVM_CS = 4
45 | L1R_L2LOSS_SVC = 5
46 | L1R_LR = 6
47 | L2R_LR_DUAL = 7
48 | L2R_L2LOSS_SVR = 11
49 | L2R_L2LOSS_SVR_DUAL = 12
50 | L2R_L1LOSS_SVR_DUAL = 13
51 |
52 | PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)
53 | def print_null(s):
54 | return
55 |
56 | def genFields(names, types):
57 | return list(zip(names, types))
58 |
59 | def fillprototype(f, restype, argtypes):
60 | f.restype = restype
61 | f.argtypes = argtypes
62 |
63 | class feature_node(Structure):
64 | _names = ["index", "value"]
65 | _types = [c_int, c_double]
66 | _fields_ = genFields(_names, _types)
67 |
68 | def __str__(self):
69 | return '%d:%g' % (self.index, self.value)
70 |
71 | def gen_feature_nodearray(xi, feature_max=None):
72 | if feature_max:
73 | assert(isinstance(feature_max, int))
74 |
75 | xi_shift = 0 # ensure correct indices of xi
76 | if scipy and isinstance(xi, tuple) and len(xi) == 2\
77 | and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
78 | index_range = xi[0] + 1 # index starts from 1
79 | if feature_max:
80 | index_range = index_range[scipy.where(index_range <= feature_max)]
81 | elif scipy and isinstance(xi, scipy.ndarray):
82 | xi_shift = 1
83 | index_range = xi.nonzero()[0] + 1 # index starts from 1
84 | if feature_max:
85 | index_range = index_range[scipy.where(index_range <= feature_max)]
86 | elif isinstance(xi, (dict, list, tuple)):
87 | if isinstance(xi, dict):
88 | index_range = xi.keys()
89 | elif isinstance(xi, (list, tuple)):
90 | xi_shift = 1
91 | index_range = range(1, len(xi) + 1)
92 | index_range = filter(lambda j: xi[j-xi_shift] != 0, index_range)
93 |
94 | if feature_max:
95 | index_range = filter(lambda j: j <= feature_max, index_range)
96 | index_range = sorted(index_range)
97 | else:
98 | raise TypeError('xi should be a dictionary, list, tuple, 1-d numpy array, or tuple of (index, data)')
99 |
100 | ret = (feature_node*(len(index_range)+2))()
101 | ret[-1].index = -1 # for bias term
102 | ret[-2].index = -1
103 |
104 | if scipy and isinstance(xi, tuple) and len(xi) == 2\
105 | and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
106 | for idx, j in enumerate(index_range):
107 | ret[idx].index = j
108 | ret[idx].value = (xi[1])[idx]
109 | else:
110 | for idx, j in enumerate(index_range):
111 | ret[idx].index = j
112 | ret[idx].value = xi[j - xi_shift]
113 |
114 | max_idx = 0
115 | if len(index_range) > 0:
116 | max_idx = index_range[-1]
117 | return ret, max_idx
118 |
119 | try:
120 | from numba import jit
121 | jit_enabled = True
122 | except:
123 | jit = lambda x: x
124 | jit_enabled = False
125 |
126 | @jit
127 | def csr_to_problem_jit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
128 | for i in range(l):
129 | b1,e1 = x_rowptr[i], x_rowptr[i+1]
130 | b2,e2 = prob_rowptr[i], prob_rowptr[i+1]-2
131 | for j in range(b1,e1):
132 | prob_ind[j-b1+b2] = x_ind[j]+1
133 | prob_val[j-b1+b2] = x_val[j]
134 | def csr_to_problem_nojit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
135 | for i in range(l):
136 | x_slice = slice(x_rowptr[i], x_rowptr[i+1])
137 | prob_slice = slice(prob_rowptr[i], prob_rowptr[i+1]-2)
138 | prob_ind[prob_slice] = x_ind[x_slice]+1
139 | prob_val[prob_slice] = x_val[x_slice]
140 |
141 | def csr_to_problem(x, prob):
142 | # Extra space for termination node and (possibly) bias term
143 | x_space = prob.x_space = scipy.empty((x.nnz+x.shape[0]*2), dtype=feature_node)
144 | prob.rowptr = x.indptr.copy()
145 | prob.rowptr[1:] += 2*scipy.arange(1,x.shape[0]+1)
146 | prob_ind = x_space["index"]
147 | prob_val = x_space["value"]
148 | prob_ind[:] = -1
149 | if jit_enabled:
150 | csr_to_problem_jit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
151 | else:
152 | csr_to_problem_nojit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
153 |
154 | class problem(Structure):
155 | _names = ["l", "n", "y", "x", "bias"]
156 | _types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
157 | _fields_ = genFields(_names, _types)
158 |
159 | def __init__(self, y, x, bias = -1):
160 | if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
161 | raise TypeError("type of y: {0} is not supported!".format(type(y)))
162 |
163 | if isinstance(x, (list, tuple)):
164 | if len(y) != len(x):
165 | raise ValueError("len(y) != len(x)")
166 | elif scipy != None and isinstance(x, (scipy.ndarray, sparse.spmatrix)):
167 | if len(y) != x.shape[0]:
168 | raise ValueError("len(y) != len(x)")
169 | if isinstance(x, scipy.ndarray):
170 | x = scipy.ascontiguousarray(x) # enforce row-major
171 | if isinstance(x, sparse.spmatrix):
172 | x = x.tocsr()
173 | pass
174 | else:
175 | raise TypeError("type of x: {0} is not supported!".format(type(x)))
176 | self.l = l = len(y)
177 | self.bias = -1
178 |
179 | max_idx = 0
180 | x_space = self.x_space = []
181 | if scipy != None and isinstance(x, sparse.csr_matrix):
182 | csr_to_problem(x, self)
183 | max_idx = x.shape[1]
184 | else:
185 | for i, xi in enumerate(x):
186 | tmp_xi, tmp_idx = gen_feature_nodearray(xi)
187 | x_space += [tmp_xi]
188 | max_idx = max(max_idx, tmp_idx)
189 | self.n = max_idx
190 |
191 | self.y = (c_double * l)()
192 | if scipy != None and isinstance(y, scipy.ndarray):
193 | scipy.ctypeslib.as_array(self.y, (self.l,))[:] = y
194 | else:
195 | for i, yi in enumerate(y): self.y[i] = yi
196 |
197 | self.x = (POINTER(feature_node) * l)()
198 | if scipy != None and isinstance(x, sparse.csr_matrix):
199 | base = addressof(self.x_space.ctypes.data_as(POINTER(feature_node))[0])
200 | x_ptr = cast(self.x, POINTER(c_uint64))
201 | x_ptr = scipy.ctypeslib.as_array(x_ptr,(self.l,))
202 | x_ptr[:] = self.rowptr[:-1]*sizeof(feature_node)+base
203 | else:
204 | for i, xi in enumerate(self.x_space): self.x[i] = xi
205 |
206 | self.set_bias(bias)
207 |
208 | def set_bias(self, bias):
209 | if self.bias == bias:
210 | return
211 | if bias >= 0 and self.bias < 0:
212 | self.n += 1
213 | node = feature_node(self.n, bias)
214 | if bias < 0 and self.bias >= 0:
215 | self.n -= 1
216 | node = feature_node(-1, bias)
217 |
218 | if isinstance(self.x_space, list):
219 | for xi in self.x_space:
220 | xi[-2] = node
221 | else:
222 | self.x_space["index"][self.rowptr[1:]-2] = node.index
223 | self.x_space["value"][self.rowptr[1:]-2] = node.value
224 |
225 | self.bias = bias
226 |
227 |
228 | class parameter(Structure):
229 | _names = ["solver_type", "eps", "C", "nr_thread", "nr_weight", "weight_label", "weight", "p", "init_sol"]
230 | _types = [c_int, c_double, c_double, c_int, c_int, POINTER(c_int), POINTER(c_double), c_double, POINTER(c_double)]
231 | _fields_ = genFields(_names, _types)
232 |
233 | def __init__(self, options = None):
234 | if options == None:
235 | options = ''
236 | self.parse_options(options)
237 |
238 | def __str__(self):
239 | s = ''
240 | attrs = parameter._names + list(self.__dict__.keys())
241 | values = map(lambda attr: getattr(self, attr), attrs)
242 | for attr, val in zip(attrs, values):
243 | s += (' %s: %s\n' % (attr, val))
244 | s = s.strip()
245 |
246 | return s
247 |
248 | def set_to_default_values(self):
249 | self.solver_type = L2R_L2LOSS_SVC_DUAL
250 | self.eps = float('inf')
251 | self.C = 1
252 | self.p = 0.1
253 | self.nr_thread = 1
254 | self.nr_weight = 0
255 | self.weight_label = None
256 | self.weight = None
257 | self.init_sol = None
258 | self.bias = -1
259 | self.flag_cross_validation = False
260 | self.flag_C_specified = False
261 | self.flag_solver_specified = False
262 | self.flag_find_C = False
263 | self.flag_omp = False
264 | self.nr_fold = 0
265 | self.print_func = cast(None, PRINT_STRING_FUN)
266 |
267 | def parse_options(self, options):
268 | if isinstance(options, list):
269 | argv = options
270 | elif isinstance(options, str):
271 | argv = options.split()
272 | else:
273 | raise TypeError("arg 1 should be a list or a str.")
274 | self.set_to_default_values()
275 | self.print_func = cast(None, PRINT_STRING_FUN)
276 | weight_label = []
277 | weight = []
278 |
279 | i = 0
280 | while i < len(argv) :
281 | if argv[i] == "-s":
282 | i = i + 1
283 | self.solver_type = int(argv[i])
284 | self.flag_solver_specified = True
285 | elif argv[i] == "-c":
286 | i = i + 1
287 | self.C = float(argv[i])
288 | self.flag_C_specified = True
289 | elif argv[i] == "-p":
290 | i = i + 1
291 | self.p = float(argv[i])
292 | elif argv[i] == "-e":
293 | i = i + 1
294 | self.eps = float(argv[i])
295 | elif argv[i] == "-B":
296 | i = i + 1
297 | self.bias = float(argv[i])
298 | elif argv[i] == "-v":
299 | i = i + 1
300 | self.flag_cross_validation = 1
301 | self.nr_fold = int(argv[i])
302 | if self.nr_fold < 2 :
303 | raise ValueError("n-fold cross validation: n must >= 2")
304 | elif argv[i] == "-n":
305 | i = i + 1
306 | self.flag_omp = True
307 | self.nr_thread = int(argv[i])
308 | elif argv[i].startswith("-w"):
309 | i = i + 1
310 | self.nr_weight += 1
311 | weight_label += [int(argv[i-1][2:])]
312 | weight += [float(argv[i])]
313 | elif argv[i] == "-q":
314 | self.print_func = PRINT_STRING_FUN(print_null)
315 | elif argv[i] == "-C":
316 | self.flag_find_C = True
317 |
318 | else:
319 | raise ValueError("Wrong options")
320 | i += 1
321 |
322 | liblinear.set_print_string_function(self.print_func)
323 | self.weight_label = (c_int*self.nr_weight)()
324 | self.weight = (c_double*self.nr_weight)()
325 | for i in range(self.nr_weight):
326 | self.weight[i] = weight[i]
327 | self.weight_label[i] = weight_label[i]
328 |
329 | # default solver for parameter selection is L2R_L2LOSS_SVC
330 | if self.flag_find_C:
331 | if not self.flag_cross_validation:
332 | self.nr_fold = 5
333 | if not self.flag_solver_specified:
334 | self.solver_type = L2R_L2LOSS_SVC
335 | self.flag_solver_specified = True
336 | elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC]:
337 | raise ValueError("Warm-start parameter search only available for -s 0 and -s 2")
338 | if self.flag_omp:
339 | if not self.flag_solver_specified:
340 | self.solver_type = L2R_L2LOSS_SVC
341 | self.flag_solver_specified = True
342 | elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC, L2R_L2LOSS_SVR, L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, L1R_LR, L1R_L2LOSS_SVC]:
343 | raise ValueError("Parallel LIBLINEAR is only available for -s 0, 1, 2, 3, 5, 6, 11 now")
344 |
345 | if self.eps == float('inf'):
346 | if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
347 | self.eps = 0.01
348 | elif self.solver_type in [L2R_L2LOSS_SVR]:
349 | self.eps = 0.001
350 | elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]:
351 | self.eps = 0.1
352 | elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]:
353 | self.eps = 0.01
354 | elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
355 | self.eps = 0.1
356 |
357 | class model(Structure):
358 | _names = ["param", "nr_class", "nr_feature", "w", "label", "bias"]
359 | _types = [parameter, c_int, c_int, POINTER(c_double), POINTER(c_int), c_double]
360 | _fields_ = genFields(_names, _types)
361 |
362 | def __init__(self):
363 | self.__createfrom__ = 'python'
364 |
365 | def __del__(self):
366 | # free memory created by C to avoid memory leak
367 | if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C':
368 | liblinear.free_and_destroy_model(pointer(self))
369 |
370 | def get_nr_feature(self):
371 | return liblinear.get_nr_feature(self)
372 |
373 | def get_nr_class(self):
374 | return liblinear.get_nr_class(self)
375 |
376 | def get_labels(self):
377 | nr_class = self.get_nr_class()
378 | labels = (c_int * nr_class)()
379 | liblinear.get_labels(self, labels)
380 | return labels[:nr_class]
381 |
382 | def get_decfun_coef(self, feat_idx, label_idx=0):
383 | return liblinear.get_decfun_coef(self, feat_idx, label_idx)
384 |
385 | def get_decfun_bias(self, label_idx=0):
386 | return liblinear.get_decfun_bias(self, label_idx)
387 |
388 | def get_decfun(self, label_idx=0):
389 | w = [liblinear.get_decfun_coef(self, feat_idx, label_idx) for feat_idx in range(1, self.nr_feature+1)]
390 | b = liblinear.get_decfun_bias(self, label_idx)
391 | return (w, b)
392 |
393 | def is_probability_model(self):
394 | return (liblinear.check_probability_model(self) == 1)
395 |
396 | def is_regression_model(self):
397 | return (liblinear.check_regression_model(self) == 1)
398 |
399 | def toPyModel(model_ptr):
400 | """
401 | toPyModel(model_ptr) -> model
402 |
403 | Convert a ctypes POINTER(model) to a Python model
404 | """
405 | if bool(model_ptr) == False:
406 | raise ValueError("Null pointer")
407 | m = model_ptr.contents
408 | m.__createfrom__ = 'C'
409 | return m
410 |
411 | fillprototype(liblinear.train, POINTER(model), [POINTER(problem), POINTER(parameter)])
412 | fillprototype(liblinear.find_parameter_C, None, [POINTER(problem), POINTER(parameter), c_int, c_double, c_double, POINTER(c_double), POINTER(c_double)])
413 | fillprototype(liblinear.cross_validation, None, [POINTER(problem), POINTER(parameter), c_int, POINTER(c_double)])
414 |
415 | fillprototype(liblinear.predict_values, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])
416 | fillprototype(liblinear.predict, c_double, [POINTER(model), POINTER(feature_node)])
417 | fillprototype(liblinear.predict_probability, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])
418 |
419 | fillprototype(liblinear.save_model, c_int, [c_char_p, POINTER(model)])
420 | fillprototype(liblinear.load_model, POINTER(model), [c_char_p])
421 |
422 | fillprototype(liblinear.get_nr_feature, c_int, [POINTER(model)])
423 | fillprototype(liblinear.get_nr_class, c_int, [POINTER(model)])
424 | fillprototype(liblinear.get_labels, None, [POINTER(model), POINTER(c_int)])
425 | fillprototype(liblinear.get_decfun_coef, c_double, [POINTER(model), c_int, c_int])
426 | fillprototype(liblinear.get_decfun_bias, c_double, [POINTER(model), c_int])
427 |
428 | fillprototype(liblinear.free_model_content, None, [POINTER(model)])
429 | fillprototype(liblinear.free_and_destroy_model, None, [POINTER(POINTER(model))])
430 | fillprototype(liblinear.destroy_param, None, [POINTER(parameter)])
431 | fillprototype(liblinear.check_parameter, c_char_p, [POINTER(problem), POINTER(parameter)])
432 | fillprototype(liblinear.check_probability_model, c_int, [POINTER(model)])
433 | fillprototype(liblinear.check_regression_model, c_int, [POINTER(model)])
434 | fillprototype(liblinear.set_print_string_function, None, [CFUNCTYPE(None, c_char_p)])
435 |
--------------------------------------------------------------------------------
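Note: a minimal sketch of how the option parser above turns a flag string into a populated `parameter` struct. It assumes `python/core` is on `sys.path` so that the `liblinear_multicore` package and its bundled shared library can be loaded; the values in the comments follow from the defaults and the eps table at the end of `parse_options`.

from liblinear_multicore.liblinear import parameter, L2R_L2LOSS_SVC

# "-s 2": primal L2-loss SVC, "-c 0.5": C=0.5, "-B 1": bias term, "-q": quiet training output.
param = parameter("-s 2 -c 0.5 -B 1 -q")
print(param.solver_type == L2R_L2LOSS_SVC)   # True
print(param.C, param.bias)                   # 0.5 1.0
print(param.eps)                             # 0.01, filled in because no -e was given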
/python/core/liblinear_multicore/so/liblinear.so.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/so/liblinear.so.3
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/windows/liblinear.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/windows/liblinear.dll
--------------------------------------------------------------------------------
/python/core/liblinearutil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os, sys
4 | #sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/liblinear_multicore")
5 | #print(sys.path)
6 | from liblinear_multicore.liblinear import *
7 | from liblinear_multicore.liblinear import __all__ as liblinear_all
8 | from liblinear_multicore.liblinear import scipy, sparse
9 | from liblinear_multicore.commonutil import *
10 | from liblinear_multicore.commonutil import __all__ as common_all
11 | from ctypes import c_double
12 |
13 | if sys.version_info[0] < 3:
14 | range = xrange
15 | from itertools import izip as zip
16 |
17 | __all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
18 |
19 |
20 | def load_model(model_file_name):
21 | """
22 | load_model(model_file_name) -> model
23 |
24 | Load a LIBLINEAR model from model_file_name and return.
25 | """
26 | model = liblinear.load_model(model_file_name.encode())
27 | if not model:
28 | print("can't open model file %s" % model_file_name)
29 | return None
30 | model = toPyModel(model)
31 | return model
32 |
33 | def save_model(model_file_name, model):
34 | """
35 | save_model(model_file_name, model) -> None
36 |
37 | Save a LIBLINEAR model to the file model_file_name.
38 | """
39 | liblinear.save_model(model_file_name.encode(), model)
40 |
41 | def train(arg1, arg2=None, arg3=None):
42 | """
43 | train(y, x [, options]) -> model | ACC
44 |
45 | y: a list/tuple/ndarray of l true labels (type must be int/double).
46 |
47 | x: 1. a list/tuple of l training instances. Feature vector of
48 | each training instance is a list/tuple or dictionary.
49 |
50 | 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
51 |
52 | train(prob [, options]) -> model | ACC
53 | train(prob, param) -> model | ACC
54 |
55 | Train a model from data (y, x) or a problem prob using
56 | 'options' or a parameter param.
57 |
58 | If '-v' is specified in 'options' (i.e., cross validation)
59 | either accuracy (ACC) or mean-squared error (MSE) is returned.
60 |
61 | options:
62 | -s type : set type of solver (default 1)
63 | for multi-class classification
64 | 0 -- L2-regularized logistic regression (primal)
65 | 1 -- L2-regularized L2-loss support vector classification (dual)
66 | 2 -- L2-regularized L2-loss support vector classification (primal)
67 | 3 -- L2-regularized L1-loss support vector classification (dual)
68 | 4 -- support vector classification by Crammer and Singer
69 | 5 -- L1-regularized L2-loss support vector classification
70 | 6 -- L1-regularized logistic regression
71 | 7 -- L2-regularized logistic regression (dual)
72 | for regression
73 | 11 -- L2-regularized L2-loss support vector regression (primal)
74 | 12 -- L2-regularized L2-loss support vector regression (dual)
75 | 13 -- L2-regularized L1-loss support vector regression (dual)
76 | -c cost : set the parameter C (default 1)
77 | -p epsilon : set the epsilon in loss function of SVR (default 0.1)
78 | -e epsilon : set tolerance of termination criterion
79 | -s 0 and 2
80 | |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
81 | where f is the primal function, (default 0.01)
82 | -s 11
83 | |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)
84 | -s 1, 3, 4, and 7
85 | 			Dual maximal violation <= eps; similar to libsvm (default 0.1)
86 | -s 5 and 6
87 | |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
88 | where f is the primal function (default 0.01)
89 | -s 12 and 13
90 | |f'(alpha)|_1 <= eps |f'(alpha0)|,
91 | where f is the dual function (default 0.1)
92 | -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
93 | -wi weight: weights adjust the parameter C of different classes (see README for details)
94 | -v n: n-fold cross validation mode
95 | 	    -n nr_thread : parallel version with [nr_thread] threads (default 1; only for -s 0, 1, 2, 3, 5, 6, 11)
96 | -q : quiet mode (no outputs)
97 | """
98 | prob, param = None, None
99 | if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
100 | assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
101 | y, x, options = arg1, arg2, arg3
102 | prob = problem(y, x)
103 | param = parameter(options)
104 | elif isinstance(arg1, problem):
105 | prob = arg1
106 | if isinstance(arg2, parameter):
107 | param = arg2
108 | else:
109 | param = parameter(arg2)
110 | if prob == None or param == None :
111 | raise TypeError("Wrong types for the arguments")
112 |
113 | prob.set_bias(param.bias)
114 | liblinear.set_print_string_function(param.print_func)
115 | err_msg = liblinear.check_parameter(prob, param)
116 | if err_msg :
117 | raise ValueError('Error: %s' % err_msg)
118 |
119 | if param.flag_find_C:
120 | nr_fold = param.nr_fold
121 | best_C = c_double()
122 | best_rate = c_double()
123 | max_C = 1024
124 | if param.flag_C_specified:
125 | start_C = param.C
126 | else:
127 | start_C = -1.0
128 | liblinear.find_parameter_C(prob, param, nr_fold, start_C, max_C, best_C, best_rate)
129 | print("Best C = %lf CV accuracy = %g%%\n"% (best_C.value, 100.0*best_rate.value))
130 | return best_C.value,best_rate.value
131 |
132 |
133 | elif param.flag_cross_validation:
134 | l, nr_fold = prob.l, param.nr_fold
135 | target = (c_double * l)()
136 | liblinear.cross_validation(prob, param, nr_fold, target)
137 | ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
138 | if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
139 | print("Cross Validation Mean squared error = %g" % MSE)
140 | print("Cross Validation Squared correlation coefficient = %g" % SCC)
141 | return MSE
142 | else:
143 | print("Cross Validation Accuracy = %g%%" % ACC)
144 | return ACC
145 | else:
146 | m = liblinear.train(prob, param)
147 | m = toPyModel(m)
148 |
149 | return m
150 |
151 | def predict(y, x, m, options=""):
152 | """
153 | predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)
154 |
155 | y: a list/tuple/ndarray of l true labels (type must be int/double).
156 | It is used for calculating the accuracy. Use [] if true labels are
157 | unavailable.
158 |
159 | x: 1. a list/tuple of l training instances. Feature vector of
160 | each training instance is a list/tuple or dictionary.
161 |
162 | 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
163 |
164 | Predict data (y, x) with the SVM model m.
165 | options:
166 | -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
167 | -q quiet mode (no outputs)
168 |
169 | The return tuple contains
170 | p_labels: a list of predicted labels
171 | p_acc: a tuple including accuracy (for classification), mean-squared
172 | error, and squared correlation coefficient (for regression).
173 | p_vals: a list of decision values or probability estimates (if '-b 1'
174 | is specified). If k is the number of classes, for decision values,
175 | each element includes results of predicting k binary-class
176 | SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
177 | is returned. For probabilities, each element contains k values
178 | indicating the probability that the testing instance is in each class.
179 | Note that the order of classes here is the same as 'model.label'
180 | field in the model structure.
181 | """
182 |
183 | def info(s):
184 | print(s)
185 |
186 | if scipy and isinstance(x, scipy.ndarray):
187 | x = scipy.ascontiguousarray(x) # enforce row-major
188 | elif sparse and isinstance(x, sparse.spmatrix):
189 | x = x.tocsr()
190 | elif not isinstance(x, (list, tuple)):
191 | raise TypeError("type of x: {0} is not supported!".format(type(x)))
192 |
193 | if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
194 | raise TypeError("type of y: {0} is not supported!".format(type(y)))
195 |
196 | predict_probability = 0
197 | argv = options.split()
198 | i = 0
199 | while i < len(argv):
200 | if argv[i] == '-b':
201 | i += 1
202 | predict_probability = int(argv[i])
203 | elif argv[i] == '-q':
204 | info = print_null
205 | else:
206 | raise ValueError("Wrong options")
207 | i+=1
208 |
209 | solver_type = m.param.solver_type
210 | nr_class = m.get_nr_class()
211 | nr_feature = m.get_nr_feature()
212 | is_prob_model = m.is_probability_model()
213 | bias = m.bias
214 | if bias >= 0:
215 | biasterm = feature_node(nr_feature+1, bias)
216 | else:
217 | biasterm = feature_node(-1, bias)
218 | pred_labels = []
219 | pred_values = []
220 |
221 | if scipy and isinstance(x, sparse.spmatrix):
222 | nr_instance = x.shape[0]
223 | else:
224 | nr_instance = len(x)
225 |
226 | if predict_probability:
227 | if not is_prob_model:
228 | raise TypeError('probability output is only supported for logistic regression')
229 | prob_estimates = (c_double * nr_class)()
230 | for i in range(nr_instance):
231 | if scipy and isinstance(x, sparse.spmatrix):
232 | indslice = slice(x.indptr[i], x.indptr[i+1])
233 | xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
234 | else:
235 | xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
236 | xi[-2] = biasterm
237 | label = liblinear.predict_probability(m, xi, prob_estimates)
238 | values = prob_estimates[:nr_class]
239 | pred_labels += [label]
240 | pred_values += [values]
241 | else:
242 | if nr_class <= 2:
243 | nr_classifier = 1
244 | else:
245 | nr_classifier = nr_class
246 | dec_values = (c_double * nr_classifier)()
247 | for i in range(nr_instance):
248 | if scipy and isinstance(x, sparse.spmatrix):
249 | indslice = slice(x.indptr[i], x.indptr[i+1])
250 | xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
251 | else:
252 | xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
253 | xi[-2] = biasterm
254 | label = liblinear.predict_values(m, xi, dec_values)
255 | values = dec_values[:nr_classifier]
256 | pred_labels += [label]
257 | pred_values += [values]
258 |
259 | if len(y) == 0:
260 | y = [0] * nr_instance
261 | ACC, MSE, SCC = evaluations(y, pred_labels)
262 |
263 | if m.is_regression_model():
264 | info("Mean squared error = %g (regression)" % MSE)
265 | info("Squared correlation coefficient = %g (regression)" % SCC)
266 | else:
267 | info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))
268 |
269 | return pred_labels, (ACC, MSE, SCC), pred_values
270 |
--------------------------------------------------------------------------------
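Note: a minimal round-trip sketch of the wrapper above on made-up data; it assumes `python/core` is on `sys.path` (as `train.py` arranges) so that `liblinear_multicore` resolves.

from liblinearutil import train, predict, save_model, load_model

y = [1, -1, 1, -1]                                 # toy labels
x = [{1: 1.0, 2: 0.5}, {1: -1.0, 2: 0.2},          # sparse feature dicts (index -> value)
     {1: 0.8, 2: 0.7}, {1: -0.9, 2: 0.1}]

m = train(y, x, "-s 2 -c 1 -B 1 -q")               # primal L2-loss SVC, quiet
p_labels, p_acc, p_vals = predict(y, x, m)         # accuracy is printed and returned in p_acc[0]

save_model("toy.model", m)                         # plain-text LIBLINEAR model file
m2 = load_model("toy.model")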
/python/core/train_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from pathlib import Path
3 |
4 | from utils.report import Report
5 |
6 | from sklearn.preprocessing import LabelEncoder
7 | from liblinearutil import *
8 |
9 | class Train():
10 |
11 | def __init__(self, jobs_number, solver_name, solver_value, c_value, model_name):
12 | self.jobs_number = jobs_number
13 | self.solver_name = solver_name
14 | self.solver_value = solver_value
15 | self.c_value = c_value
16 | self.model_name = model_name
17 | self.best_perfomance = OrderedDict()
18 | self.best_perfomance['Solver name'] = solver_name
19 | self.best_perfomance['Solver value'] = solver_value
20 | self.best_perfomance['C value'] = c_value
21 | self.report = None
22 |
23 | def save_best_perfomance(self, output_dir):
24 | model_name = Path(self.model_name).stem
25 | with open(f"{output_dir}/{model_name}_info", 'w') as bpf:
26 | for value in self.best_perfomance.keys():
27 | bpf.write(f"{value}: {self.best_perfomance[value]}\n")
28 | bpf.write(f"Accuracy score: {self.report.get_accuracy_score()}\n")
29 |             bpf.write("Performance on test set:\n")
30 | bpf.write(self.report.get_report())
31 | bpf.close()
32 |
33 | def train_model(self, X_train, X_test, y_train, y_test):
34 | le = LabelEncoder()
35 | le.fit(y_train)
36 | y_train = le.transform(y_train)
37 | y_test = le.transform(y_test)
38 |
39 | if self.solver_value == 4 or self.solver_value == 7:
40 | parameters = "-s {} -c {} -B 1 -q".format(self.solver_value, self.c_value)
41 | else:
42 | parameters = "-s {} -n {} -c {} -B 1 -q".format(self.solver_value, self.jobs_number, self.c_value)
43 | param = parameter(parameters)
44 | prob = problem(y_train, X_train)
45 | model = train(prob, param)
46 |
47 | p_label, p_acc, p_val = predict(y_test, X_test, model)
48 |
49 | #Convert predicted value from float to int
50 | y_pred = [int(label) for label in p_label]
51 | y_test = le.inverse_transform(y_test)
52 | y_pred = le.inverse_transform(y_pred)
53 |
54 | self.report = Report(y_test, y_pred)
55 |
56 | save_model(f"{self.model_name}", model)
57 | return y_pred
--------------------------------------------------------------------------------
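Note: a brief sketch of how `Train` is driven, mirroring the call in `train.py`. The random matrices below are stand-ins for the feature vectors produced by the Java extractor (`CsvUtils.from_csv` returns the real ones), and `python/core` with its subpackages is assumed to be on `sys.path`.

import numpy as np
from train_model import Train

X_train = np.random.rand(40, 6)
y_train = np.array(["positive", "negative"] * 20)
X_test = np.random.rand(10, 6)
y_test = np.array(["positive", "negative"] * 5)

trainer = Train(jobs_number=2,
                solver_name="L2-regularized L2-loss support vector classification (primal)",
                solver_value=2, c_value=1, model_name="Senti4SD.model")
y_pred = trainer.train_model(X_train, X_test, y_train, y_test)   # also writes Senti4SD.model
trainer.save_best_perfomance(".")                                # writes ./Senti4SD_info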
/python/core/tuning_parameter.py:
--------------------------------------------------------------------------------
1 | from time import time, gmtime, strftime
2 | from collections import OrderedDict
3 | from pathlib import Path
4 |
5 | from sklearn.preprocessing import LabelEncoder
6 | from sklearn.metrics import accuracy_score
7 | from liblinearutil import *
8 |
9 | class Tuning():
10 |
11 | def __init__(self, jobs_number, solvers_value_file, output_dir):
12 | self.solvers = OrderedDict()
13 | self.solvers["L2-regularized logistic regression (primal)"] = 0
14 | self.solvers["L2-regularized L2-loss support vector classification (dual)"] = 1
15 | self.solvers["L2-regularized L2-loss support vector classification (primal)"] = 2
16 | self.solvers["L2-regularized L1-loss support vector classification (dual)"] = 3
17 | self.solvers["support vector classification by Crammer and Singer"] = 4
18 | self.solvers["L1-regularized L2-loss support vector classification"] = 5
19 | self.solvers["L1-regularized logistic regression"] = 6
20 | self.solvers["L2-regularized logistic regression (dual)"] = 7
21 | self.C_VALUE = [0.01, 0.05, 0.10, 0.20, 0.25, 0.50, 1, 2, 4, 8]
22 | self.output_dir = output_dir
23 | if solvers_value_file is None:
24 | self.__write_solvers_value()
25 | self.jobs_number = jobs_number
26 |         self.__load_solvers_value(solvers_value_file or f"{self.output_dir}/liblinear_solver")  # fall back to the default list written above when no file is given
27 | self.best_perfomance = OrderedDict()
28 |
29 | def __write_solvers_value(self):
30 | with open(f"{self.output_dir}/liblinear_solver", 'w') as sf:
31 | for value in self.solvers.keys():
32 | sf.write(f"{value}\n")
33 | sf.close()
34 |
35 | def __load_solvers_value(self, solvers_value_file):
36 | solvers_value_file = Path(solvers_value_file)
37 | if not solvers_value_file.exists():
38 | with solvers_value_file.open('w', encoding='utf-8') as sf:
39 | for key in self.solvers.keys():
40 | sf.write(f"{key}\n")
41 | sf.close()
42 | with open(solvers_value_file, 'r') as sf:
43 | lines = []
44 | for line in sf:
45 | line = line.rstrip('\n')
46 | print(line)
47 | lines.append(line)
48 | sf.close()
49 |         for key in list(self.solvers.keys()):  # copy keys: deleting from a dict while iterating its view raises RuntimeError
50 | if key not in lines:
51 | del self.solvers[key]
52 |
53 | def __encode_label(self, y_train, y_test):
54 | le = LabelEncoder()
55 | le.fit(y_train)
56 | y_train = le.transform(y_train)
57 | y_test = le.transform(y_test)
58 | return y_train, y_test
59 |
60 | def __create_perfomance_file(self, perfomance_dict):
61 | with open(f"{self.output_dir}/{perfomance_dict['Solver name']}", 'w') as sf:
62 | for value in perfomance_dict.keys():
63 | sf.write(f"{value}: {perfomance_dict[value]}\n")
64 | sf.close()
65 |
66 | def __train_and_predict(self, X_train, X_test, y_train, y_test, solver_value, c_value):
67 | if solver_value == 4 or solver_value == 7:
68 | parameters = "-s {} -c {} -B 1 -q".format(solver_value, c_value)
69 | else:
70 | parameters = "-s {} -n {} -c {} -B 1 -q".format(solver_value, self.jobs_number, c_value)
71 | param = parameter(parameters)
72 |
73 | model = train(self.prob, param)
74 |
75 | p_label, p_acc, p_val = predict(y_test, X_test, model)
76 |
77 | #Convert predicted value from float to int
78 | y_pred = [int(label) for label in p_label]
79 |
80 | accuracy = accuracy_score(y_test, y_pred)
81 |
82 | return accuracy
83 |
84 |
85 | def tuning_parameter(self, X_train, X_test, y_train, y_test):
86 | y_train, y_test = self.__encode_label(y_train, y_test)
87 | self.prob = problem(y_train, X_train)
88 |
89 | self.scores_list = []
90 |
91 | cv_accuracy = 0
92 |
93 | best_solver_name = ""
94 | best_cv_accuracy = 0
95 | best_c_value = 0
96 | best_s_value = 0
97 |
98 | current_cv_accuracy = 0
99 | current_c_value = 0
100 |
101 | for solver_name, solver_value in self.solvers.items():
102 | print(f"Tuning solver {solver_name}")
103 | time_start = time()
104 | for c_value in self.C_VALUE:
105 | print(f"C value: {c_value}")
106 | if solver_value == 4 or solver_value == 7:
107 | parameters = "-s {} -c {} -v 10 -B 1 -q".format(solver_value, c_value)
108 | else:
109 | parameters = "-s {} -n {} -c {} -v 10 -B 1 -q".format(solver_value, self.jobs_number, c_value)
110 | param = parameter(parameters)
111 | cv_accuracy = train(self.prob, param)
112 | if cv_accuracy > best_cv_accuracy:
113 | best_c_value = c_value
114 | best_cv_accuracy = cv_accuracy
115 | best_s_value = solver_value
116 | best_solver_name = solver_name
117 | if cv_accuracy > current_cv_accuracy:
118 | current_cv_accuracy = cv_accuracy
119 | current_c_value = c_value
120 | tuning_time = time() - time_start
121 | tuning_time = strftime("%H:%M:%S", gmtime(tuning_time))
122 |
123 | #Training current model for testing
124 | accuracy = self.__train_and_predict(X_train, X_test, y_train, y_test, solver_value, current_c_value)
125 | perfomance_dict = OrderedDict()
126 | perfomance_dict["Solver name"] = solver_name
127 | perfomance_dict["Best C value"] = current_c_value
128 | perfomance_dict["Tuning time"] = tuning_time
129 | perfomance_dict["Accuracy"] = accuracy
130 | self.__create_perfomance_file(perfomance_dict)
131 | current_cv_accuracy = 0
132 | current_c_value = 0
133 | print("\n")
134 |
135 | #training_time, test_time, accuracy = self.__train_and_predict(X_train, X_test, y_train, y_test, best_s_value, best_c_value)
136 | #self.best_perfomance = OrderedDict()
137 | #self.best_perfomance["Solver name"] = best_solver_name
138 | #self.best_perfomance["C value"] = best_c_value
139 | #self.best_perfomance["Tuning time"] = tuning_time
140 | #self.best_perfomance["Training time"] = training_time
141 | #self.best_perfomance["Test time"] = test_time
142 | #self.best_perfomance["Accuracy"] = accuracy
143 |
144 | return best_solver_name, best_s_value, best_c_value
145 |
--------------------------------------------------------------------------------
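Note: a compact sketch of the tuning entry point. The file and directory names mirror the ones `train.py` uses but are otherwise arbitrary, and the toy arrays stand in for the extracted features; `tuning_parameter` runs 10-fold cross-validation over every solver/C pair, writes a per-solver report into `output_dir`, and returns the best combination.

from pathlib import Path
import numpy as np
from tuning_parameter import Tuning

X_train = np.random.rand(40, 6)
y_train = np.array(["positive", "negative"] * 20)
X_test = np.random.rand(10, 6)
y_test = np.array(["positive", "negative"] * 5)

out_dir = Path("liblinear_perfomance")     # per-solver report files land here
out_dir.mkdir(exist_ok=True)
tuning = Tuning(jobs_number=2, solvers_value_file="liblinear_solvers", output_dir=out_dir)
best_name, best_s, best_c = tuning.tuning_parameter(X_train, X_test, y_train, y_test)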
/python/core/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/utils/__init__.py
--------------------------------------------------------------------------------
/python/core/utils/core_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | class CoreUtils():
4 |
5 | @staticmethod
6 | def check_jobs_number(jobs_number):
7 | max_jobs = os.cpu_count()
8 | if jobs_number > max_jobs:
9 | jobs_number = max_jobs
10 | elif jobs_number < 0:
11 | if jobs_number == -1:
12 | jobs_number = max_jobs
13 | elif jobs_number > -(max_jobs - 1):
14 | jobs_number = max_jobs + jobs_number
15 | else:
16 | jobs_number = 1
17 | return jobs_number
18 |
--------------------------------------------------------------------------------
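Note: a few illustrative calls clarifying the clamping rules above; the return values in the comments assume a hypothetical machine where `os.cpu_count()` is 8.

from core_utils import CoreUtils

CoreUtils.check_jobs_number(4)     # -> 4  (within range, unchanged)
CoreUtils.check_jobs_number(32)    # -> 8  (capped at the core count)
CoreUtils.check_jobs_number(-1)    # -> 8  (-1 means "use every core")
CoreUtils.check_jobs_number(-2)    # -> 6  (counted back from the core count)
CoreUtils.check_jobs_number(-20)   # -> 1  (too negative falls back to a single job)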
/python/core/utils/csv_formatter.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from collections import OrderedDict
3 |
4 | from csv_utils import CsvUtils
5 |
6 | class CsvFormatter():
7 |
8 | def __init__(self, header_list, csv_delimiter, header = False):
9 | self.header_list = header_list
10 | self.header = header
11 | self.csv_delimiter = csv_delimiter
12 |
13 | def get_rows(self, input_csv):
14 | with open(input_csv, 'r+', newline = '', encoding='utf8') as csv_file:
15 | header_list_copy = self.header_list.copy()
16 | csv_file.seek(0)
17 | csv_file_reader = csv.reader(csv_file, delimiter = self.csv_delimiter)
18 | header = next(csv_file_reader)
19 | rows = OrderedDict()
20 | if len(header) == 0:
21 | csv_file.close()
22 | raise IOError("{} is empty.".format(input_csv))
23 | elif len(self.header_list) <= len(header) :
24 | count = 0
25 | while len(header_list_copy) != 0:
26 | for i in range(0, len(header)):
27 | if header[i].lower().strip() == header_list_copy[0].lower().strip():
28 | rows.update({header_list_copy[0]: [row[i] for row in csv_file_reader]})
29 | count += 1
30 | break
31 | header_list_copy.pop(0)
32 | csv_file.seek(0)
33 | next(csv_file_reader)
34 | if count != len(self.header_list):
35 | csv_file.close()
36 | raise IOError("{} not found in {}".format(header_list_copy, input_csv))
37 | else:
38 | csv_file.close()
39 |                 raise IOError("Too many headers in the list.")
40 | csv_file.close()
41 | return rows
42 |
43 | def write(self, data, output_csv):
44 | CsvUtils.write_to_csv(data, output_csv, self.csv_delimiter)
45 |
--------------------------------------------------------------------------------
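Note: a small usage sketch of `CsvFormatter` (file names hypothetical, `python/core/utils` assumed on `sys.path` as `csv_processing.py` arranges); it extracts the requested columns, matching headers case-insensitively, and writes them back out without a header row.

from csv_formatter import CsvFormatter

# input.csv is expected to contain at least "Text" and "Polarity" columns (any letter case).
formatter = CsvFormatter(["text", "polarity"], ',')
rows = formatter.get_rows("input.csv")     # OrderedDict: {"text": [...], "polarity": [...]}
formatter.write(rows, "input_jar.csv")     # column values only, no header row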
/python/core/utils/csv_utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from multiprocessing import Pool
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | class CsvUtils():
9 |
10 | @staticmethod
11 | def __check_file_existence(file_path):
12 | if not os.path.isfile(file_path):
13 | return False
14 | else:
15 | return True
16 |
17 | @staticmethod
18 | def __check_file_extension(file_path, allowed_extension):
19 | extension = os.path.splitext(file_path)[1]
20 | if extension not in allowed_extension:
21 | return False
22 | else:
23 | return True
24 |
25 | @staticmethod
26 | def check_csv(csv_path):
27 | if not CsvUtils.__check_file_existence(csv_path):
28 | raise OSError ("FILE NOT FOUND : {} wasn't found".format(csv_path))
29 | if not CsvUtils.__check_file_extension(csv_path, ['.csv']):
30 | raise OSError("WRONG FILE EXTENSION : {} wasn't a csv file.".format(csv_path))
31 |
32 | @staticmethod
33 | def convert_lines(rows):
34 | X = np.array([])
35 | y = np.array([])
36 | first = True
37 | for i in range(0, len(rows)):
38 | values = rows[i].split(',')
39 | splitted_row_features = [float(value) for value in values[1:-2]]
40 | splitted_row_label = values[-1].rstrip('\n')
41 | if first:
42 | X = np.array(splitted_row_features)
43 | y = np.array(splitted_row_label)
44 | first = False
45 | else:
46 | X = np.append(X, np.array(splitted_row_features))
47 | y = np.append(y, np.array(splitted_row_label))
48 | return X.reshape((i+1, len(splitted_row_features))), y
49 |
50 | @staticmethod
51 | def from_csv(csv_file, chunk_size, jobs_number):
52 | stop = False
53 | rows = []
54 | chunk_size = int(chunk_size / jobs_number)
55 |         with open(csv_file, 'r+') as csv_in:  # 'csv_in' avoids shadowing the imported csv module
56 |             next(csv_in)
57 | while not stop:
58 | read_rows = []
59 | try:
60 | for _ in range(jobs_number):
61 | temp_rows = []
62 | for _ in range (chunk_size):
63 |                             temp_rows.append(next(csv_in))
64 | read_rows.append(temp_rows)
65 | except StopIteration:
66 | stop = True
67 | read_rows.append(temp_rows)
68 | finally:
69 | if len(temp_rows) != 0:
70 | with Pool(jobs_number) as p:
71 | results = p.map(CsvUtils.convert_lines, read_rows)
72 | for result in results:
73 | rows.append(result)
74 |             csv_in.close()
75 | first = True
76 | for row in rows:
77 | if first:
78 | X = row[0]
79 | y = row[1]
80 | first = False
81 | else:
82 | X = np.concatenate((X, row[0]))
83 | y = np.concatenate((y, row[1]))
84 | return X, y
85 |
86 | @staticmethod
87 | def write_to_csv(data, output_csv, csv_delimiter, print_header = False, mode = 'w+'):
88 | with open(output_csv, mode, newline = '', encoding='utf8') as csv_file:
89 | csv_file_writer = csv.writer(csv_file, delimiter = csv_delimiter)
90 | if print_header == True:
91 | header = data.keys()
92 | csv_file_writer.writerow(header)
93 | data = zip(*data.values())
94 | csv_file_writer.writerows(data)
95 | csv_file.close()
96 |
97 | @staticmethod
98 | def order_csv(input_csv, column_name):
99 | #csv_delimiter = CsvUtils.find_csv_delimiter(input_csv)
100 | temp = pd.read_csv(input_csv, delimiter = ',')
101 | temp = temp.sort_values(by=[column_name])
102 | temp.to_csv(input_csv, index = False)
103 |
--------------------------------------------------------------------------------
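Note: a short sketch of the two `CsvUtils` entry points the pipeline relies on (file names hypothetical); `write_to_csv` turns a dict of columns into csv rows, and `from_csv` reads an extracted-features csv back into feature/label arrays in parallel chunks.

from collections import OrderedDict
from csv_utils import CsvUtils

data = OrderedDict([("text", ["great tool!", "awful docs"]),
                    ("polarity", ["positive", "negative"])])
CsvUtils.write_to_csv(data, "out.csv", ',', print_header=True)   # columns become csv rows

# Reading an extracted-features csv (numeric features, label in the last column):
# X, y = CsvUtils.from_csv("extractedFeatures.csv", chunk_size=1000, jobs_number=4)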
/python/core/utils/report.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
3 |
4 |
5 | class Report():
6 |
7 | def __init__(self, y_true, y_pred):
8 | self.y_true = y_true
9 | self.y_pred = y_pred
10 |
11 | def print_report(self):
12 | print(classification_report(self.y_true, self.y_pred))
13 |
14 | def get_report(self):
15 | return classification_report(self.y_true, self.y_pred)
16 |
17 | def get_micro_score(self):
18 | return precision_recall_fscore_support(self.y_true, self.y_pred, average='micro')
19 |
20 | def get_macro_score(self):
21 | return precision_recall_fscore_support(self.y_true, self.y_pred, average='macro')
22 |
23 | def get_accuracy_score(self):
24 | return accuracy_score(self.y_true, self.y_pred)
25 |
26 | def get_classes_score(self):
27 |         unique = sorted(set(self.y_pred))  # unique predicted labels, in sorted order
28 | scores = precision_recall_fscore_support(self.y_true, self.y_pred, average=None, labels=unique)
29 | scores_dict = OrderedDict()
30 | for value in unique:
31 | scores_dict.update({value: []})
32 | for i in range(len(scores) - 1):
33 | for j in range(len(unique)):
34 | scores_dict[unique[j]].append(scores[i][j])
35 | scores_dict.update({"support": scores[-1]})
36 | return scores_dict
--------------------------------------------------------------------------------
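Note: a minimal sketch of `Report` on made-up labels (assuming `python/core/utils` is on `sys.path`).

from report import Report

y_true = ["positive", "negative", "negative", "positive"]
y_pred = ["positive", "negative", "positive", "positive"]

report = Report(y_true, y_pred)
report.print_report()                  # full sklearn classification report
print(report.get_accuracy_score())     # 0.75
print(report.get_macro_score())        # macro-averaged (precision, recall, f1, None)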
/python/csv_processing.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 | import sys
5 | import os
6 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
7 |
8 | from core.utils.csv_formatter import CsvFormatter
9 | from core.utils.csv_utils import CsvUtils
10 |
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
12 |
13 | def main():
14 | parser = argparse.ArgumentParser(description = "Csv file processing")
15 | parser.add_argument('-i',
16 | '--input',
17 | help = "path to csv file",
18 | type = str,
19 | required = True)
20 | parser.add_argument('-d',
21 | '--delimiter',
22 | help = 'csv delimiter, use c for comma and sc for semicolon',
23 | type = str,
24 | default = 'c')
25 | parser.add_argument('-c',
26 | '--columns',
27 |                         help = "column or columns to extract from the csv file",
28 | type = str,
29 | action = 'append',
30 | required = True)
31 | args = parser.parse_args()
32 | input_csv = args.input
33 | input_csv = Path(input_csv).resolve()
34 | output_csv = "{}/{}_jar.csv".format(input_csv.parent, input_csv.name.split('.')[0])
35 | try:
36 | CsvUtils.check_csv(input_csv)
37 | logging.info("Start formatting csv file")
38 | try:
39 | if(args.delimiter == 'c'):
40 | csvFormatter = CsvFormatter(args.columns, ',')
41 | elif(args.delimiter == 'sc'):
42 | csvFormatter = CsvFormatter(args.columns, ';')
43 | else:
44 | logging.error('Wrong csv delimiter. Use "c" for comma and "sc" for semicolon.')
45 | sys.exit(1)
46 | data = csvFormatter.get_rows(input_csv)
47 | csvFormatter.write(data, output_csv)
48 | except IOError as e:
49 | logging.error(e)
50 | sys.exit(1)
51 | logging.info("End formatting csv file")
52 | except OSError as e:
53 | logging.error(e)
54 | sys.exit(1)
55 |
56 | if __name__ == '__main__':
57 | main()
58 |
--------------------------------------------------------------------------------
/python/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core'))
4 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/liblinear_multicore'))
6 |
7 | import logging
8 | import argparse
9 | from pathlib import Path
10 |
11 | import numpy as np
12 | from sklearn.model_selection import train_test_split
13 |
14 | from core.tuning_parameter import Tuning
15 | from core.train_model import Train
16 | from core.utils.csv_utils import CsvUtils
17 | from core.utils.core_utils import CoreUtils
18 |
19 |
20 | logging.basicConfig(level = logging.INFO, format = "[%(levelname)s] %(asctime)s - %(message)s")
21 |
22 |
23 | def main():
24 | parser = argparse.ArgumentParser(description = "Hyperparameter tuning")
25 | parser.add_argument('-i',
26 | '--input',
27 | help = "path to train set and to test set csv.",
28 | type = str,
29 | action = 'append',
30 | required = True)
31 | parser.add_argument('-c',
32 | '--chunk-size',
33 | help = 'chunk size --default = 1000',
34 | type = int,
35 | default = 1000)
36 | parser.add_argument('-j',
37 | '--jobs-number',
38 | help = 'number of jobs',
39 | type = int,
40 | default = 1)
41 | parser.add_argument('-m',
42 | '--model',
43 | help = 'model file name',
44 | type = str,
45 | default = 'Senti4SD')
46 | args = parser.parse_args()
47 |
48 |     seed = 42
49 |
50 | jobs_number = CoreUtils.check_jobs_number(args.jobs_number)
51 |
52 | if len(args.input) == 1:
53 |
54 | train_file_path = Path(args.input[0]).resolve()
55 |
56 | # Check file existence in advance to avoid missing test set
57 | try:
58 | CsvUtils.check_csv(train_file_path)
59 | except OSError as e:
60 | print(e)
61 | sys.exit(1)
62 |
63 | try:
64 | logging.info("Start reading dataset in chunk...")
65 | X, y = CsvUtils.from_csv(train_file_path, args.chunk_size, jobs_number)
66 | logging.info("End reading dataset in chunk...")
67 | except OSError as e:
68 | print(e)
69 | sys.exit(1)
70 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, stratify = y, random_state = seed)
71 | del X, y
72 | elif len(args.input) == 2:
73 |
74 | train_file_path = Path(args.input[0]).resolve()
75 | test_file_path = Path(args.input[1]).resolve()
76 |
77 | #Check file existence in advance to avoid missing test set
78 | try:
79 | CsvUtils.check_csv(train_file_path)
80 | CsvUtils.check_csv(test_file_path)
81 | except OSError as e:
82 | print(e)
83 | sys.exit(1)
84 |
85 | #read the train set in chunk
86 | logging.info("Start reading training set in chunk...")
87 | X_train, y_train = CsvUtils.from_csv(train_file_path, args.chunk_size, jobs_number)
88 | logging.info("End reading training set in chunk...")
89 | logging.info("Start reading test set in chunk...")
90 | X_test, y_test = CsvUtils.from_csv(test_file_path, args.chunk_size, jobs_number)
91 | logging.info("End reading test set in chunk...")
92 |
93 | else:
94 |         sys.exit("Too many input arguments.")
95 |
96 | #create path
97 | output_path = Path('liblinear_perfomance')
98 | output_path.mkdir(parents=True, exist_ok=True)
99 | output_path = output_path.resolve()
100 | dir_path = output_path.parent
101 | model_path = f"{dir_path}/{args.model}.model"
102 |
103 | logging.info("Start parameter tuning")
104 | current_path = Path.cwd()
105 | solvers_path = Path(f'{current_path}/liblinear_solvers').resolve()
106 | tuning = Tuning(jobs_number, solvers_path, output_path)
107 | best_solver_name, best_solver_value, best_c_value = tuning.tuning_parameter(X_train, X_test, y_train, y_test)
108 | logging.info("End parameter tuning")
109 |
110 | logging.info("Start training model")
111 | train = Train(jobs_number, best_solver_name, best_solver_value, best_c_value, model_path)
112 | train.train_model(X_train, X_test, y_train, y_test)
113 | train.save_best_perfomance(dir_path)
114 | logging.info("End training model")
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
120 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib==1.2.0
2 | llvmlite==0.28.0
3 | numba==0.43.1
4 | numpy==1.22.0
5 | pandas==0.24.2
6 | python-dateutil==2.8.0
7 | pytz==2019.1
8 | scikit-learn==1.5.0
9 | scipy==1.10.0
10 | six==1.12.0
11 |
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname "$0")
4 |
5 | csvDelimiter='c'
6 | features='A'
7 | grams=false
8 | chunkSize=1000
9 | jobsNumber=1
10 | modelFile="$SCRIPTDIR/Senti4SD"
11 |
12 | help(){
13 | echo "Usage-1: sh train.sh -i train.csv [-d csv-delimiter] [-F features] [-g] [-c chunk_size] [-j jobs_number] [-o Senti4SD.model]"
14 | echo "or"
15 | echo "Usage-2: sh train.sh -i train.csv -i test.csv [-d csv-delimiter] [-g] [-c chunk_size] [-j jobs_number] [-o Senti4SD.model]"
16 |     echo "-i -- the input file containing the corpus for training; it is possible to run the script with two separate datasets, one for training and the other for testing [see Usage-2]. [required]"
17 |     echo '-d -- the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]'
18 |     echo '-F -- the features to be considered. A stands for all, L stands for lexicon features, S stands for semantic features and K stands for keyword features. [Default value: A]'
19 |     echo '-g -- enables the extraction of n-grams (i.e., bigrams and unigrams)'
20 |     echo "-c -- the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]"
21 |     echo "-j -- the number of cores to use during the csv reading phase. If you pass -1, all cores will be used.
22 |           If you pass a number higher than your total core count, the script will use all cores. [Default value: 1]"
23 |     echo "-o -- the name of the trained model. [Default value: 'Senti4SD.model']"
24 | exit 1
25 | }
26 |
27 | NUMARGS=$#
28 | if [ $NUMARGS -eq 0 ]; then
29 | help
30 | exit 1
31 | fi
32 |
33 | while getopts "hi:d:F:m:c:j:o:g" OPTIONS; do
34 | case $OPTIONS in
35 | h)
36 | help
37 | ;;
38 | i)
39 | inputFiles+=($OPTARG)
40 | ;;
41 | d)
42 | csvDelimiter=$OPTARG
43 | ;;
44 | F)
45 | features=$OPTARG
46 | ;;
47 | g)
48 | grams=true
49 | ;;
50 | c)
51 | chunkSize=$OPTARG
52 | ;;
53 | j)
54 | jobsNumber=$OPTARG
55 | ;;
56 |         m|o)
57 | modelFile="$SCRIPTDIR/$OPTARG"
58 | ;;
59 | \?)
60 | echo -e \\n"Option $OPTARG not allowed."
61 | help
62 | ;;
63 | esac
64 | done
65 |
66 | INPUTFILESLENGTH=${#inputFiles[@]}
67 | echo $INPUTFILESLENGTH
68 |
69 | if [ $INPUTFILESLENGTH -lt 1 ]; then
70 | echo "Train data file is required!"
71 | exit 1
72 | else
73 | if [ $INPUTFILESLENGTH -gt 2 ]; then
74 |         echo "Too many input files!"
75 | exit 1
76 | else
77 | if [ $INPUTFILESLENGTH -eq 1 ]; then
78 |
79 | mkdir -p $SCRIPTDIR/temp_features;
80 |
81 | inputFile=$inputFiles
82 |
83 | python $SCRIPTDIR/python/csv_processing.py -i $inputFile -d $csvDelimiter -c text -c polarity
84 |
85 | IFS='.' read -ra FILENAMESPLIT <<< "$inputFile"
86 | jarInputFile="${FILENAMESPLIT[0]}_jar.csv"
87 |
88 | echo $jarInputFile
89 |
90 |             #-F A: all features to be considered
91 |             #-i file_name: a file containing a document on every line
92 |             #-W cbow600.bin: DSM to be loaded
93 |             #-oc file_name.csv: output dataset containing the extracted features
94 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
95 |             #-L: if present, the corpus has a label column [optional]
96 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
97 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
98 |
99 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600 -L
100 |
101 | python $SCRIPTDIR/python/train.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -c $chunkSize -j $jobsNumber -m $modelFile
102 |
103 | rm -rf $SCRIPTDIR/temp_features
104 | rm $jarInputFile
105 | else
106 |
107 | for file in ${inputFiles[@]}; do
108 | if [ ! -f $file ]; then
109 | echo "File $file not found!"
110 | exit 1
111 | fi
112 | done
113 |
114 | mkdir -p $SCRIPTDIR/temp_features;
115 |
116 | trainFile=${inputFiles[0]}
117 | testFile=${inputFiles[1]}
118 |
119 | python $SCRIPTDIR/python/csv_processing.py -i $trainFile -d $csvDelimiter -c Text -c Polarity
120 | python $SCRIPTDIR/python/csv_processing.py -i $testFile -d $csvDelimiter -c Text -c Polarity
121 |
122 | IFS='.' read -ra FILENAMESPLIT <<< "$trainFile"
123 | jarTrainFile="${FILENAMESPLIT[0]}_jar.csv"
124 |
125 | IFS='.' read -ra FILENAMESPLIT <<< "$testFile"
126 | jarTestFile="${FILENAMESPLIT[0]}_jar.csv"
127 |
128 | echo $jarTrainFile
129 | echo $jarTestFile
130 |
131 | if [ "$grams" = true ] ; then
132 | java -jar $SCRIPTDIR/java/NgramsExtraction.jar $jarTrainFile -L
133 |
134 |
135 |             #-F A: all features to be considered
136 |             #-i file_name: a file containing a document on every line
137 |             #-W cbow600.bin: DSM to be loaded
138 |             #-oc file_name.csv: output dataset containing the extracted features
139 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
140 |             #-L: if present, the corpus has a label column [optional]
141 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
142 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
143 |
144 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTrainFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -vd 600 -L -ul $SCRIPTDIR/UnigramsList -bl $SCRIPTDIR/BigramsList
145 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTestFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -vd 600 -L -ul $SCRIPTDIR/UnigramsList -bl $SCRIPTDIR/BigramsList
146 | else
147 |             #-F A: all features to be considered
148 |             #-i file_name: a file containing a document on every line
149 |             #-W cbow600.bin: DSM to be loaded
150 |             #-oc file_name.csv: output dataset containing the extracted features
151 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
152 |             #-L: if present, the corpus has a label column [optional]
153 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
154 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
155 |
156 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTrainFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -vd 600 -L
157 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTestFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -vd 600 -L
158 | fi
159 |
160 | python $SCRIPTDIR/python/train.py -i $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -i $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -c $chunkSize -j $jobsNumber -m $modelFile
161 |
162 | rm -rf $SCRIPTDIR/temp_features
163 | rm $jarTrainFile
164 | rm $jarTestFile
165 |
166 | fi
167 | fi
168 | fi
169 |
--------------------------------------------------------------------------------