├── .dvc
│   ├── .gitignore
│   └── config
├── .dvcignore
├── .gitattributes
├── .gitignore
├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── pySenti4SD.iml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── Sample.csv
├── Senti4SD.model
├── Senti4SD_info
├── classification.sh
├── java
│   ├── .gitignore
│   ├── NgramsExtraction.jar.dvc
│   ├── Senti4SD-fast.jar.dvc
│   ├── Senti4SD.jar.dvc
│   └── dsm.bin.dvc
├── liblinear_solvers
├── python
│   ├── .gitignore
│   ├── classification_task.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── classification.py
│   │   ├── liblinear_multicore
│   │   │   ├── COPYRIGHT
│   │   │   ├── __init__.py
│   │   │   ├── commonutil.py
│   │   │   ├── liblinear.py
│   │   │   ├── so
│   │   │   │   └── liblinear.so.3
│   │   │   └── windows
│   │   │       └── liblinear.dll
│   │   ├── liblinearutil.py
│   │   ├── train_model.py
│   │   ├── tuning_parameter.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── core_utils.py
│   │       ├── csv_formatter.py
│   │       ├── csv_utils.py
│   │       └── report.py
│   ├── csv_processing.py
│   └── train.py
├── requirements.txt
├── test_stackoverflow.csv
├── train.sh
└── train_stackoverflow.csv
/.dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /config.local
2 | /tmp
3 | /cache
4 |
--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
1 | ['remote "origin"']
2 | url = https://dagshub.com/collab-uniba/pySenti4SD.dvc
3 |
--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
1 | # Add patterns of files dvc should ignore, which could improve
2 | # the performance. Learn more at
3 | # https://dvc.org/doc/user-guide/dvcignore
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.gitattributes
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/misc.xml
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/modules.xml
--------------------------------------------------------------------------------
/.idea/pySenti4SD.iml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/pySenti4SD.iml
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/vcs.xml
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/.idea/workspace.xml
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Collaborative Development Group
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pySenti4SD
2 | Python implementation of Senti4SD. Senti4SD is an emotion polarity classifier specifically trained to support sentiment analysis in developers' communication channels.
3 | Senti4SD is trained and evaluated on a gold standard of over 4K posts extracted from Stack Overflow. It is part of the Collab Emotion Mining Toolkit, ([EMTk](https://github.com/collab-uniba/EMTk)).
4 |
5 | ## Fair Use Policy
6 | Please, cite the following paper if you intend to use our tool for your own research:
7 | > Calefato, F., Lanubile, F., Maiorano, F., Novielli N. (2018) "Sentiment Polarity Detection for Software Development," _Empirical Software Engineering_, 23(3), pp:1352-1382, doi: https://doi.org/10.1007/s10664-017-9546-9. [(BibTeX)](https://scholar.googleusercontent.com/scholar.bib?q=info:2Vtb0Wmx7hEJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAW9gCvJzwrHV1MKhoxzqLaJZA8lPDFxgx&scisf=4&ct=citation&cd=-1&hl=en)
8 |
9 | ## How do I get set up? ##
10 |
11 | ### Installation ###
12 |
13 | **NOTE**: You will need to install [dvc](https://dvc.org) to check out this project. Once installed and initialized, simply run the following:
14 |
15 | ```bash
16 | git clone https://github.com/collab-uniba/pySenti4SD.git
17 | cd pySenti4SD
18 | dvc pull -r origin
19 | ```
20 |
21 | ### Requirements ###
22 |
23 | * dvc
24 | * java 8+
25 | * python 3.7+
26 | * Libraries
27 | * ```numpy, pandas, scipy, scikit-learn, joblib```
28 | * Installation:
29 | ```pip install -r requirements.txt```
30 |
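For reference, the library list above corresponds to a requirements file along these lines (illustrative and unpinned; the ```requirements.txt``` shipped with the repo may pin specific versions):
```text
numpy
pandas
scipy
scikit-learn
joblib
```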
31 |
32 | ## Usage ##
33 | In the following, we first show how to train a new model for polarity classification and then how to test the model on unseen data.
34 | For testing purposes, you can use the Sample.csv input file available in the root of the repo.
35 | ### Train a new classification model ###
36 | ```bash
37 | sh train.sh -i train.csv [-d csv_delimiter] [-g] [-c chunk-size] [-j jobs-number] [-o model-name]
38 | ```
39 | or you can run the script with two separate datasets, one for training and the other for testing:
40 | ```bash
41 | sh train.sh -i train.csv -i test.csv [-d csv_delimiter] [-g] [-c chunk-size] [-j jobs-number] [-o model-name]
42 | ```
43 |
44 | where
45 | * ```-i dataset.csv```: a file containing the data used to train the classification model.
46 | The dataset must contain at least the following two columns, in any order:
47 | ```text
48 | Text;Polarity
49 | …
50 | """@DrabJay: excellent suggestion! Code changed. :-)""";positive
51 | """@IgnacioOcampo, I gave up after a while I am afraid :(""";negative
52 | …
53 | ```
54 | The same format applies when a separate test set is provided.
55 | * ```-d csv-delimiter```: the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]
56 | * ```-F features```: the features to consider, where A stands for all, L for lexicon features, S for semantic features, and K for keyword features. [Default value: A]
57 | * ```-g```: enables the extraction of n-grams (i.e., unigrams and bigrams). [optional]
58 | * ```-c chunk-size```: the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]
59 | * ```-j jobs-number```: the number of cores to use during the csv reading phase. Pass -1 to use all available cores;
60 | any value higher than the number of available cores is capped to that number. [Default value: 1]
61 | * ```-o model-name```: the name of the trained model. [Default value: "Senti4SD"]
62 |
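For example, a full training run on the Stack Overflow datasets shipped with the repo might look like this (the flag values below are purely illustrative):
```bash
sh train.sh -i train_stackoverflow.csv -i test_stackoverflow.csv -g -j -1 -o MySenti4SD
```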
63 | As a result, the script will generate the following output files:
64 | * ```liblinear_perfomance/```: a subfolder containing the performance of all liblinear solvers on the given test set
65 | * ```UnigramsList``` and ```BigramsList``` files: generated when n-gram extraction is enabled
66 | * ```Model-name.model```: the trained classification model
67 | * ```Model-name_info```: a file with information about the trained classification model
68 |
69 | ### Classification task ###
70 | ```bash
71 | sh classification.sh -i dataset.csv [-d csv_delimiter] [-F features] [-g] [-t] [-m model-name] [-c chunk-size] [-j jobs-number] [-o predictions.csv]
72 | ```
73 |
74 | where
75 | * ```-i dataset.csv```: a file containing the documents to classify.
76 | The dataset must contain at least the following column:
77 | ```text
78 | Text
79 | …
80 | """@DrabJay: excellent suggestion! Code changed. :-)"""
81 | """@IgnacioOcampo, I gave up after a while I am afraid :("""
82 | …
83 | ```
84 | If the dataset contains a column named ID, its values will also be saved in the predictions.csv file.
85 | * ```-d csv-delimiter```: the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]
86 | * ```-F features```: the features to consider, where A stands for all, L for lexicon features, S for semantic features, and K for keyword features. [Default value: A]
87 | * ```-g```: enables the use of custom UnigramsList and BigramsList files. [optional]
88 | * ```-t```: saves the input documents along with the predicted labels inside the "predictions.csv" file. [optional]
89 | * ```-m model-name```: the name of the classification model used to classify the documents. [Default value: "Senti4SD"]
90 | * ```-c chunk-size```: the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]
91 | * ```-j jobs-number```: the number of cores to use during the csv reading phase. Pass -1 to use all available cores;
92 | any value higher than the number of available cores is capped to that number. [Default value: 1]
93 | * ```-o prediction-file-name```: the name of the csv file where the model predictions are saved. [Default value: "predictions.csv"]
94 |
95 | As a result, the script will create a ```prediction-file-name.csv``` file inside the ```predictions``` folder, containing:
96 | ```text
97 | Polarity
98 | …
99 | positive
100 | negative
101 | …
102 | ```
103 | If, for example, the input dataset contains a column named "ID" and the ```-t``` parameter is used, the ```prediction-file-name.csv``` will look like this:
104 | ```text
105 | ID,Text,Polarity
106 | …
107 | 21,"""@DrabJay: excellent suggestion! Code changed. :-)""",positive
108 | 22,"""@IgnacioOcampo, I gave up after a while I am afraid :(""",negative
109 | …
110 | ```
111 | For example, if you wanted to detect the polarity of the documents in the input file Sample.csv, you would have to run:
112 |
113 | ```bash
114 | sh classification.sh -i Sample.csv -d sc
115 | ```
116 |
--------------------------------------------------------------------------------
/Sample.csv:
--------------------------------------------------------------------------------
1 | ID;Text
2 | 1;I swear - I don't put pseudo code I get told off for having bad variable names and things that don't match... I put pseudocode and I still get grief!
3 | 2;Reinnstalled Xcode4 - same thing. Awful!
4 | 3;Yeah, it's definitely annoying!
5 | 4;I really hate people who downvote for no reason. Just tell me what your problem is in a comment after you downvote. God!
6 | 5;That's depressing :/ Are you sure I can't get a collection of all controls on the page with a particular class and give them a single data source?
7 | 6;yeah it working fine :) Thanks !!
8 | 7;Excellent tutorial!
9 | 8;This is amazing. Thanks so much for explaining. Excellent explanation!
10 | 9;Love this solution!
11 | 10;Sweet :) Happy hacking!
12 | 11;I want them to resize based on the length of the data they're showing.
13 | 12;Do you have jQuery loaded correctly?
14 | 13;For Python 3 the following will work.
15 | 14;If you're really worried about this, Java is not the language for you
16 | 15;I would continue running in the background and set an (called by the os when you app is REALLY killed) and there use
17 |
--------------------------------------------------------------------------------
/Senti4SD_info:
--------------------------------------------------------------------------------
1 | Solver name: L1-regularized logistic regression
2 | Solver value: 6
3 | C value: 0.5
4 | Accuracy score: 0.8748114630467572
5 | Performance on test set:
6 | precision recall f1-score support
7 |
8 | negative 0.82 0.88 0.85 360
9 | neutral 0.87 0.82 0.85 508
10 | positive 0.92 0.93 0.93 458
11 |
12 | micro avg 0.87 0.87 0.87 1326
13 | macro avg 0.87 0.88 0.87 1326
14 | weighted avg 0.88 0.87 0.87 1326
15 |
--------------------------------------------------------------------------------
/classification.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname "$0")
4 |
5 | inputFile=""
6 | csvDelimiter='c'
7 | features='A'
8 | grams=false
9 | documents=false
10 | model="$SCRIPTDIR/Senti4SD.model"
11 | chunkSize=200
12 | jobsNumber=1
13 | outputFile="$SCRIPTDIR/predictions.csv"
14 |
15 | help(){
16 | echo "Usage: sh classification.sh -i input.csv [-d delimiter] [-F features] [-g] [-t] [-m model] [-c chunk_size] [-j jobs_number] [-o predictions.csv]"
17 | echo "-i input file to classify [required]"
18 | echo '-d delimiter used in csv file, "c" for comma or "sc" for semicolon'
19 | echo '-F -- all features to be considered. A stands for all, L stands for lexicon features, S stands for semantic features and K stands for keyword features. [Default value: A]'
20 | echo "-g -- enables use of custom UnigramsList and BigramsList [optional]"
21 | echo "-t -- enables documents saving along with the prediction labels inside 'predictions.csv' file. [optional]"
22 | echo "-m prediction model [default = Senti4SD]"
23 | echo "-c chunk size [default = 200]"
24 | echo "-j number of jobs for parallelism; pass '-1' to use all available cores [default = 1]"
25 | echo "-o output file with predicted label [default = predictions.csv]"
26 | exit 1
27 | }
28 |
29 | NUMARGS=$#
30 | if [ $NUMARGS -eq 0 ]; then
31 | help
32 | exit 1
33 | fi
34 |
35 | while getopts "hi:d:F:m:c:j:o:tg" OPTIONS; do
36 | case $OPTIONS in
37 | h)
38 | help
39 | ;;
40 | i)
41 | inputFile=$OPTARG
42 | ;;
43 | d)
44 | csvDelimiter="$OPTARG"
45 | ;;
46 | t)
47 | documents=true
48 | ;;
49 | g)
50 | grams=true
51 | ;;
52 | F)
53 | features=$OPTARG
54 | ;;
55 | m)
56 | model="$SCRIPTDIR/$OPTARG"
57 | ;;
58 | c)
59 | chunkSize=$OPTARG
60 | ;;
61 | j)
62 | jobsNumber=$OPTARG
63 | ;;
64 | o)
65 | outputFile="$SCRIPTDIR/$OPTARG"
66 | ;;
67 | \?)
68 | echo -e \\n"Option $OPTARG not allowed."
69 | help
70 | ;;
71 | esac
72 | done
73 |
74 | if [ -z $inputFile ]; then
75 | echo "input csv file is required!"
76 | exit 1
77 | fi
78 | if [ ! -f $inputFile ]; then
79 | echo "File $inputFile not found!"
80 | exit 1
81 | fi
82 |
83 | mkdir -p $SCRIPTDIR/temp_features;
84 |
85 | python $SCRIPTDIR/python/csv_processing.py -i $inputFile -d $csvDelimiter -c text
86 |
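# Derive the name of the intermediate csv consumed by the jar (e.g. input.csv -> input_jar.csv),
# which the csv_processing.py step above is expected to have produced.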
87 | IFS='.' read -ra FILENAMESPLIT <<< "$inputFile"
88 | jarInputFile="${FILENAMESPLIT[0]}_jar.csv"
89 |
90 | if [ "$grams" = true ] ; then
91 | unigramsFile="$SCRIPTDIR/UnigramsList"
92 | bigramsFile="$SCRIPTDIR/BigramsList"
93 | echo $unigramsFile
94 | echo $bigramsFile
95 | if [ ! -f $unigramsFile ]; then
96 | echo "File $unigramsFile not found!"
97 | exit 1
98 | fi
99 | if [ ! -f $bigramsFile ]; then
100 | echo "File $bigramsFile not found!"
101 | exit 1
102 | fi
103 |
104 | #-F A: all features to be considered
105 | #-i file_name: a file containing one document per line
106 | #-W cbow600.bin: DSM to be loaded
107 | #-oc file_name.csv: output dataset containing the extracted features
108 | #-vd numeric: vector size (for cbow600.bin the size is 600)
109 | #-L: if present, the corpus has a label column [optional]
110 | #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list is used [optional]
111 | #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list is used [optional]
112 |
113 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600 -ul $unigramsFile -bl $bigramsFile
114 |
115 | if [ "$documents" = true ] ; then
116 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -t -m $model -c $chunkSize -j $jobsNumber -o $outputFile
117 | else
118 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -m $model -c $chunkSize -j $jobsNumber -o $outputFile
119 | fi
120 |
121 | rm -rf $SCRIPTDIR/temp_features
122 | rm $jarInputFile
123 | else
124 | #-F A: all features to be considered
125 | #-i file_name: a file containing one document per line
126 | #-W cbow600.bin: DSM to be loaded
127 | #-oc file_name.csv: output dataset containing the extracted features
128 | #-vd numeric: vector size (for cbow600.bin the size is 600)
129 | #-L: if present, the corpus has a label column [optional]
130 | #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list is used [optional]
131 | #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list is used [optional]
132 |
133 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600
134 |
135 | if [ "$documents" = true ] ; then
136 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -t -m $model -c $chunkSize -j $jobsNumber -o $outputFile
137 | else
138 | python $SCRIPTDIR/python/classification_task.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -i $inputFile -d $csvDelimiter -m $model -c $chunkSize -j $jobsNumber -o $outputFile
139 | fi
140 |
141 | rm -rf $SCRIPTDIR/temp_features
142 | rm $jarInputFile
143 | fi
144 |
--------------------------------------------------------------------------------
/java/.gitignore:
--------------------------------------------------------------------------------
1 | /NgramsExtraction.jar
2 | /Senti4SD-fast.jar
3 | /Senti4SD.jar
4 | /dsm.bin
5 |
--------------------------------------------------------------------------------
/java/NgramsExtraction.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: d66bbaaa07739cfbd4cb8b94565d43f8
3 | size: 13568432
4 | path: NgramsExtraction.jar
5 |
--------------------------------------------------------------------------------
/java/Senti4SD-fast.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 0ab05dae382556f265736d7a80d7b5e8
3 | size: 61243352
4 | path: Senti4SD-fast.jar
5 |
--------------------------------------------------------------------------------
/java/Senti4SD.jar.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 37c5228913ac9d2a76e6313023cd8160
3 | size: 51300173
4 | path: Senti4SD.jar
5 |
--------------------------------------------------------------------------------
/java/dsm.bin.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 395cb470cacd6e584508d108945b67d3
3 | size: 835369209
4 | path: dsm.bin
5 |
--------------------------------------------------------------------------------
/liblinear_solvers:
--------------------------------------------------------------------------------
1 | L2-regularized logistic regression (primal)
2 | L2-regularized L2-loss support vector classification (dual)
3 | L2-regularized L2-loss support vector classification (primal)
4 | L2-regularized L1-loss support vector classification (dual)
5 | support vector classification by Crammer and Singer
6 | L1-regularized L2-loss support vector classification
7 | L1-regularized logistic regression
8 | L2-regularized logistic regression (dual)
9 |
--------------------------------------------------------------------------------
/python/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
--------------------------------------------------------------------------------
/python/classification_task.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core'))
4 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/liblinear_multicore'))
6 |
7 | import argparse
8 | import logging
9 | from pathlib import Path
10 |
11 | from core.classification import Classification
12 | from core.utils.csv_utils import CsvUtils
13 | from core.utils.core_utils import CoreUtils
14 |
15 |
16 | logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', level = logging.INFO)
17 |
18 | def main():
19 | parser = argparse.ArgumentParser(description = "Classification task")
20 | parser.add_argument('-i',
21 | '--input',
22 | help = "path to csv file.",
23 | type = str,
24 | action = 'append',
25 | required = True)
26 | parser.add_argument('-d',
27 | '--delimiter',
28 | help = 'csv delimiter, use c for comma and sc for semicolon',
29 | type = str,
30 | default = 'c')
31 | parser.add_argument('-t',
32 | '--text',
33 | help = 'enables documents saving along with the prediction labels inside "predictions.csv" file.',
34 | action = "store_true")
35 | parser.add_argument('-m',
36 | '--model',
37 | help = 'prediction model (default = Senti4SD.model)',
38 | type = str,
39 | default = "Senti4SD.model")
40 | parser.add_argument('-c',
41 | '--chunk-size',
42 | help = 'chunk size (default = 1000)',
43 | type = int,
44 | default = 1000)
45 | parser.add_argument('-j',
46 | '--jobs-number',
47 | help = 'number of jobs for parallelism (default = 1)',
48 | type = int,
49 | default = 1)
50 | parser.add_argument('-o',
51 | '--output',
52 | help = 'prediction file name',
53 | type = str,
54 | default = 'predictions.csv')
55 | args = parser.parse_args()
56 |
57 | #TODO Add again second input line
58 | if len(args.input) == 2:
59 | jar_csv = args.input[0]
60 | input_csv = args.input[1]
61 | jar_csv = Path(jar_csv).resolve()
62 | input_csv = Path(input_csv).resolve()
63 | elif len(args.input) > 2:
64 | logging.error("Too many input files. Expected exactly two: [jar generated csv] [input csv]")
65 | sys.exit(1)
66 | elif len(args.input) < 2:
67 | logging.error("Two input files are required. [jar generated csv] [input csv]")
68 | sys.exit(1)
69 |
70 | try:
71 | CsvUtils.check_csv(jar_csv)
72 | CsvUtils.check_csv(input_csv)
73 | except OSError as e:
74 | logging.error(e)
75 | sys.exit(1)
76 |
77 | if not Path(args.model).exists():
78 | print("Model doesn't exist. Provide a correct path to the model, or train a new one using the train script.")
79 | sys.exit(1)
80 |
81 | output_path = Path(f"{Path.cwd()}/predictions")
82 | output_path.mkdir(parents = True, exist_ok = True )
83 | output_path = f"{output_path.resolve()}/{args.output}"
84 | classification = Classification(args.model)
85 | logging.info("Starting classification task")
86 | classification.predict(jar_csv, args.chunk_size, CoreUtils.check_jobs_number(args.jobs_number), output_path)
87 | logging.info("Ending classification task")
88 | logging.info("Starting ordering prediction csv")
89 | CsvUtils.order_csv(output_path, 'ID')
90 | logging.info("Ending ordering prediction csv")
91 | logging.info("Starting rewriting prediction csv")
92 | if args.delimiter.lower() == 'c':
93 | classification.write_id_and_text(input_csv, ',', output_path, args.text)
94 | elif args.delimiter.lower() == 'sc':
95 | classification.write_id_and_text(input_csv, ';', output_path, args.text)
96 | logging.info("Ending rewriting prediction csv")
97 |
98 |
99 |
100 | if __name__ == '__main__':
101 | main()
102 |
--------------------------------------------------------------------------------
/python/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/__init__.py
--------------------------------------------------------------------------------
/python/core/classification.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import glob
4 | from multiprocessing import Pool
5 | from collections import OrderedDict
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from joblib import Parallel, delayed
10 | from sklearn.preprocessing import LabelEncoder
11 |
12 | from utils.csv_utils import CsvUtils
13 | from utils.csv_formatter import CsvFormatter
14 |
15 | from liblinearutil import *
16 |
17 | class Classification():
18 |
19 | def __init__(self, model):
20 | self.model = model
21 |
22 | def __create_classification_file(self, pred_csv):
23 | with open(pred_csv, 'w+') as prediction:
24 | prediction.write("ID,PREDICTED\n")
25 | prediction.close()
26 |
27 | def __clean_id(self, id):
28 | temp = id.split(',')[0]
29 | temp = temp.replace('t', "")
30 | return int(temp)
31 |
32 | def __convert_lines_and_predict(self, rows, label_encoder, pred_file):
33 | model = load_model(self.model)
34 | X = np.array([])
35 | splitted_rows_id = []
36 | first = True
37 | for i in range(0, len(rows)):
38 | values = rows[i].split(',')
39 | splitted_rows_id.append(values[0])
40 | splitted_row_features = [float(value) for value in values[1:]]
41 | if first:
42 | X = np.array(splitted_row_features)
43 | first = False
44 | else:
45 | X = np.append(X, np.array(splitted_row_features))
46 | X = X.reshape((i+1, len(splitted_row_features)))
47 | y_pred, y_acc, y_val = predict([], X, model, '-q')
48 | y_pred = [int(label) for label in y_pred]
49 | y_pred = label_encoder.inverse_transform(y_pred)
50 | y_pred = [pred.replace('\n', "") for pred in y_pred]
51 | dataframe = OrderedDict()
52 | dataframe.update({'id': [(self.__clean_id(row_id) + 1) for row_id in splitted_rows_id]})
53 | dataframe.update({'predicted' : y_pred})
54 | CsvUtils.write_to_csv(dataframe, pred_file, ',', False, 'a+')
55 |
56 | def predict(self, csv_file, chunk_size, jobs_number, pred_file):
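# Read the feature csv in chunks: the requested chunk size is split across the workers,
# each parallel job turns its rows into a feature matrix, runs liblinear prediction on it,
# and appends the resulting (id, predicted label) pairs to pred_file.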
57 | self.__create_classification_file(pred_file)
58 | chunk_size = int(chunk_size / jobs_number)
59 | stop = False
60 | label_encoder = LabelEncoder()
61 | label_encoder.fit(['positive', 'negative', 'neutral'])
62 | with open(csv_file, 'r+') as csv:
63 | next(csv)
64 | while not stop:
65 | read_rows = []
66 | try:
67 | for _ in range(jobs_number):
68 | temp_rows = []
69 | for _ in range (chunk_size):
70 | temp_rows.append(next(csv))
71 | read_rows.append(temp_rows)
72 | except StopIteration:
73 | stop = True
74 | read_rows.append(temp_rows)
75 | finally:
76 | Parallel(n_jobs = jobs_number)(delayed(self.__convert_lines_and_predict)(rows, label_encoder, pred_file) for rows in read_rows)
77 | csv.close()
78 |
79 | def write_id_and_text(self, input_csv, csv_delimiter, pred_csv, text = False):
80 | dataframe = OrderedDict()
81 | try:
82 | csv_fomatter = CsvFormatter(['ID'], csv_delimiter)
83 | dataframe.update(csv_fomatter.get_rows(input_csv))
84 | except IOError as e:
85 | print(e)
86 | if text:
87 | try:
88 | csv_fomatter = CsvFormatter(['TEXT'], csv_delimiter)
89 | dataframe.update(csv_fomatter.get_rows(input_csv))
90 | except IOError as e:
91 | print(e)
92 | if dataframe:
93 | temp = pd.read_csv(pred_csv, delimiter = ",")
94 | dataframe.update({'PREDICTED': temp.iloc[:, -1:].values.ravel()})
95 | CsvUtils.write_to_csv(dataframe, pred_csv, ',', True)
96 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/COPYRIGHT:
--------------------------------------------------------------------------------
1 |
2 | Copyright (c) 2007-2019 The LIBLINEAR Project.
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions
7 | are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright
10 | notice, this list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 |
16 | 3. Neither name of copyright holders nor the names of its contributors
17 | may be used to endorse or promote products derived from this software
18 | without specific prior written permission.
19 |
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/__init__.py
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/commonutil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import print_function
4 | import sys
5 |
6 | try:
7 | import scipy
8 | from scipy import sparse
9 | except:
10 | scipy = None
11 | sparse = None
12 |
13 |
14 | __all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
15 |
16 | def svm_read_problem(data_file_name, return_scipy=False):
17 | """
18 | svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
19 | svm_read_problem(data_file_name, return_scipy=True) -> [y, x], y: ndarray, x: csr_matrix
20 |
21 | Read LIBSVM-format data from data_file_name and return labels y
22 | and data instances x.
23 | """
24 | prob_y = []
25 | prob_x = []
26 | row_ptr = [0]
27 | col_idx = []
28 | for i, line in enumerate(open(data_file_name)):
29 | line = line.split(None, 1)
30 | # In case an instance with all zero features
31 | if len(line) == 1: line += ['']
32 | label, features = line
33 | prob_y += [float(label)]
34 | if scipy != None and return_scipy:
35 | nz = 0
36 | for e in features.split():
37 | ind, val = e.split(":")
38 | val = float(val)
39 | if val != 0:
40 | col_idx += [int(ind)-1]
41 | prob_x += [val]
42 | nz += 1
43 | row_ptr += [row_ptr[-1]+nz]
44 | else:
45 | xi = {}
46 | for e in features.split():
47 | ind, val = e.split(":")
48 | xi[int(ind)] = float(val)
49 | prob_x += [xi]
50 | if scipy != None and return_scipy:
51 | prob_y = scipy.array(prob_y)
52 | prob_x = scipy.array(prob_x)
53 | col_idx = scipy.array(col_idx)
54 | row_ptr = scipy.array(row_ptr)
55 | prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
56 | return (prob_y, prob_x)
57 |
58 | def evaluations_scipy(ty, pv):
59 | """
60 | evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
61 | ty, pv: ndarray
62 |
63 | Calculate accuracy, mean squared error and squared correlation coefficient
64 | using the true values (ty) and predicted values (pv).
65 | """
66 | if not (scipy != None and isinstance(ty, scipy.ndarray) and isinstance(pv, scipy.ndarray)):
67 | raise TypeError("type of ty and pv must be ndarray")
68 | if len(ty) != len(pv):
69 | raise ValueError("len(ty) must be equal to len(pv)")
70 | ACC = 100.0*(ty == pv).mean()
71 | MSE = ((ty - pv)**2).mean()
72 | l = len(ty)
73 | sumv = pv.sum()
74 | sumy = ty.sum()
75 | sumvy = (pv*ty).sum()
76 | sumvv = (pv*pv).sum()
77 | sumyy = (ty*ty).sum()
78 | with scipy.errstate(all = 'raise'):
79 | try:
80 | SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
81 | except:
82 | SCC = float('nan')
83 | return (float(ACC), float(MSE), float(SCC))
84 |
85 | def evaluations(ty, pv, useScipy = True):
86 | """
87 | evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
88 | ty, pv: list, tuple or ndarray
89 | useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation
90 |
91 | Calculate accuracy, mean squared error and squared correlation coefficient
92 | using the true values (ty) and predicted values (pv).
93 | """
94 | if scipy != None and useScipy:
95 | return evaluations_scipy(scipy.asarray(ty), scipy.asarray(pv))
96 | if len(ty) != len(pv):
97 | raise ValueError("len(ty) must be equal to len(pv)")
98 | total_correct = total_error = 0
99 | sumv = sumy = sumvv = sumyy = sumvy = 0
100 | for v, y in zip(pv, ty):
101 | if y == v:
102 | total_correct += 1
103 | total_error += (v-y)*(v-y)
104 | sumv += v
105 | sumy += y
106 | sumvv += v*v
107 | sumyy += y*y
108 | sumvy += v*y
109 | l = len(ty)
110 | ACC = 100.0*total_correct/l
111 | MSE = total_error/l
112 | try:
113 | SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
114 | except:
115 | SCC = float('nan')
116 | return (float(ACC), float(MSE), float(SCC))
117 |
118 | def csr_find_scale_param(x, lower=-1, upper=1):
119 | assert isinstance(x, sparse.csr_matrix)
120 | assert lower < upper
121 | l, n = x.shape
122 | feat_min = x.min(axis=0).toarray().flatten()
123 | feat_max = x.max(axis=0).toarray().flatten()
124 | coef = (feat_max - feat_min) / (upper - lower)
125 | coef[coef != 0] = 1.0 / coef[coef != 0]
126 |
127 | # (x - ones(l,1) * feat_min') * diag(coef) + lower
128 | # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
129 | # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
130 | # = x * diag(coef) + ones(l, 1) * offset'
131 | offset = -feat_min * coef + lower
132 | offset[coef == 0] = 0
133 |
134 | if sum(offset != 0) * l > 3 * x.getnnz():
135 | print(
136 | "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
137 | "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
138 | file=sys.stderr)
139 |
140 | return {'coef':coef, 'offset':offset}
141 |
142 | def csr_scale(x, scale_param):
143 | assert isinstance(x, sparse.csr_matrix)
144 |
145 | offset = scale_param['offset']
146 | coef = scale_param['coef']
147 | assert len(coef) == len(offset)
148 |
149 | l, n = x.shape
150 |
151 | if not n == len(coef):
152 | print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
153 | coef = resize(coef, n)
154 | offset = resize(offset, n)
155 |
156 | # scaled_x = x * diag(coef) + ones(l, 1) * offset'
157 | offset = sparse.csr_matrix(offset.reshape(1, n))
158 | offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
159 | scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset
160 |
161 | if scaled_x.getnnz() > x.getnnz():
162 | print(
163 | "WARNING: original #nonzeros %d\n" % x.getnnz() +
164 | " > new #nonzeros %d\n" % scaled_x.getnnz() +
165 | "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
166 | file=sys.stderr)
167 |
168 | return scaled_x
169 |
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/liblinear.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from ctypes import *
4 | from ctypes.util import find_library
5 | from os import path
6 | import sys
7 |
8 | try:
9 | import scipy
10 | from scipy import sparse
11 | except:
12 | scipy = None
13 | sparse = None
14 |
15 | if sys.version_info[0] < 3:
16 | range = xrange
17 | from itertools import izip as zip
18 |
19 | __all__ = ['liblinear', 'feature_node', 'gen_feature_nodearray', 'problem',
20 | 'parameter', 'model', 'toPyModel', 'L2R_LR', 'L2R_L2LOSS_SVC_DUAL',
21 | 'L2R_L2LOSS_SVC', 'L2R_L1LOSS_SVC_DUAL', 'MCSVM_CS',
22 | 'L1R_L2LOSS_SVC', 'L1R_LR', 'L2R_LR_DUAL', 'L2R_L2LOSS_SVR',
23 | 'L2R_L2LOSS_SVR_DUAL', 'L2R_L1LOSS_SVR_DUAL', 'print_null']
24 |
25 | try:
26 | dirname = path.dirname(path.abspath(__file__))
27 | if sys.platform == 'win32':
28 | liblinear = CDLL(path.join(dirname, r'windows\liblinear.dll'))
29 | else:
30 | liblinear = CDLL(path.join(dirname, 'so/liblinear.so.3'))
31 | except:
32 | # For unix the prefix 'lib' is not considered.
33 | if find_library('linear'):
34 | liblinear = CDLL(find_library('linear'))
35 | elif find_library('liblinear'):
36 | liblinear = CDLL(find_library('liblinear'))
37 | else:
38 | raise Exception('LIBLINEAR library not found.')
39 |
40 | L2R_LR = 0
41 | L2R_L2LOSS_SVC_DUAL = 1
42 | L2R_L2LOSS_SVC = 2
43 | L2R_L1LOSS_SVC_DUAL = 3
44 | MCSVM_CS = 4
45 | L1R_L2LOSS_SVC = 5
46 | L1R_LR = 6
47 | L2R_LR_DUAL = 7
48 | L2R_L2LOSS_SVR = 11
49 | L2R_L2LOSS_SVR_DUAL = 12
50 | L2R_L1LOSS_SVR_DUAL = 13
51 |
52 | PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)
53 | def print_null(s):
54 | return
55 |
56 | def genFields(names, types):
57 | return list(zip(names, types))
58 |
59 | def fillprototype(f, restype, argtypes):
60 | f.restype = restype
61 | f.argtypes = argtypes
62 |
63 | class feature_node(Structure):
64 | _names = ["index", "value"]
65 | _types = [c_int, c_double]
66 | _fields_ = genFields(_names, _types)
67 |
68 | def __str__(self):
69 | return '%d:%g' % (self.index, self.value)
70 |
71 | def gen_feature_nodearray(xi, feature_max=None):
72 | if feature_max:
73 | assert(isinstance(feature_max, int))
74 |
75 | xi_shift = 0 # ensure correct indices of xi
76 | if scipy and isinstance(xi, tuple) and len(xi) == 2\
77 | and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
78 | index_range = xi[0] + 1 # index starts from 1
79 | if feature_max:
80 | index_range = index_range[scipy.where(index_range <= feature_max)]
81 | elif scipy and isinstance(xi, scipy.ndarray):
82 | xi_shift = 1
83 | index_range = xi.nonzero()[0] + 1 # index starts from 1
84 | if feature_max:
85 | index_range = index_range[scipy.where(index_range <= feature_max)]
86 | elif isinstance(xi, (dict, list, tuple)):
87 | if isinstance(xi, dict):
88 | index_range = xi.keys()
89 | elif isinstance(xi, (list, tuple)):
90 | xi_shift = 1
91 | index_range = range(1, len(xi) + 1)
92 | index_range = filter(lambda j: xi[j-xi_shift] != 0, index_range)
93 |
94 | if feature_max:
95 | index_range = filter(lambda j: j <= feature_max, index_range)
96 | index_range = sorted(index_range)
97 | else:
98 | raise TypeError('xi should be a dictionary, list, tuple, 1-d numpy array, or tuple of (index, data)')
99 |
100 | ret = (feature_node*(len(index_range)+2))()
101 | ret[-1].index = -1 # for bias term
102 | ret[-2].index = -1
103 |
104 | if scipy and isinstance(xi, tuple) and len(xi) == 2\
105 | and isinstance(xi[0], scipy.ndarray) and isinstance(xi[1], scipy.ndarray): # for a sparse vector
106 | for idx, j in enumerate(index_range):
107 | ret[idx].index = j
108 | ret[idx].value = (xi[1])[idx]
109 | else:
110 | for idx, j in enumerate(index_range):
111 | ret[idx].index = j
112 | ret[idx].value = xi[j - xi_shift]
113 |
114 | max_idx = 0
115 | if len(index_range) > 0:
116 | max_idx = index_range[-1]
117 | return ret, max_idx
118 |
119 | try:
120 | from numba import jit
121 | jit_enabled = True
122 | except:
123 | jit = lambda x: x
124 | jit_enabled = False
125 |
126 | @jit
127 | def csr_to_problem_jit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
128 | for i in range(l):
129 | b1,e1 = x_rowptr[i], x_rowptr[i+1]
130 | b2,e2 = prob_rowptr[i], prob_rowptr[i+1]-2
131 | for j in range(b1,e1):
132 | prob_ind[j-b1+b2] = x_ind[j]+1
133 | prob_val[j-b1+b2] = x_val[j]
134 | def csr_to_problem_nojit(l, x_val, x_ind, x_rowptr, prob_val, prob_ind, prob_rowptr):
135 | for i in range(l):
136 | x_slice = slice(x_rowptr[i], x_rowptr[i+1])
137 | prob_slice = slice(prob_rowptr[i], prob_rowptr[i+1]-2)
138 | prob_ind[prob_slice] = x_ind[x_slice]+1
139 | prob_val[prob_slice] = x_val[x_slice]
140 |
141 | def csr_to_problem(x, prob):
142 | # Extra space for termination node and (possibly) bias term
143 | x_space = prob.x_space = scipy.empty((x.nnz+x.shape[0]*2), dtype=feature_node)
144 | prob.rowptr = x.indptr.copy()
145 | prob.rowptr[1:] += 2*scipy.arange(1,x.shape[0]+1)
146 | prob_ind = x_space["index"]
147 | prob_val = x_space["value"]
148 | prob_ind[:] = -1
149 | if jit_enabled:
150 | csr_to_problem_jit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
151 | else:
152 | csr_to_problem_nojit(x.shape[0], x.data, x.indices, x.indptr, prob_val, prob_ind, prob.rowptr)
153 |
154 | class problem(Structure):
155 | _names = ["l", "n", "y", "x", "bias"]
156 | _types = [c_int, c_int, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
157 | _fields_ = genFields(_names, _types)
158 |
159 | def __init__(self, y, x, bias = -1):
160 | if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
161 | raise TypeError("type of y: {0} is not supported!".format(type(y)))
162 |
163 | if isinstance(x, (list, tuple)):
164 | if len(y) != len(x):
165 | raise ValueError("len(y) != len(x)")
166 | elif scipy != None and isinstance(x, (scipy.ndarray, sparse.spmatrix)):
167 | if len(y) != x.shape[0]:
168 | raise ValueError("len(y) != len(x)")
169 | if isinstance(x, scipy.ndarray):
170 | x = scipy.ascontiguousarray(x) # enforce row-major
171 | if isinstance(x, sparse.spmatrix):
172 | x = x.tocsr()
173 | pass
174 | else:
175 | raise TypeError("type of x: {0} is not supported!".format(type(x)))
176 | self.l = l = len(y)
177 | self.bias = -1
178 |
179 | max_idx = 0
180 | x_space = self.x_space = []
181 | if scipy != None and isinstance(x, sparse.csr_matrix):
182 | csr_to_problem(x, self)
183 | max_idx = x.shape[1]
184 | else:
185 | for i, xi in enumerate(x):
186 | tmp_xi, tmp_idx = gen_feature_nodearray(xi)
187 | x_space += [tmp_xi]
188 | max_idx = max(max_idx, tmp_idx)
189 | self.n = max_idx
190 |
191 | self.y = (c_double * l)()
192 | if scipy != None and isinstance(y, scipy.ndarray):
193 | scipy.ctypeslib.as_array(self.y, (self.l,))[:] = y
194 | else:
195 | for i, yi in enumerate(y): self.y[i] = yi
196 |
197 | self.x = (POINTER(feature_node) * l)()
198 | if scipy != None and isinstance(x, sparse.csr_matrix):
199 | base = addressof(self.x_space.ctypes.data_as(POINTER(feature_node))[0])
200 | x_ptr = cast(self.x, POINTER(c_uint64))
201 | x_ptr = scipy.ctypeslib.as_array(x_ptr,(self.l,))
202 | x_ptr[:] = self.rowptr[:-1]*sizeof(feature_node)+base
203 | else:
204 | for i, xi in enumerate(self.x_space): self.x[i] = xi
205 |
206 | self.set_bias(bias)
207 |
208 | def set_bias(self, bias):
209 | if self.bias == bias:
210 | return
211 | if bias >= 0 and self.bias < 0:
212 | self.n += 1
213 | node = feature_node(self.n, bias)
214 | if bias < 0 and self.bias >= 0:
215 | self.n -= 1
216 | node = feature_node(-1, bias)
217 |
218 | if isinstance(self.x_space, list):
219 | for xi in self.x_space:
220 | xi[-2] = node
221 | else:
222 | self.x_space["index"][self.rowptr[1:]-2] = node.index
223 | self.x_space["value"][self.rowptr[1:]-2] = node.value
224 |
225 | self.bias = bias
226 |
227 |
228 | class parameter(Structure):
229 | _names = ["solver_type", "eps", "C", "nr_thread", "nr_weight", "weight_label", "weight", "p", "init_sol"]
230 | _types = [c_int, c_double, c_double, c_int, c_int, POINTER(c_int), POINTER(c_double), c_double, POINTER(c_double)]
231 | _fields_ = genFields(_names, _types)
232 |
233 | def __init__(self, options = None):
234 | if options == None:
235 | options = ''
236 | self.parse_options(options)
237 |
238 | def __str__(self):
239 | s = ''
240 | attrs = parameter._names + list(self.__dict__.keys())
241 | values = map(lambda attr: getattr(self, attr), attrs)
242 | for attr, val in zip(attrs, values):
243 | s += (' %s: %s\n' % (attr, val))
244 | s = s.strip()
245 |
246 | return s
247 |
248 | def set_to_default_values(self):
249 | self.solver_type = L2R_L2LOSS_SVC_DUAL
250 | self.eps = float('inf')
251 | self.C = 1
252 | self.p = 0.1
253 | self.nr_thread = 1
254 | self.nr_weight = 0
255 | self.weight_label = None
256 | self.weight = None
257 | self.init_sol = None
258 | self.bias = -1
259 | self.flag_cross_validation = False
260 | self.flag_C_specified = False
261 | self.flag_solver_specified = False
262 | self.flag_find_C = False
263 | self.flag_omp = False
264 | self.nr_fold = 0
265 | self.print_func = cast(None, PRINT_STRING_FUN)
266 |
267 | def parse_options(self, options):
268 | if isinstance(options, list):
269 | argv = options
270 | elif isinstance(options, str):
271 | argv = options.split()
272 | else:
273 | raise TypeError("arg 1 should be a list or a str.")
274 | self.set_to_default_values()
275 | self.print_func = cast(None, PRINT_STRING_FUN)
276 | weight_label = []
277 | weight = []
278 |
279 | i = 0
280 | while i < len(argv) :
281 | if argv[i] == "-s":
282 | i = i + 1
283 | self.solver_type = int(argv[i])
284 | self.flag_solver_specified = True
285 | elif argv[i] == "-c":
286 | i = i + 1
287 | self.C = float(argv[i])
288 | self.flag_C_specified = True
289 | elif argv[i] == "-p":
290 | i = i + 1
291 | self.p = float(argv[i])
292 | elif argv[i] == "-e":
293 | i = i + 1
294 | self.eps = float(argv[i])
295 | elif argv[i] == "-B":
296 | i = i + 1
297 | self.bias = float(argv[i])
298 | elif argv[i] == "-v":
299 | i = i + 1
300 | self.flag_cross_validation = 1
301 | self.nr_fold = int(argv[i])
302 | if self.nr_fold < 2 :
303 | raise ValueError("n-fold cross validation: n must >= 2")
304 | elif argv[i] == "-n":
305 | i = i + 1
306 | self.flag_omp = True
307 | self.nr_thread = int(argv[i])
308 | elif argv[i].startswith("-w"):
309 | i = i + 1
310 | self.nr_weight += 1
311 | weight_label += [int(argv[i-1][2:])]
312 | weight += [float(argv[i])]
313 | elif argv[i] == "-q":
314 | self.print_func = PRINT_STRING_FUN(print_null)
315 | elif argv[i] == "-C":
316 | self.flag_find_C = True
317 |
318 | else:
319 | raise ValueError("Wrong options")
320 | i += 1
321 |
322 | liblinear.set_print_string_function(self.print_func)
323 | self.weight_label = (c_int*self.nr_weight)()
324 | self.weight = (c_double*self.nr_weight)()
325 | for i in range(self.nr_weight):
326 | self.weight[i] = weight[i]
327 | self.weight_label[i] = weight_label[i]
328 |
329 | # default solver for parameter selection is L2R_L2LOSS_SVC
330 | if self.flag_find_C:
331 | if not self.flag_cross_validation:
332 | self.nr_fold = 5
333 | if not self.flag_solver_specified:
334 | self.solver_type = L2R_L2LOSS_SVC
335 | self.flag_solver_specified = True
336 | elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC]:
337 | raise ValueError("Warm-start parameter search only available for -s 0 and -s 2")
338 | if self.flag_omp:
339 | if not self.flag_solver_specified:
340 | self.solver_type = L2R_L2LOSS_SVC
341 | self.flag_solver_specified = True
342 | elif self.solver_type not in [L2R_LR, L2R_L2LOSS_SVC, L2R_L2LOSS_SVR, L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, L1R_LR, L1R_L2LOSS_SVC]:
343 | raise ValueError("Parallel LIBLINEAR is only available for -s 0, 1, 2, 3, 5, 6, 11 now")
344 |
345 | if self.eps == float('inf'):
346 | if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
347 | self.eps = 0.01
348 | elif self.solver_type in [L2R_L2LOSS_SVR]:
349 | self.eps = 0.001
350 | elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]:
351 | self.eps = 0.1
352 | elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]:
353 | self.eps = 0.01
354 | elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
355 | self.eps = 0.1
356 |
357 | class model(Structure):
358 | _names = ["param", "nr_class", "nr_feature", "w", "label", "bias"]
359 | _types = [parameter, c_int, c_int, POINTER(c_double), POINTER(c_int), c_double]
360 | _fields_ = genFields(_names, _types)
361 |
362 | def __init__(self):
363 | self.__createfrom__ = 'python'
364 |
365 | def __del__(self):
366 | # free memory created by C to avoid memory leak
367 | if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C':
368 | liblinear.free_and_destroy_model(pointer(self))
369 |
370 | def get_nr_feature(self):
371 | return liblinear.get_nr_feature(self)
372 |
373 | def get_nr_class(self):
374 | return liblinear.get_nr_class(self)
375 |
376 | def get_labels(self):
377 | nr_class = self.get_nr_class()
378 | labels = (c_int * nr_class)()
379 | liblinear.get_labels(self, labels)
380 | return labels[:nr_class]
381 |
382 | def get_decfun_coef(self, feat_idx, label_idx=0):
383 | return liblinear.get_decfun_coef(self, feat_idx, label_idx)
384 |
385 | def get_decfun_bias(self, label_idx=0):
386 | return liblinear.get_decfun_bias(self, label_idx)
387 |
388 | def get_decfun(self, label_idx=0):
389 | w = [liblinear.get_decfun_coef(self, feat_idx, label_idx) for feat_idx in range(1, self.nr_feature+1)]
390 | b = liblinear.get_decfun_bias(self, label_idx)
391 | return (w, b)
392 |
393 | def is_probability_model(self):
394 | return (liblinear.check_probability_model(self) == 1)
395 |
396 | def is_regression_model(self):
397 | return (liblinear.check_regression_model(self) == 1)
398 |
399 | def toPyModel(model_ptr):
400 | """
401 | toPyModel(model_ptr) -> model
402 |
403 | Convert a ctypes POINTER(model) to a Python model
404 | """
405 | if bool(model_ptr) == False:
406 | raise ValueError("Null pointer")
407 | m = model_ptr.contents
408 | m.__createfrom__ = 'C'
409 | return m
410 |
411 | fillprototype(liblinear.train, POINTER(model), [POINTER(problem), POINTER(parameter)])
412 | fillprototype(liblinear.find_parameter_C, None, [POINTER(problem), POINTER(parameter), c_int, c_double, c_double, POINTER(c_double), POINTER(c_double)])
413 | fillprototype(liblinear.cross_validation, None, [POINTER(problem), POINTER(parameter), c_int, POINTER(c_double)])
414 |
415 | fillprototype(liblinear.predict_values, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])
416 | fillprototype(liblinear.predict, c_double, [POINTER(model), POINTER(feature_node)])
417 | fillprototype(liblinear.predict_probability, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])
418 |
419 | fillprototype(liblinear.save_model, c_int, [c_char_p, POINTER(model)])
420 | fillprototype(liblinear.load_model, POINTER(model), [c_char_p])
421 |
422 | fillprototype(liblinear.get_nr_feature, c_int, [POINTER(model)])
423 | fillprototype(liblinear.get_nr_class, c_int, [POINTER(model)])
424 | fillprototype(liblinear.get_labels, None, [POINTER(model), POINTER(c_int)])
425 | fillprototype(liblinear.get_decfun_coef, c_double, [POINTER(model), c_int, c_int])
426 | fillprototype(liblinear.get_decfun_bias, c_double, [POINTER(model), c_int])
427 |
428 | fillprototype(liblinear.free_model_content, None, [POINTER(model)])
429 | fillprototype(liblinear.free_and_destroy_model, None, [POINTER(POINTER(model))])
430 | fillprototype(liblinear.destroy_param, None, [POINTER(parameter)])
431 | fillprototype(liblinear.check_parameter, c_char_p, [POINTER(problem), POINTER(parameter)])
432 | fillprototype(liblinear.check_probability_model, c_int, [POINTER(model)])
433 | fillprototype(liblinear.check_regression_model, c_int, [POINTER(model)])
434 | fillprototype(liblinear.set_print_string_function, None, [CFUNCTYPE(None, c_char_p)])
435 |
--------------------------------------------------------------------------------
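Note: a minimal sketch of how the option parser above turns a flag string into a populated `parameter` struct. It assumes `python/core` is on `sys.path` so that the `liblinear_multicore` package and its bundled shared library can be loaded; the values in the comments follow from the defaults and the eps table at the end of `parse_options`.

from liblinear_multicore.liblinear import parameter, L2R_L2LOSS_SVC

# "-s 2": primal L2-loss SVC, "-c 0.5": C=0.5, "-B 1": bias term, "-q": quiet training output.
param = parameter("-s 2 -c 0.5 -B 1 -q")
print(param.solver_type == L2R_L2LOSS_SVC)   # True
print(param.C, param.bias)                   # 0.5 1.0
print(param.eps)                             # 0.01, filled in because no -e was given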
/python/core/liblinear_multicore/so/liblinear.so.3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/so/liblinear.so.3
--------------------------------------------------------------------------------
/python/core/liblinear_multicore/windows/liblinear.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/liblinear_multicore/windows/liblinear.dll
--------------------------------------------------------------------------------
/python/core/liblinearutil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os, sys
4 | #sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/liblinear_multicore")
5 | #print(sys.path)
6 | from liblinear_multicore.liblinear import *
7 | from liblinear_multicore.liblinear import __all__ as liblinear_all
8 | from liblinear_multicore.liblinear import scipy, sparse
9 | from liblinear_multicore.commonutil import *
10 | from liblinear_multicore.commonutil import __all__ as common_all
11 | from ctypes import c_double
12 |
13 | if sys.version_info[0] < 3:
14 | range = xrange
15 | from itertools import izip as zip
16 |
17 | __all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all
18 |
19 |
20 | def load_model(model_file_name):
21 | """
22 | load_model(model_file_name) -> model
23 |
24 | Load a LIBLINEAR model from model_file_name and return.
25 | """
26 | model = liblinear.load_model(model_file_name.encode())
27 | if not model:
28 | print("can't open model file %s" % model_file_name)
29 | return None
30 | model = toPyModel(model)
31 | return model
32 |
33 | def save_model(model_file_name, model):
34 | """
35 | save_model(model_file_name, model) -> None
36 |
37 | Save a LIBLINEAR model to the file model_file_name.
38 | """
39 | liblinear.save_model(model_file_name.encode(), model)
40 |
41 | def train(arg1, arg2=None, arg3=None):
42 | """
43 | train(y, x [, options]) -> model | ACC
44 |
45 | y: a list/tuple/ndarray of l true labels (type must be int/double).
46 |
47 | x: 1. a list/tuple of l training instances. Feature vector of
48 | each training instance is a list/tuple or dictionary.
49 |
50 | 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
51 |
52 | train(prob [, options]) -> model | ACC
53 | train(prob, param) -> model | ACC
54 |
55 | Train a model from data (y, x) or a problem prob using
56 | 'options' or a parameter param.
57 |
58 | If '-v' is specified in 'options' (i.e., cross validation)
59 | either accuracy (ACC) or mean-squared error (MSE) is returned.
60 |
61 | options:
62 | -s type : set type of solver (default 1)
63 | for multi-class classification
64 | 0 -- L2-regularized logistic regression (primal)
65 | 1 -- L2-regularized L2-loss support vector classification (dual)
66 | 2 -- L2-regularized L2-loss support vector classification (primal)
67 | 3 -- L2-regularized L1-loss support vector classification (dual)
68 | 4 -- support vector classification by Crammer and Singer
69 | 5 -- L1-regularized L2-loss support vector classification
70 | 6 -- L1-regularized logistic regression
71 | 7 -- L2-regularized logistic regression (dual)
72 | for regression
73 | 11 -- L2-regularized L2-loss support vector regression (primal)
74 | 12 -- L2-regularized L2-loss support vector regression (dual)
75 | 13 -- L2-regularized L1-loss support vector regression (dual)
76 | -c cost : set the parameter C (default 1)
77 | -p epsilon : set the epsilon in loss function of SVR (default 0.1)
78 | -e epsilon : set tolerance of termination criterion
79 | -s 0 and 2
80 | |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
81 | where f is the primal function, (default 0.01)
82 | -s 11
83 | |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)
84 | -s 1, 3, 4, and 7
85 | 			Dual maximal violation <= eps; similar to libsvm (default 0.1)
86 | -s 5 and 6
87 | |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
88 | where f is the primal function (default 0.01)
89 | -s 12 and 13
90 | |f'(alpha)|_1 <= eps |f'(alpha0)|,
91 | where f is the dual function (default 0.1)
92 | -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
93 | -wi weight: weights adjust the parameter C of different classes (see README for details)
94 | -v n: n-fold cross validation mode
95 | 	    -n nr_thread : parallel version with [nr_thread] threads (default 1; only for -s 0, 1, 2, 3, 5, 6, 11)
96 | -q : quiet mode (no outputs)
97 | """
98 | prob, param = None, None
99 | if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
100 | assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
101 | y, x, options = arg1, arg2, arg3
102 | prob = problem(y, x)
103 | param = parameter(options)
104 | elif isinstance(arg1, problem):
105 | prob = arg1
106 | if isinstance(arg2, parameter):
107 | param = arg2
108 | else:
109 | param = parameter(arg2)
110 | if prob == None or param == None :
111 | raise TypeError("Wrong types for the arguments")
112 |
113 | prob.set_bias(param.bias)
114 | liblinear.set_print_string_function(param.print_func)
115 | err_msg = liblinear.check_parameter(prob, param)
116 | if err_msg :
117 | raise ValueError('Error: %s' % err_msg)
118 |
119 | if param.flag_find_C:
120 | nr_fold = param.nr_fold
121 | best_C = c_double()
122 | best_rate = c_double()
123 | max_C = 1024
124 | if param.flag_C_specified:
125 | start_C = param.C
126 | else:
127 | start_C = -1.0
128 | liblinear.find_parameter_C(prob, param, nr_fold, start_C, max_C, best_C, best_rate)
129 | print("Best C = %lf CV accuracy = %g%%\n"% (best_C.value, 100.0*best_rate.value))
130 | return best_C.value,best_rate.value
131 |
132 |
133 | elif param.flag_cross_validation:
134 | l, nr_fold = prob.l, param.nr_fold
135 | target = (c_double * l)()
136 | liblinear.cross_validation(prob, param, nr_fold, target)
137 | ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
138 | if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
139 | print("Cross Validation Mean squared error = %g" % MSE)
140 | print("Cross Validation Squared correlation coefficient = %g" % SCC)
141 | return MSE
142 | else:
143 | print("Cross Validation Accuracy = %g%%" % ACC)
144 | return ACC
145 | else:
146 | m = liblinear.train(prob, param)
147 | m = toPyModel(m)
148 |
149 | return m
150 |
151 | def predict(y, x, m, options=""):
152 | """
153 | predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)
154 |
155 | y: a list/tuple/ndarray of l true labels (type must be int/double).
156 | It is used for calculating the accuracy. Use [] if true labels are
157 | unavailable.
158 |
159 | x: 1. a list/tuple of l training instances. Feature vector of
160 | each training instance is a list/tuple or dictionary.
161 |
162 | 2. an l * n numpy ndarray or scipy spmatrix (n: number of features).
163 |
164 | Predict data (y, x) with the SVM model m.
165 | options:
166 | -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
167 | -q quiet mode (no outputs)
168 |
169 | The return tuple contains
170 | p_labels: a list of predicted labels
171 | p_acc: a tuple including accuracy (for classification), mean-squared
172 | error, and squared correlation coefficient (for regression).
173 | p_vals: a list of decision values or probability estimates (if '-b 1'
174 | is specified). If k is the number of classes, for decision values,
175 | each element includes results of predicting k binary-class
176 | SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
177 | is returned. For probabilities, each element contains k values
178 | indicating the probability that the testing instance is in each class.
179 | Note that the order of classes here is the same as 'model.label'
180 | field in the model structure.
181 | """
182 |
183 | def info(s):
184 | print(s)
185 |
186 | if scipy and isinstance(x, scipy.ndarray):
187 | x = scipy.ascontiguousarray(x) # enforce row-major
188 | elif sparse and isinstance(x, sparse.spmatrix):
189 | x = x.tocsr()
190 | elif not isinstance(x, (list, tuple)):
191 | raise TypeError("type of x: {0} is not supported!".format(type(x)))
192 |
193 | if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
194 | raise TypeError("type of y: {0} is not supported!".format(type(y)))
195 |
196 | predict_probability = 0
197 | argv = options.split()
198 | i = 0
199 | while i < len(argv):
200 | if argv[i] == '-b':
201 | i += 1
202 | predict_probability = int(argv[i])
203 | elif argv[i] == '-q':
204 | info = print_null
205 | else:
206 | raise ValueError("Wrong options")
207 | i+=1
208 |
209 | solver_type = m.param.solver_type
210 | nr_class = m.get_nr_class()
211 | nr_feature = m.get_nr_feature()
212 | is_prob_model = m.is_probability_model()
213 | bias = m.bias
214 | if bias >= 0:
215 | biasterm = feature_node(nr_feature+1, bias)
216 | else:
217 | biasterm = feature_node(-1, bias)
218 | pred_labels = []
219 | pred_values = []
220 |
221 | if scipy and isinstance(x, sparse.spmatrix):
222 | nr_instance = x.shape[0]
223 | else:
224 | nr_instance = len(x)
225 |
226 | if predict_probability:
227 | if not is_prob_model:
228 | raise TypeError('probability output is only supported for logistic regression')
229 | prob_estimates = (c_double * nr_class)()
230 | for i in range(nr_instance):
231 | if scipy and isinstance(x, sparse.spmatrix):
232 | indslice = slice(x.indptr[i], x.indptr[i+1])
233 | xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
234 | else:
235 | xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
236 | xi[-2] = biasterm
237 | label = liblinear.predict_probability(m, xi, prob_estimates)
238 | values = prob_estimates[:nr_class]
239 | pred_labels += [label]
240 | pred_values += [values]
241 | else:
242 | if nr_class <= 2:
243 | nr_classifier = 1
244 | else:
245 | nr_classifier = nr_class
246 | dec_values = (c_double * nr_classifier)()
247 | for i in range(nr_instance):
248 | if scipy and isinstance(x, sparse.spmatrix):
249 | indslice = slice(x.indptr[i], x.indptr[i+1])
250 | xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
251 | else:
252 | xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
253 | xi[-2] = biasterm
254 | label = liblinear.predict_values(m, xi, dec_values)
255 | values = dec_values[:nr_classifier]
256 | pred_labels += [label]
257 | pred_values += [values]
258 |
259 | if len(y) == 0:
260 | y = [0] * nr_instance
261 | ACC, MSE, SCC = evaluations(y, pred_labels)
262 |
263 | if m.is_regression_model():
264 | info("Mean squared error = %g (regression)" % MSE)
265 | info("Squared correlation coefficient = %g (regression)" % SCC)
266 | else:
267 | info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))
268 |
269 | return pred_labels, (ACC, MSE, SCC), pred_values
270 |
--------------------------------------------------------------------------------
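Note: a minimal round-trip sketch of the wrapper above on made-up data; it assumes `python/core` is on `sys.path` (as `train.py` arranges) so that `liblinear_multicore` resolves.

from liblinearutil import train, predict, save_model, load_model

y = [1, -1, 1, -1]                                 # toy labels
x = [{1: 1.0, 2: 0.5}, {1: -1.0, 2: 0.2},          # sparse feature dicts (index -> value)
     {1: 0.8, 2: 0.7}, {1: -0.9, 2: 0.1}]

m = train(y, x, "-s 2 -c 1 -B 1 -q")               # primal L2-loss SVC, quiet
p_labels, p_acc, p_vals = predict(y, x, m)         # accuracy is printed and returned in p_acc[0]

save_model("toy.model", m)                         # plain-text LIBLINEAR model file
m2 = load_model("toy.model")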
/python/core/train_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from pathlib import Path
3 |
4 | from utils.report import Report
5 |
6 | from sklearn.preprocessing import LabelEncoder
7 | from liblinearutil import *
8 |
9 | class Train():
10 |
11 | def __init__(self, jobs_number, solver_name, solver_value, c_value, model_name):
12 | self.jobs_number = jobs_number
13 | self.solver_name = solver_name
14 | self.solver_value = solver_value
15 | self.c_value = c_value
16 | self.model_name = model_name
17 | self.best_perfomance = OrderedDict()
18 | self.best_perfomance['Solver name'] = solver_name
19 | self.best_perfomance['Solver value'] = solver_value
20 | self.best_perfomance['C value'] = c_value
21 | self.report = None
22 |
23 | def save_best_perfomance(self, output_dir):
24 | model_name = Path(self.model_name).stem
25 | with open(f"{output_dir}/{model_name}_info", 'w') as bpf:
26 | for value in self.best_perfomance.keys():
27 | bpf.write(f"{value}: {self.best_perfomance[value]}\n")
28 | bpf.write(f"Accuracy score: {self.report.get_accuracy_score()}\n")
29 |             bpf.write("Performance on test set:\n")
30 | bpf.write(self.report.get_report())
31 | bpf.close()
32 |
33 | def train_model(self, X_train, X_test, y_train, y_test):
34 | le = LabelEncoder()
35 | le.fit(y_train)
36 | y_train = le.transform(y_train)
37 | y_test = le.transform(y_test)
38 |
39 | if self.solver_value == 4 or self.solver_value == 7:
40 | parameters = "-s {} -c {} -B 1 -q".format(self.solver_value, self.c_value)
41 | else:
42 | parameters = "-s {} -n {} -c {} -B 1 -q".format(self.solver_value, self.jobs_number, self.c_value)
43 | param = parameter(parameters)
44 | prob = problem(y_train, X_train)
45 | model = train(prob, param)
46 |
47 | p_label, p_acc, p_val = predict(y_test, X_test, model)
48 |
49 | #Convert predicted value from float to int
50 | y_pred = [int(label) for label in p_label]
51 | y_test = le.inverse_transform(y_test)
52 | y_pred = le.inverse_transform(y_pred)
53 |
54 | self.report = Report(y_test, y_pred)
55 |
56 | save_model(f"{self.model_name}", model)
57 | return y_pred
--------------------------------------------------------------------------------
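Note: a brief sketch of how `Train` is driven, mirroring the call in `train.py`. The random matrices below are stand-ins for the feature vectors produced by the Java extractor (`CsvUtils.from_csv` returns the real ones), and `python/core` with its subpackages is assumed to be on `sys.path`.

import numpy as np
from train_model import Train

X_train = np.random.rand(40, 6)
y_train = np.array(["positive", "negative"] * 20)
X_test = np.random.rand(10, 6)
y_test = np.array(["positive", "negative"] * 5)

trainer = Train(jobs_number=2,
                solver_name="L2-regularized L2-loss support vector classification (primal)",
                solver_value=2, c_value=1, model_name="Senti4SD.model")
y_pred = trainer.train_model(X_train, X_test, y_train, y_test)   # also writes Senti4SD.model
trainer.save_best_perfomance(".")                                # writes ./Senti4SD_info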
/python/core/tuning_parameter.py:
--------------------------------------------------------------------------------
1 | from time import time, gmtime, strftime
2 | from collections import OrderedDict
3 | from pathlib import Path
4 |
5 | from sklearn.preprocessing import LabelEncoder
6 | from sklearn.metrics import accuracy_score
7 | from liblinearutil import *
8 |
9 | class Tuning():
10 |
11 | def __init__(self, jobs_number, solvers_value_file, output_dir):
12 | self.solvers = OrderedDict()
13 | self.solvers["L2-regularized logistic regression (primal)"] = 0
14 | self.solvers["L2-regularized L2-loss support vector classification (dual)"] = 1
15 | self.solvers["L2-regularized L2-loss support vector classification (primal)"] = 2
16 | self.solvers["L2-regularized L1-loss support vector classification (dual)"] = 3
17 | self.solvers["support vector classification by Crammer and Singer"] = 4
18 | self.solvers["L1-regularized L2-loss support vector classification"] = 5
19 | self.solvers["L1-regularized logistic regression"] = 6
20 | self.solvers["L2-regularized logistic regression (dual)"] = 7
21 | self.C_VALUE = [0.01, 0.05, 0.10, 0.20, 0.25, 0.50, 1, 2, 4, 8]
22 | self.output_dir = output_dir
23 | if solvers_value_file is None:
24 | self.__write_solvers_value()
25 | self.jobs_number = jobs_number
26 |         self.__load_solvers_value(solvers_value_file or f"{self.output_dir}/liblinear_solver")  # fall back to the default list written above when no file is given
27 | self.best_perfomance = OrderedDict()
28 |
29 | def __write_solvers_value(self):
30 | with open(f"{self.output_dir}/liblinear_solver", 'w') as sf:
31 | for value in self.solvers.keys():
32 | sf.write(f"{value}\n")
33 | sf.close()
34 |
35 | def __load_solvers_value(self, solvers_value_file):
36 | solvers_value_file = Path(solvers_value_file)
37 | if not solvers_value_file.exists():
38 | with solvers_value_file.open('w', encoding='utf-8') as sf:
39 | for key in self.solvers.keys():
40 | sf.write(f"{key}\n")
41 | sf.close()
42 | with open(solvers_value_file, 'r') as sf:
43 | lines = []
44 | for line in sf:
45 | line = line.rstrip('\n')
46 | print(line)
47 | lines.append(line)
48 | sf.close()
49 |         for key in list(self.solvers.keys()):  # copy keys: deleting from a dict while iterating its view raises RuntimeError
50 | if key not in lines:
51 | del self.solvers[key]
52 |
53 | def __encode_label(self, y_train, y_test):
54 | le = LabelEncoder()
55 | le.fit(y_train)
56 | y_train = le.transform(y_train)
57 | y_test = le.transform(y_test)
58 | return y_train, y_test
59 |
60 | def __create_perfomance_file(self, perfomance_dict):
61 | with open(f"{self.output_dir}/{perfomance_dict['Solver name']}", 'w') as sf:
62 | for value in perfomance_dict.keys():
63 | sf.write(f"{value}: {perfomance_dict[value]}\n")
64 | sf.close()
65 |
66 | def __train_and_predict(self, X_train, X_test, y_train, y_test, solver_value, c_value):
67 | if solver_value == 4 or solver_value == 7:
68 | parameters = "-s {} -c {} -B 1 -q".format(solver_value, c_value)
69 | else:
70 | parameters = "-s {} -n {} -c {} -B 1 -q".format(solver_value, self.jobs_number, c_value)
71 | param = parameter(parameters)
72 |
73 | model = train(self.prob, param)
74 |
75 | p_label, p_acc, p_val = predict(y_test, X_test, model)
76 |
77 | #Convert predicted value from float to int
78 | y_pred = [int(label) for label in p_label]
79 |
80 | accuracy = accuracy_score(y_test, y_pred)
81 |
82 | return accuracy
83 |
84 |
85 | def tuning_parameter(self, X_train, X_test, y_train, y_test):
86 | y_train, y_test = self.__encode_label(y_train, y_test)
87 | self.prob = problem(y_train, X_train)
88 |
89 | self.scores_list = []
90 |
91 | cv_accuracy = 0
92 |
93 | best_solver_name = ""
94 | best_cv_accuracy = 0
95 | best_c_value = 0
96 | best_s_value = 0
97 |
98 | current_cv_accuracy = 0
99 | current_c_value = 0
100 |
101 | for solver_name, solver_value in self.solvers.items():
102 | print(f"Tuning solver {solver_name}")
103 | time_start = time()
104 | for c_value in self.C_VALUE:
105 | print(f"C value: {c_value}")
106 | if solver_value == 4 or solver_value == 7:
107 | parameters = "-s {} -c {} -v 10 -B 1 -q".format(solver_value, c_value)
108 | else:
109 | parameters = "-s {} -n {} -c {} -v 10 -B 1 -q".format(solver_value, self.jobs_number, c_value)
110 | param = parameter(parameters)
111 | cv_accuracy = train(self.prob, param)
112 | if cv_accuracy > best_cv_accuracy:
113 | best_c_value = c_value
114 | best_cv_accuracy = cv_accuracy
115 | best_s_value = solver_value
116 | best_solver_name = solver_name
117 | if cv_accuracy > current_cv_accuracy:
118 | current_cv_accuracy = cv_accuracy
119 | current_c_value = c_value
120 | tuning_time = time() - time_start
121 | tuning_time = strftime("%H:%M:%S", gmtime(tuning_time))
122 |
123 | #Training current model for testing
124 | accuracy = self.__train_and_predict(X_train, X_test, y_train, y_test, solver_value, current_c_value)
125 | perfomance_dict = OrderedDict()
126 | perfomance_dict["Solver name"] = solver_name
127 | perfomance_dict["Best C value"] = current_c_value
128 | perfomance_dict["Tuning time"] = tuning_time
129 | perfomance_dict["Accuracy"] = accuracy
130 | self.__create_perfomance_file(perfomance_dict)
131 | current_cv_accuracy = 0
132 | current_c_value = 0
133 | print("\n")
134 |
135 | #training_time, test_time, accuracy = self.__train_and_predict(X_train, X_test, y_train, y_test, best_s_value, best_c_value)
136 | #self.best_perfomance = OrderedDict()
137 | #self.best_perfomance["Solver name"] = best_solver_name
138 | #self.best_perfomance["C value"] = best_c_value
139 | #self.best_perfomance["Tuning time"] = tuning_time
140 | #self.best_perfomance["Training time"] = training_time
141 | #self.best_perfomance["Test time"] = test_time
142 | #self.best_perfomance["Accuracy"] = accuracy
143 |
144 | return best_solver_name, best_s_value, best_c_value
145 |
--------------------------------------------------------------------------------
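Note: a compact sketch of the tuning entry point. The file and directory names mirror the ones `train.py` uses but are otherwise arbitrary, and the toy arrays stand in for the extracted features; `tuning_parameter` runs 10-fold cross-validation over every solver/C pair, writes a per-solver report into `output_dir`, and returns the best combination.

from pathlib import Path
import numpy as np
from tuning_parameter import Tuning

X_train = np.random.rand(40, 6)
y_train = np.array(["positive", "negative"] * 20)
X_test = np.random.rand(10, 6)
y_test = np.array(["positive", "negative"] * 5)

out_dir = Path("liblinear_perfomance")     # per-solver report files land here
out_dir.mkdir(exist_ok=True)
tuning = Tuning(jobs_number=2, solvers_value_file="liblinear_solvers", output_dir=out_dir)
best_name, best_s, best_c = tuning.tuning_parameter(X_train, X_test, y_train, y_test)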
/python/core/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/collab-uniba/pySenti4SD/5ed11f1f9bf42c113db064278fe7decaf07587c4/python/core/utils/__init__.py
--------------------------------------------------------------------------------
/python/core/utils/core_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | class CoreUtils():
4 |
5 | @staticmethod
6 | def check_jobs_number(jobs_number):
7 | max_jobs = os.cpu_count()
8 | if jobs_number > max_jobs:
9 | jobs_number = max_jobs
10 | elif jobs_number < 0:
11 | if jobs_number == -1:
12 | jobs_number = max_jobs
13 | elif jobs_number > -(max_jobs - 1):
14 | jobs_number = max_jobs + jobs_number
15 | else:
16 | jobs_number = 1
17 | return jobs_number
18 |
--------------------------------------------------------------------------------
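Note: a few illustrative calls clarifying the clamping rules above; the return values in the comments assume a hypothetical machine where `os.cpu_count()` is 8.

from core_utils import CoreUtils

CoreUtils.check_jobs_number(4)     # -> 4  (within range, unchanged)
CoreUtils.check_jobs_number(32)    # -> 8  (capped at the core count)
CoreUtils.check_jobs_number(-1)    # -> 8  (-1 means "use every core")
CoreUtils.check_jobs_number(-2)    # -> 6  (counted back from the core count)
CoreUtils.check_jobs_number(-20)   # -> 1  (too negative falls back to a single job)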
/python/core/utils/csv_formatter.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from collections import OrderedDict
3 |
4 | from csv_utils import CsvUtils
5 |
6 | class CsvFormatter():
7 |
8 | def __init__(self, header_list, csv_delimiter, header = False):
9 | self.header_list = header_list
10 | self.header = header
11 | self.csv_delimiter = csv_delimiter
12 |
13 | def get_rows(self, input_csv):
14 | with open(input_csv, 'r+', newline = '', encoding='utf8') as csv_file:
15 | header_list_copy = self.header_list.copy()
16 | csv_file.seek(0)
17 | csv_file_reader = csv.reader(csv_file, delimiter = self.csv_delimiter)
18 | header = next(csv_file_reader)
19 | rows = OrderedDict()
20 | if len(header) == 0:
21 | csv_file.close()
22 | raise IOError("{} is empty.".format(input_csv))
23 | elif len(self.header_list) <= len(header) :
24 | count = 0
25 | while len(header_list_copy) != 0:
26 | for i in range(0, len(header)):
27 | if header[i].lower().strip() == header_list_copy[0].lower().strip():
28 | rows.update({header_list_copy[0]: [row[i] for row in csv_file_reader]})
29 | count += 1
30 | break
31 | header_list_copy.pop(0)
32 | csv_file.seek(0)
33 | next(csv_file_reader)
34 | if count != len(self.header_list):
35 | csv_file.close()
36 | raise IOError("{} not found in {}".format(header_list_copy, input_csv))
37 | else:
38 | csv_file.close()
39 |                 raise IOError("Too many headers in the list.")
40 | csv_file.close()
41 | return rows
42 |
43 | def write(self, data, output_csv):
44 | CsvUtils.write_to_csv(data, output_csv, self.csv_delimiter)
45 |
--------------------------------------------------------------------------------
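Note: a small usage sketch of `CsvFormatter` (file names hypothetical, `python/core/utils` assumed on `sys.path` as `csv_processing.py` arranges); it extracts the requested columns, matching headers case-insensitively, and writes them back out without a header row.

from csv_formatter import CsvFormatter

# input.csv is expected to contain at least "Text" and "Polarity" columns (any letter case).
formatter = CsvFormatter(["text", "polarity"], ',')
rows = formatter.get_rows("input.csv")     # OrderedDict: {"text": [...], "polarity": [...]}
formatter.write(rows, "input_jar.csv")     # column values only, no header row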
/python/core/utils/csv_utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from multiprocessing import Pool
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | class CsvUtils():
9 |
10 | @staticmethod
11 | def __check_file_existence(file_path):
12 | if not os.path.isfile(file_path):
13 | return False
14 | else:
15 | return True
16 |
17 | @staticmethod
18 | def __check_file_extension(file_path, allowed_extension):
19 | extension = os.path.splitext(file_path)[1]
20 | if extension not in allowed_extension:
21 | return False
22 | else:
23 | return True
24 |
25 | @staticmethod
26 | def check_csv(csv_path):
27 | if not CsvUtils.__check_file_existence(csv_path):
28 | raise OSError ("FILE NOT FOUND : {} wasn't found".format(csv_path))
29 | if not CsvUtils.__check_file_extension(csv_path, ['.csv']):
30 | raise OSError("WRONG FILE EXTENSION : {} wasn't a csv file.".format(csv_path))
31 |
32 | @staticmethod
33 | def convert_lines(rows):
34 | X = np.array([])
35 | y = np.array([])
36 | first = True
37 | for i in range(0, len(rows)):
38 | values = rows[i].split(',')
39 | splitted_row_features = [float(value) for value in values[1:-2]]
40 | splitted_row_label = values[-1].rstrip('\n')
41 | if first:
42 | X = np.array(splitted_row_features)
43 | y = np.array(splitted_row_label)
44 | first = False
45 | else:
46 | X = np.append(X, np.array(splitted_row_features))
47 | y = np.append(y, np.array(splitted_row_label))
48 | return X.reshape((i+1, len(splitted_row_features))), y
49 |
50 | @staticmethod
51 | def from_csv(csv_file, chunk_size, jobs_number):
52 | stop = False
53 | rows = []
54 | chunk_size = int(chunk_size / jobs_number)
55 |         with open(csv_file, 'r+') as csv_in:  # 'csv_in' avoids shadowing the imported csv module
56 |             next(csv_in)
57 | while not stop:
58 | read_rows = []
59 | try:
60 | for _ in range(jobs_number):
61 | temp_rows = []
62 | for _ in range (chunk_size):
63 |                             temp_rows.append(next(csv_in))
64 | read_rows.append(temp_rows)
65 | except StopIteration:
66 | stop = True
67 | read_rows.append(temp_rows)
68 | finally:
69 | if len(temp_rows) != 0:
70 | with Pool(jobs_number) as p:
71 | results = p.map(CsvUtils.convert_lines, read_rows)
72 | for result in results:
73 | rows.append(result)
74 |             csv_in.close()
75 | first = True
76 | for row in rows:
77 | if first:
78 | X = row[0]
79 | y = row[1]
80 | first = False
81 | else:
82 | X = np.concatenate((X, row[0]))
83 | y = np.concatenate((y, row[1]))
84 | return X, y
85 |
86 | @staticmethod
87 | def write_to_csv(data, output_csv, csv_delimiter, print_header = False, mode = 'w+'):
88 | with open(output_csv, mode, newline = '', encoding='utf8') as csv_file:
89 | csv_file_writer = csv.writer(csv_file, delimiter = csv_delimiter)
90 | if print_header == True:
91 | header = data.keys()
92 | csv_file_writer.writerow(header)
93 | data = zip(*data.values())
94 | csv_file_writer.writerows(data)
95 | csv_file.close()
96 |
97 | @staticmethod
98 | def order_csv(input_csv, column_name):
99 | #csv_delimiter = CsvUtils.find_csv_delimiter(input_csv)
100 | temp = pd.read_csv(input_csv, delimiter = ',')
101 | temp = temp.sort_values(by=[column_name])
102 | temp.to_csv(input_csv, index = False)
103 |
--------------------------------------------------------------------------------
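Note: a short sketch of the two `CsvUtils` entry points the pipeline relies on (file names hypothetical); `write_to_csv` turns a dict of columns into csv rows, and `from_csv` reads an extracted-features csv back into feature/label arrays in parallel chunks.

from collections import OrderedDict
from csv_utils import CsvUtils

data = OrderedDict([("text", ["great tool!", "awful docs"]),
                    ("polarity", ["positive", "negative"])])
CsvUtils.write_to_csv(data, "out.csv", ',', print_header=True)   # columns become csv rows

# Reading an extracted-features csv (numeric features, label in the last column):
# X, y = CsvUtils.from_csv("extractedFeatures.csv", chunk_size=1000, jobs_number=4)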
/python/core/utils/report.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score
3 |
4 |
5 | class Report():
6 |
7 | def __init__(self, y_true, y_pred):
8 | self.y_true = y_true
9 | self.y_pred = y_pred
10 |
11 | def print_report(self):
12 | print(classification_report(self.y_true, self.y_pred))
13 |
14 | def get_report(self):
15 | return classification_report(self.y_true, self.y_pred)
16 |
17 | def get_micro_score(self):
18 | return precision_recall_fscore_support(self.y_true, self.y_pred, average='micro')
19 |
20 | def get_macro_score(self):
21 | return precision_recall_fscore_support(self.y_true, self.y_pred, average='macro')
22 |
23 | def get_accuracy_score(self):
24 | return accuracy_score(self.y_true, self.y_pred)
25 |
26 | def get_classes_score(self):
27 |         unique = sorted(set(self.y_pred))  # unique predicted labels, in sorted order
28 | scores = precision_recall_fscore_support(self.y_true, self.y_pred, average=None, labels=unique)
29 | scores_dict = OrderedDict()
30 | for value in unique:
31 | scores_dict.update({value: []})
32 | for i in range(len(scores) - 1):
33 | for j in range(len(unique)):
34 | scores_dict[unique[j]].append(scores[i][j])
35 | scores_dict.update({"support": scores[-1]})
36 | return scores_dict
--------------------------------------------------------------------------------
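Note: a minimal sketch of `Report` on made-up labels (assuming `python/core/utils` is on `sys.path`).

from report import Report

y_true = ["positive", "negative", "negative", "positive"]
y_pred = ["positive", "negative", "positive", "positive"]

report = Report(y_true, y_pred)
report.print_report()                  # full sklearn classification report
print(report.get_accuracy_score())     # 0.75
print(report.get_macro_score())        # macro-averaged (precision, recall, f1, None)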
/python/csv_processing.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 | import sys
5 | import os
6 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
7 |
8 | from core.utils.csv_formatter import CsvFormatter
9 | from core.utils.csv_utils import CsvUtils
10 |
11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
12 |
13 | def main():
14 | parser = argparse.ArgumentParser(description = "Csv file processing")
15 | parser.add_argument('-i',
16 | '--input',
17 | help = "path to csv file",
18 | type = str,
19 | required = True)
20 | parser.add_argument('-d',
21 | '--delimiter',
22 | help = 'csv delimiter, use c for comma and sc for semicolon',
23 | type = str,
24 | default = 'c')
25 | parser.add_argument('-c',
26 | '--columns',
27 |                         help = "column or columns to extract from the csv file",
28 | type = str,
29 | action = 'append',
30 | required = True)
31 | args = parser.parse_args()
32 | input_csv = args.input
33 | input_csv = Path(input_csv).resolve()
34 | output_csv = "{}/{}_jar.csv".format(input_csv.parent, input_csv.name.split('.')[0])
35 | try:
36 | CsvUtils.check_csv(input_csv)
37 | logging.info("Start formatting csv file")
38 | try:
39 | if(args.delimiter == 'c'):
40 | csvFormatter = CsvFormatter(args.columns, ',')
41 | elif(args.delimiter == 'sc'):
42 | csvFormatter = CsvFormatter(args.columns, ';')
43 | else:
44 | logging.error('Wrong csv delimiter. Use "c" for comma and "sc" for semicolon.')
45 | sys.exit(1)
46 | data = csvFormatter.get_rows(input_csv)
47 | csvFormatter.write(data, output_csv)
48 | except IOError as e:
49 | logging.error(e)
50 | sys.exit(1)
51 | logging.info("End formatting csv file")
52 | except OSError as e:
53 | logging.error(e)
54 | sys.exit(1)
55 |
56 | if __name__ == '__main__':
57 | main()
58 |
--------------------------------------------------------------------------------
/python/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core'))
4 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/utils'))
5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'core/liblinear_multicore'))
6 |
7 | import logging
8 | import argparse
9 | from pathlib import Path
10 |
11 | import numpy as np
12 | from sklearn.model_selection import train_test_split
13 |
14 | from core.tuning_parameter import Tuning
15 | from core.train_model import Train
16 | from core.utils.csv_utils import CsvUtils
17 | from core.utils.core_utils import CoreUtils
18 |
19 |
20 | logging.basicConfig(level = logging.INFO, format = "[%(levelname)s] %(asctime)s - %(message)s")
21 |
22 |
23 | def main():
24 | parser = argparse.ArgumentParser(description = "Hyperparameter tuning")
25 | parser.add_argument('-i',
26 | '--input',
27 | help = "path to train set and to test set csv.",
28 | type = str,
29 | action = 'append',
30 | required = True)
31 | parser.add_argument('-c',
32 | '--chunk-size',
33 | help = 'chunk size --default = 1000',
34 | type = int,
35 | default = 1000)
36 | parser.add_argument('-j',
37 | '--jobs-number',
38 | help = 'number of jobs',
39 | type = int,
40 | default = 1)
41 | parser.add_argument('-m',
42 | '--model',
43 | help = 'model file name',
44 | type = str,
45 | default = 'Senti4SD')
46 | args = parser.parse_args()
47 |
48 |     seed = 42
49 |
50 | jobs_number = CoreUtils.check_jobs_number(args.jobs_number)
51 |
52 | if len(args.input) == 1:
53 |
54 | train_file_path = Path(args.input[0]).resolve()
55 |
56 | # Check file existence in advance to avoid missing test set
57 | try:
58 | CsvUtils.check_csv(train_file_path)
59 | except OSError as e:
60 | print(e)
61 | sys.exit(1)
62 |
63 | try:
64 | logging.info("Start reading dataset in chunk...")
65 | X, y = CsvUtils.from_csv(train_file_path, args.chunk_size, jobs_number)
66 | logging.info("End reading dataset in chunk...")
67 | except OSError as e:
68 | print(e)
69 | sys.exit(1)
70 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, stratify = y, random_state = seed)
71 | del X, y
72 | elif len(args.input) == 2:
73 |
74 | train_file_path = Path(args.input[0]).resolve()
75 | test_file_path = Path(args.input[1]).resolve()
76 |
77 | #Check file existence in advance to avoid missing test set
78 | try:
79 | CsvUtils.check_csv(train_file_path)
80 | CsvUtils.check_csv(test_file_path)
81 | except OSError as e:
82 | print(e)
83 | sys.exit(1)
84 |
85 | #read the train set in chunk
86 | logging.info("Start reading training set in chunk...")
87 | X_train, y_train = CsvUtils.from_csv(train_file_path, args.chunk_size, jobs_number)
88 | logging.info("End reading training set in chunk...")
89 | logging.info("Start reading test set in chunk...")
90 | X_test, y_test = CsvUtils.from_csv(test_file_path, args.chunk_size, jobs_number)
91 | logging.info("End reading test set in chunk...")
92 |
93 | else:
94 |         sys.exit("Too many input arguments.")
95 |
96 | #create path
97 | output_path = Path('liblinear_perfomance')
98 | output_path.mkdir(parents=True, exist_ok=True)
99 | output_path = output_path.resolve()
100 | dir_path = output_path.parent
101 | model_path = f"{dir_path}/{args.model}.model"
102 |
103 | logging.info("Start parameter tuning")
104 | current_path = Path.cwd()
105 | solvers_path = Path(f'{current_path}/liblinear_solvers').resolve()
106 | tuning = Tuning(jobs_number, solvers_path, output_path)
107 | best_solver_name, best_solver_value, best_c_value = tuning.tuning_parameter(X_train, X_test, y_train, y_test)
108 | logging.info("End parameter tuning")
109 |
110 | logging.info("Start training model")
111 | train = Train(jobs_number, best_solver_name, best_solver_value, best_c_value, model_path)
112 | train.train_model(X_train, X_test, y_train, y_test)
113 | train.save_best_perfomance(dir_path)
114 | logging.info("End training model")
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
120 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib==1.2.0
2 | llvmlite==0.28.0
3 | numba==0.43.1
4 | numpy==1.22.0
5 | pandas==0.24.2
6 | python-dateutil==2.8.0
7 | pytz==2019.1
8 | scikit-learn==1.5.0
9 | scipy==1.10.0
10 | six==1.12.0
11 |
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SCRIPTDIR=$(dirname "$0")
4 |
5 | csvDelimiter='c'
6 | features='A'
7 | grams=false
8 | chunkSize=1000
9 | jobsNumber=1
10 | modelFile="$SCRIPTDIR/Senti4SD"
11 |
12 | help(){
13 | echo "Usage-1: sh train.sh -i train.csv [-d csv-delimiter] [-F features] [-g] [-c chunk_size] [-j jobs_number] [-o Senti4SD.model]"
14 | echo "or"
15 | echo "Usage-2: sh train.sh -i train.csv -i test.csv [-d csv-delimiter] [-g] [-c chunk_size] [-j jobs_number] [-o Senti4SD.model]"
16 |     echo "-i -- the input file containing the corpus for training; it is possible to run the script with two separate datasets, one for training and the other for testing [see Usage-2]. [required]"
17 |     echo '-d -- the delimiter used in the csv file, where c stands for comma and sc for semicolon. [Default value: "c"]'
18 |     echo '-F -- the features to be considered. A stands for all, L stands for lexicon features, S stands for semantic features and K stands for keyword features. [Default value: A]'
19 |     echo '-g -- enables the extraction of n-grams (i.e., bigrams and unigrams)'
20 |     echo "-c -- the number of rows to read from the dataset at a time, to avoid high memory usage. [Default value: 1000]"
21 |     echo "-j -- the number of cores to use during the csv reading phase. If you pass -1, all cores will be used.
22 |           If you pass a number higher than your total core count, the script will use all cores. [Default value: 1]"
23 |     echo "-o -- the name of the trained model. [Default value: 'Senti4SD.model']"
24 | exit 1
25 | }
26 |
27 | NUMARGS=$#
28 | if [ $NUMARGS -eq 0 ]; then
29 | help
30 | exit 1
31 | fi
32 |
33 | while getopts "hi:d:F:m:c:j:o:g" OPTIONS; do
34 | case $OPTIONS in
35 | h)
36 | help
37 | ;;
38 | i)
39 | inputFiles+=($OPTARG)
40 | ;;
41 | d)
42 | csvDelimiter=$OPTARG
43 | ;;
44 | F)
45 | features=$OPTARG
46 | ;;
47 | g)
48 | grams=true
49 | ;;
50 | c)
51 | chunkSize=$OPTARG
52 | ;;
53 | j)
54 | jobsNumber=$OPTARG
55 | ;;
56 |         m|o)
57 | modelFile="$SCRIPTDIR/$OPTARG"
58 | ;;
59 | \?)
60 | echo -e \\n"Option $OPTARG not allowed."
61 | help
62 | ;;
63 | esac
64 | done
65 |
66 | INPUTFILESLENGTH=${#inputFiles[@]}
67 | echo $INPUTFILESLENGTH
68 |
69 | if [ $INPUTFILESLENGTH -lt 1 ]; then
70 | echo "Train data file is required!"
71 | exit 1
72 | else
73 | if [ $INPUTFILESLENGTH -gt 2 ]; then
74 |         echo "Too many input files!"
75 | exit 1
76 | else
77 | if [ $INPUTFILESLENGTH -eq 1 ]; then
78 |
79 | mkdir -p $SCRIPTDIR/temp_features;
80 |
81 | inputFile=$inputFiles
82 |
83 | python $SCRIPTDIR/python/csv_processing.py -i $inputFile -d $csvDelimiter -c text -c polarity
84 |
85 | IFS='.' read -ra FILENAMESPLIT <<< "$inputFile"
86 | jarInputFile="${FILENAMESPLIT[0]}_jar.csv"
87 |
88 | echo $jarInputFile
89 |
90 |             #-F A: all features to be considered
91 |             #-i file_name: a file containing a document on every line
92 |             #-W cbow600.bin: DSM to be loaded
93 |             #-oc file_name.csv: output dataset containing the extracted features
94 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
95 |             #-L: if present, the corpus has a label column [optional]
96 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
97 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
98 |
99 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarInputFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeatures.csv -vd 600 -L
100 |
101 | python $SCRIPTDIR/python/train.py -i $SCRIPTDIR/temp_features/extractedFeatures.csv -c $chunkSize -j $jobsNumber -m $modelFile
102 |
103 | rm -rf $SCRIPTDIR/temp_features
104 | rm $jarInputFile
105 | else
106 |
107 | for file in ${inputFiles[@]}; do
108 | if [ ! -f $file ]; then
109 | echo "File $file not found!"
110 | exit 1
111 | fi
112 | done
113 |
114 | mkdir -p $SCRIPTDIR/temp_features;
115 |
116 | trainFile=${inputFiles[0]}
117 | testFile=${inputFiles[1]}
118 |
119 | python $SCRIPTDIR/python/csv_processing.py -i $trainFile -d $csvDelimiter -c Text -c Polarity
120 | python $SCRIPTDIR/python/csv_processing.py -i $testFile -d $csvDelimiter -c Text -c Polarity
121 |
122 | IFS='.' read -ra FILENAMESPLIT <<< "$trainFile"
123 | jarTrainFile="${FILENAMESPLIT[0]}_jar.csv"
124 |
125 | IFS='.' read -ra FILENAMESPLIT <<< "$testFile"
126 | jarTestFile="${FILENAMESPLIT[0]}_jar.csv"
127 |
128 | echo $jarTrainFile
129 | echo $jarTestFile
130 |
131 | if [ "$grams" = true ] ; then
132 | java -jar $SCRIPTDIR/java/NgramsExtraction.jar $jarTrainFile -L
133 |
134 |
135 |             #-F A: all features to be considered
136 |             #-i file_name: a file containing a document on every line
137 |             #-W cbow600.bin: DSM to be loaded
138 |             #-oc file_name.csv: output dataset containing the extracted features
139 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
140 |             #-L: if present, the corpus has a label column [optional]
141 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
142 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
143 |
144 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTrainFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -vd 600 -L -ul $SCRIPTDIR/UnigramsList -bl $SCRIPTDIR/BigramsList
145 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTestFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -vd 600 -L -ul $SCRIPTDIR/UnigramsList -bl $SCRIPTDIR/BigramsList
146 | else
147 |             #-F A: all features to be considered
148 |             #-i file_name: a file containing a document on every line
149 |             #-W cbow600.bin: DSM to be loaded
150 |             #-oc file_name.csv: output dataset containing the extracted features
151 |             #-vd numeric: vector size (for cbow600.bin the size is 600)
152 |             #-L: if present, the corpus has a label column [optional]
153 |             #-ul file_name: unigrams list to use for feature extraction. If not present, the default Senti4SD unigrams list will be used [optional]
154 |             #-bl file_name: bigrams list to use for feature extraction. If not present, the default Senti4SD bigrams list will be used [optional]
155 |
156 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTrainFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -vd 600 -L
157 | java -jar $SCRIPTDIR/java/Senti4SD-fast.jar -F $features -i $jarTestFile -W $SCRIPTDIR/java/dsm.bin -oc $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -vd 600 -L
158 | fi
159 |
160 | python $SCRIPTDIR/python/train.py -i $SCRIPTDIR/temp_features/extractedFeaturesTrain.csv -i $SCRIPTDIR/temp_features/extractedFeaturesTest.csv -c $chunkSize -j $jobsNumber -m $modelFile
161 |
162 | rm -rf $SCRIPTDIR/temp_features
163 | rm $jarTrainFile
164 | rm $jarTestFile
165 |
166 | fi
167 | fi
168 | fi
169 |
--------------------------------------------------------------------------------